如何使用Lucene对html文件进行索引

添加时间：2013-12-7

相关阅读: HTML

　　我修改了 Lucene demo 包中的 IndexHTML 类，使其可以被其他 Java 类直接调用。
　　
　　IndexHTML类
　　
  import java.io.File;
  import java.util.Arrays;
  import java.util.Date;

  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  // Other classes from the Lucene demo package are needed as well.
  // The original line "import org.apache.lucene.demo;" named a package rather than a
  // type and cannot compile; the class actually used from that package is HTMLDocument.
  import org.apache.lucene.demo.HTMLDocument;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.TermEnum;
　　
　　/**
　　
　　* Create html file index for searching
　　
　　* @author tyrone
　　
　　*
　　
　　*/public class IndexHTML { private String DocsPath=null;
　　
　　/**
　　
　　* the path for index file;
　　
　　*/ private String IndexFilePath=null;
　　
　　/**
　　
　　* true during deletion pass
　　
　　*/　 private boolean deleting = false;
　　
　　/**
　　
　　* existing index
　　
　　*/　 private IndexReader reader;
　　
　　/**
　　
　　* new index being built
　　
　　*/　 private IndexWriter writer;
　　
　　/**
　　
　　* document id iterator
　　
　　*/　 private TermEnum uidIter;
　　
　　private void indexDocs(File file)throws Exception {
　　
　　if (file.isDirectory())
　　
　　{
　　
　　// if a directory　 String[] files = file.list();
　　
　　// list its files　 Arrays.sort(files);
　　
　　// sort the files　 for (int i = 0; i < files.length;
　　
　　i++)　 // recursively index them　　this.indexDocs(new File(file, files[i]));
　　
　　} else if (file.getPath().endsWith(".html") || // index .html files　　file.getPath().endsWith(".htm") || // index .htm files　　file.getPath().endsWith(".txt")) { // index .txt files　　　if (this.uidIter != null) {　　String uid = HTMLDocument.uid(file);
　　
　　// construct uid for doc
　　
　　while (uidIter.term() != null && uidIter.term().field() == "uid" &&
　　
　　uidIter.term().text().compareTo(uid) <0) {
　　
　　if (deleting) {
　　
　　// delete stale docs
　　
　　System.out.println("deleting " +
　　
　　HTMLDocument.uid2url(uidIter.term().text()));
　　
　　reader.delete(uidIter.term());
　　
　　}
　　
　　uidIter.next();
　　
　　}
　　
　　if (uidIter.term() != null && uidIter.term().field() == "uid" &&
　　
　　uidIter.term().text().compareTo(uid) == 0) {
　　
　　uidIter.next();
　　
　　// keep matching docs
　　
　　} else if (!deleting) {
　　
　　// add new docs
　　
　　Document doc = HTMLDocument.Document(file);
　　
　　System.out.println("adding " + doc.get("url"));
　　
　　writer.addDocument(doc);
　　
　　}
　　
　　} else { // creating a new index
　　
　　Document doc = HTMLDocument.Document(file);
　　
　　System.out.println("adding " + doc.get("url"));
　　
　　writer.addDocument(doc);
　　
　　// add docs unconditionally
　　
　　}
　　
　　}　return;
　　
　　}
　　
　　/**
　　
　　* Walk directory hierarchy in uid order, while keeping uid iterator from
　　
　　* existing index in sync.　Mismatches indicate one of:
　　
　　* (a) old documents to be deleted;
　　
　　* (b) unchanged documents, to be left alone;
　　
　　* or (c) new documents, to be indexed.
　　
　　*/　 private void indexDocs(File file, String index, boolean create)
　　
　　throws Exception {
　　
　　if (!create) {
　　
　　// incrementally update
　　
　　reader = IndexReader.open(index);
　　
　　// open existing index
　　
　　uidIter = reader.terms(new Term("uid", ""));
　　
　　// init uid iterator
　　
　　this.indexDocs(file);
　　
　　if (deleting) {
　　
　　// delete rest of stale docs
　　
　　while (uidIter.term() != null && uidIter.term().field() == "uid") {
　　
　　System.out.println("deleting " +
　　
　　HTMLDocument.uid2url(uidIter.term().text()));
　　
　　reader.delete(uidIter.term());
　　
　　uidIter.next();
　　
　　}
　　
　　deleting = false;
　　
　　}
　　
　　uidIter.close();
　　
　　// close uid iterator
　　
　　reader.close();
　　
　　// close existing index
　　
　　} else
　　
　　// don't have exisiting
　　
　　this.indexDocs(file);
　　
　　}
　　
　　/**
　　
　　* if create=true, create a new index, else refresh old index.
　　
　　* @param create
　　
　　*/ public void run(boolean create)
　　
　　{
　　
　　try {
　　
　　String index = "index";
　　
　　File root = null;
　　
　　if (this.IndexFilePath!=null)
　　
　　{
　　
　　// index file path
　　
　　index = this.IndexFilePath;
　　
　　}
　　
　　if (this.DocsPath==null){
　　
　　System.out.println("root directory is not set");
　　
　　return;
　　
　　}
　　
　　root = new File(this.DocsPath);
　　
　　Date start = new Date();
　　
　　/**
　　
　　* not create then maintenance
　　
　　*/
　　
　　if (!create) {
　　
　　// delete stale docs
　　
　　this.deleting = true;
　　
　　this.indexDocs(root, index, create);
　　
　　}
　　
　　writer = new IndexWriter(index, new StandardAnalyzer(), create);
　　
　　writer.maxFieldLength = 1000000;
　　
　　this.indexDocs(root, index, create);
　　
　　// add new docs
　　
　　System.out.println("Optimizing index...");
　　
　　writer.optimize();
　　
　　writer.close();
　　
　　Date end = new Date();
　　
　　System.out.print(end.getTime() - start.getTime());
　　
　　System.out.println(" total milliseconds");
　　
　　} catch (Exception e) {
　　
　　System.out.println(" caught a " + e.getClass() +
　　
　　"\n with message: " + e.getMessage());
　　
　　}
　　
　　return;
　　
　　}
　　
　　/**
　　
　　* @return Returns the IndexFilePath.
　　
　　*/ public String getIndexFilePath() {　return IndexFilePath;
　　
　　}
　　
　　/**
　　
　　* @param IndexFilePath The IndexFilePath to set.
　　
　　*/ public void setIndexFilePath(String property1) {　this.IndexFilePath = property1;
　　
　　}
　　
　　/**
　　
　　* @return Returns the DocsPath.
　　
　　*/ public String getDocsPath() {　return DocsPath;
　　
　　}
　　
　　/**
　　
　　* @param DocsPath The DocsPath to set.
　　
　　*/ public void setDocsPath(String property1) {　this.DocsPath = property1;
　　
　　}
　　
　　/**
　　
　　* test
　　
　　* @param args
　　
　　*/ public static void main(String[] args){　IndexHTML ih=new IndexHTML();
　　
　　ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");
　　
　　ih.setIndexFilePath("D:\\MyProject\\colimas\\index");　ih.run(true); }}
　　
　　运行后生成 3 个文件：_3i8.cfs、deletable、segments。
　　
　　搜索文件类：
　　
　　/*
　　
　　* Created on 2005/07/28
　　
　　*
　　
　　* TODO To change the template for this generated file go to
　　
　　* Window - Preferences - Java - Code Style - Code Templates
　　
　　*/package com.nova.colimas.search.query;
　　
　　/** * @author tyrone * * TODO To change the template for this generated type comment go to
　　
　　* Window - Preferences - Java - Code Style - Code Templates
　　
　　*/public class HitsHTMLDoc {　private String Title;
　　
　　priva

相关如何使用Lucene对html文件进行索引

Tomcat5的web应用启动顺序详解 [2013-12-7]

Tomcat下JSP、Servlet和JavaBean配置 [2013-12-7]

tomcat5.5.9+sql2000数据库连接池配置 [2013-12-7]

tomcat向weblogic移植的中文乱码问题 [2013-12-7]

JBuilder2005+JBOSS+Oracle9i环境配置 [2013-12-7]

JBoss 4.0 简化了中间件的开发 [2013-12-7]