如何使用Lucene对html文件进行索引
我修改了Lucene demo包中的IndexHTML类,使其可以被其他Java类调用。
IndexHTML类
import java.io.File;
import java.util.Arrays;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
// Also requires the other classes of the Lucene demo package on the classpath.
import org.apache.lucene.demo.HTMLDocument;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
/**
* Create html file index for searching
* @author tyrone
*
*/public class IndexHTML { private String DocsPath=null;
/**
* the path for index file;
*/ private String IndexFilePath=null;
/**
* true during deletion pass
*/ private boolean deleting = false;
/**
* existing index
*/ private IndexReader reader;
/**
* new index being built
*/ private IndexWriter writer;
/**
* document id iterator
*/ private TermEnum uidIter;
private void indexDocs(File file)throws Exception {
if (file.isDirectory())
{
// if a directory String[] files = file.list();
// list its files Arrays.sort(files);
// sort the files for (int i = 0; i < files.length;
i++) // recursively index them this.indexDocs(new File(file, files[i]));
} else if (file.getPath().endsWith(".html") || // index .html files file.getPath().endsWith(".htm") || // index .htm files file.getPath().endsWith(".txt")) { // index .txt files if (this.uidIter != null) { String uid = HTMLDocument.uid(file);
// construct uid for doc
while (uidIter.term() != null && uidIter.term().field() == "uid" &&
uidIter.term().text().compareTo(uid) <0) {
if (deleting) {
// delete stale docs
System.out.println("deleting " +
HTMLDocument.uid2url(uidIter.term().text()));
reader.delete(uidIter.term());
}
uidIter.next();
}
if (uidIter.term() != null && uidIter.term().field() == "uid" &&
uidIter.term().text().compareTo(uid) == 0) {
uidIter.next();
// keep matching docs
} else if (!deleting) {
// add new docs
Document doc = HTMLDocument.Document(file);
System.out.println("adding " + doc.get("url"));
writer.addDocument(doc);
}
} else { // creating a new index
Document doc = HTMLDocument.Document(file);
System.out.println("adding " + doc.get("url"));
writer.addDocument(doc);
// add docs unconditionally
}
} return;
}
/**
* Walk directory hierarchy in uid order, while keeping uid iterator from
* existing index in sync. Mismatches indicate one of:
* (a) old documents to be deleted;
* (b) unchanged documents, to be left alone;
* or (c) new documents, to be indexed.
*/ private void indexDocs(File file, String index, boolean create)
throws Exception {
if (!create) {
// incrementally update
reader = IndexReader.open(index);
// open existing index
uidIter = reader.terms(new Term("uid", ""));
// init uid iterator
this.indexDocs(file);
if (deleting) {
// delete rest of stale docs
while (uidIter.term() != null && uidIter.term().field() == "uid") {
System.out.println("deleting " +
HTMLDocument.uid2url(uidIter.term().text()));
reader.delete(uidIter.term());
uidIter.next();
}
deleting = false;
}
uidIter.close();
// close uid iterator
reader.close();
// close existing index
} else
// don't have exisiting
this.indexDocs(file);
}
/**
* if create=true, create a new index, else refresh old index.
* @param create
*/ public void run(boolean create)
{
try {
String index = "index";
File root = null;
if (this.IndexFilePath!=null)
{
// index file path
index = this.IndexFilePath;
}
if (this.DocsPath==null){
System.out.println("root directory is not set");
return;
}
root = new File(this.DocsPath);
Date start = new Date();
/**
* not create then maintenance
*/
if (!create) {
// delete stale docs
this.deleting = true;
this.indexDocs(root, index, create);
}
writer = new IndexWriter(index, new StandardAnalyzer(), create);
writer.maxFieldLength = 1000000;
this.indexDocs(root, index, create);
// add new docs
System.out.println("Optimizing index...");
writer.optimize();
writer.close();
Date end = new Date();
System.out.print(end.getTime() - start.getTime());
System.out.println(" total milliseconds");
} catch (Exception e) {
System.out.println(" caught a " + e.getClass() +
"\n with message: " + e.getMessage());
}
return;
}
/**
* @return Returns the IndexFilePath.
*/ public String getIndexFilePath() { return IndexFilePath;
}
/**
* @param IndexFilePath The IndexFilePath to set.
*/ public void setIndexFilePath(String property1) { this.IndexFilePath = property1;
}
/**
* @return Returns the DocsPath.
*/ public String getDocsPath() { return DocsPath;
}
/**
* @param DocsPath The DocsPath to set.
*/ public void setDocsPath(String property1) { this.DocsPath = property1;
}
/**
* test
* @param args
*/ public static void main(String[] args){ IndexHTML ih=new IndexHTML();
ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");
ih.setIndexFilePath("D:\\MyProject\\colimas\\index"); ih.run(true); }}
运行后生成3个文件:_3i8.cfs、deletable、segments。
搜索文件类:
/*
* Created on 2005/07/28
*
* TODO To change the template for this generated file go to
* Window - Preferences - Java - Code Style - Code Templates
*/package com.nova.colimas.search.query;
/** * @author tyrone * * TODO To change the template for this generated type comment go to
* Window - Preferences - Java - Code Style - Code Templates
*/public class HitsHTMLDoc { private String Title;
priva