点击这里给我发消息 点击这里给我发消息

如何使用Lucene对html文件进行索引

添加时间:2013-12-7
    相关阅读: HTML

  我修改了 Lucene demo 包中的 IndexHTML 类,使其可以被其他 Java 类调用。
  
  IndexHTML类
  
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  
  import org.apache.lucene.document.Document;
  
  import org.apache.lucene.index.IndexReader;
  
  import org.apache.lucene.index.IndexWriter;
  
  import org.apache.lucene.index.Term;
  
  import org.apache.lucene.index.TermEnum;
  
  import java.io.File;import java.util.Date;
  
  import java.util.Arrays;
  
  //还需调用demo的其他类。
  
  import org.apache.lucene.demo;
  
  /**
  
  * Create html file index for searching
  
  * @author tyrone
  
  *
  
  */public class IndexHTML { private String DocsPath=null;
  
  /**
  
  * the path for index file;
  
  */ private String IndexFilePath=null;
  
  /**
  
  * true during deletion pass
  
  */  private boolean deleting = false;
  
  /**
  
  * existing index
  
  */  private IndexReader reader;
  
  /**
  
  * new index being built
  
  */  private IndexWriter writer;
  
  /**
  
  * document id iterator
  
  */  private TermEnum uidIter;
  
  private void indexDocs(File file)throws Exception {
  
  if (file.isDirectory())
  
  {
  
  // if a directory  String[] files = file.list();
  
  // list its files  Arrays.sort(files);
  
  // sort the files  for (int i = 0; i < files.length;
  
  i++)  // recursively index them  this.indexDocs(new File(file, files[i]));
  
  } else if (file.getPath().endsWith(".html") || // index .html files  file.getPath().endsWith(".htm") || // index .htm files  file.getPath().endsWith(".txt")) { // index .txt files   if (this.uidIter != null) {  String uid = HTMLDocument.uid(file);
  
  // construct uid for doc
  
  while (uidIter.term() != null && uidIter.term().field() == "uid" &&
  
  uidIter.term().text().compareTo(uid) <0) {
  
  if (deleting) {
  
  // delete stale docs
  
  System.out.println("deleting " +
  
  HTMLDocument.uid2url(uidIter.term().text()));
  
  reader.delete(uidIter.term());
  
  }
  
  uidIter.next();
  
  }
  
  if (uidIter.term() != null && uidIter.term().field() == "uid" &&
  
  uidIter.term().text().compareTo(uid) == 0) {
  
  uidIter.next();
  
  // keep matching docs
  
  } else if (!deleting) {
  
  // add new docs
  
  Document doc = HTMLDocument.Document(file);
  
  System.out.println("adding " + doc.get("url"));
  
  writer.addDocument(doc);
  
  }
  
  } else { // creating a new index
  
  Document doc = HTMLDocument.Document(file);
  
  System.out.println("adding " + doc.get("url"));
  
  writer.addDocument(doc);
  
  // add docs unconditionally
  
  }
  
  } return;
  
  }
  
  /**
  
  * Walk directory hierarchy in uid order, while keeping uid iterator from
  
  * existing index in sync. Mismatches indicate one of:
  
  * (a) old documents to be deleted;
  
  * (b) unchanged documents, to be left alone;
  
  * or (c) new documents, to be indexed.
  
  */  private void indexDocs(File file, String index, boolean create)
  
  throws Exception {
  
  if (!create) {
  
  // incrementally update
  
  reader = IndexReader.open(index);
  
  // open existing index
  
  uidIter = reader.terms(new Term("uid", ""));
  
  // init uid iterator
  
  this.indexDocs(file);
  
  if (deleting) {
  
  // delete rest of stale docs
  
  while (uidIter.term() != null && uidIter.term().field() == "uid") {
  
  System.out.println("deleting " +
  
  HTMLDocument.uid2url(uidIter.term().text()));
  
  reader.delete(uidIter.term());
  
  uidIter.next();
  
  }
  
  deleting = false;
  
  }
  
  uidIter.close();
  
  // close uid iterator
  
  reader.close();
  
  // close existing index
  
  } else
  
  // don't have exisiting
  
  this.indexDocs(file);
  
  }
  
  /**
  
  * if create=true, create a new index, else refresh old index.
  
  * @param create
  
  */ public void run(boolean create)
  
  {
  
  try {
  
  String index = "index";
  
  File root = null;
  
  if (this.IndexFilePath!=null)
  
  {
  
  // index file path
  
  index = this.IndexFilePath;
  
  }
  
  if (this.DocsPath==null){
  
  System.out.println("root directory is not set");
  
  return;
  
  }
  
  root = new File(this.DocsPath);
  
  Date start = new Date();
  
  /**
  
  * not create then maintenance
  
  */
  
  if (!create) {
  
  // delete stale docs
  
  this.deleting = true;
  
  this.indexDocs(root, index, create);
  
  }
  
  writer = new IndexWriter(index, new StandardAnalyzer(), create);
  
  writer.maxFieldLength = 1000000;
  
  this.indexDocs(root, index, create);
  
  // add new docs
  
  System.out.println("Optimizing index...");
  
  writer.optimize();
  
  writer.close();
  
  Date end = new Date();
  
  System.out.print(end.getTime() - start.getTime());
  
  System.out.println(" total milliseconds");
  
  } catch (Exception e) {
  
  System.out.println(" caught a " + e.getClass() +
  
  "\n with message: " + e.getMessage());
  
  }
  
  return;
  
  }
  
  /**
  
  * @return Returns the IndexFilePath.
  
  */ public String getIndexFilePath() { return IndexFilePath;
  
  }
  
  /**
  
  * @param IndexFilePath The IndexFilePath to set.
  
  */ public void setIndexFilePath(String property1) { this.IndexFilePath = property1;
  
  }
  
  /**
  
  * @return Returns the DocsPath.
  
  */ public String getDocsPath() { return DocsPath;
  
  }
  
  /**
  
  * @param DocsPath The DocsPath to set.
  
  */ public void setDocsPath(String property1) { this.DocsPath = property1;
  
  }
  
  /**
  
  * test
  
  * @param args
  
  */ public static void main(String[] args){ IndexHTML ih=new IndexHTML();
  
  ih.setDocsPath("D:\\MyProject\\colimas\\clms-doc2\\html");
  
  ih.setIndexFilePath("D:\\MyProject\\colimas\\index"); ih.run(true); }}
  
  运行后生成3个文件:_3i8.cfs、deletable、segments。
  
  搜索文件类:
  
  /*
  
  * Created on 2005/07/28
  
  *
  
  * TODO To change the template for this generated file go to
  
  * Window - Preferences - Java - Code Style - Code Templates
  
  */package com.nova.colimas.search.query;
  
  /** * @author tyrone * * TODO To change the template for this generated type comment go to
  
  * Window - Preferences - Java - Code Style - Code Templates
  
  */public class HitsHTMLDoc { private String Title;
  
  priva
咨询热线:020-85648757 85648755 85648616 0755-27912581 客服:020-85648756 0755-27912581 业务传真:020-32579052
广州市网景网络科技有限公司 Copyright◎2003-2008 Veelink.com. All Rights Reserved.
广州商务地址:广东省广州市黄埔大道中203号(海景园区)海景花园C栋501室
深圳商务地址:深圳市宝源路华丰宝源大厦606
研发中心:广东广州市天河软件园海景园区 粤ICP备05103322号 工商注册