选择显示字体大小

使用lucene对html文件进行索引

我修改了 Lucene demo 包中的 IndexHTML 类,使其可以被其他 Java 类调用。

indexhtml


import java.io.File;
import java.util.Arrays;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
// HTMLDocument lives in the Lucene demo jar, which must also be on the classpath.
import org.apache.lucene.demo.HTMLDocument;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
/**
* create html file index for searching
* @author tyrone
*
*/
public class indexhtml {
private string docspath=null;

/**
  * the path for index file;
  */
private string indexfilepath=null;

/**
  * true during deletion pass
  */
   private boolean deleting = false;
   /**
    * existing index
    */
   private indexreader reader;
   /**
    * new index being built
    */
   private indexwriter writer;
   /**
    * document id iterator
    */
   private termenum uiditer;
  
  
private void indexdocs(file file)throws exception {
  if (file.isdirectory()) {     // if a directory
   string[] files = file.list();     // list its files
   arrays.sort(files);     // sort the files
   for (int i = 0; i < files.length; i++)   // recursively index them
    this.indexdocs(new file(file, files&#91;i&#93;));
  
  } else if (file.getpath().endswith(&quot;.html&quot;) // index .html files
    file.getpath().endswith(&quot;.htm&quot;) // index .htm files
    file.getpath().endswith(&quot;.txt&quot;)) { // index .txt files
  
   if (this.uiditer != null) {
    string uid = htmldocument.uid(file); // construct uid for doc
    
    while (uiditer.term() != null && uiditer.term().field() == &quot;uid&quot; &&
      uiditer.term().text().compareto(uid) < 0) {
     if (deleting) {     // delete stale docs
      system.out.println(&quot;deleting &quot; +
        htmldocument.uid2url(uiditer.term().text()));
      reader.delete(uiditer.term());
     }
     uiditer.next();
    }
    if (uiditer.term() != null && uiditer.term().field() == &quot;uid&quot; &&
      uiditer.term().text().compareto(uid) == 0) {
     uiditer.next();     // keep matching docs
    } else if (!deleting) {     // add new docs
     document doc = htmldocument.document(file);
     system.out.println(&quot;adding &quot; + doc.get(&quot;url&quot;));
     writer.adddocument(doc);
    }
   } else { // creating a new index
    document doc = htmldocument.document(file);
    system.out.println(&quot;adding &quot; + doc.get(&quot;url&quot;));
    writer.adddocument(doc); // add docs unconditionally
   }
  }
  return;
}

   /**
    * walk directory hierarchy in uid order, while keeping uid iterator from
    * existing index in sync.  mismatches indicate one of:
    * (a) old documents to be deleted;
    * (b) unchanged documents, to be left alone;
    * or (c) new documents, to be indexed.
    */

   private void indexdocs(file file, string index, boolean create)
        throws exception {
    if (!create) {      // incrementally update
    
     reader = indexreader.open(index);    // open existing index
     uiditer = reader.terms(new term(&quot;uid&quot;, &quot;&quot;)); // init uid iterator
    
     this.indexdocs(file);
    
     if (deleting) {      // delete rest of stale docs
      while (uiditer.term() != null && uiditer.term().field() == &quot;uid&quot;) {
       system.out.println(&quot;deleting &quot; +
         htmldocument.uid2url(uiditer.term().text()));
       reader.delete(uiditer.term());
       uiditer.next();
      }
      deleting = false;
     }
    
     uiditer.close();      // close uid iterator
     reader.close();      // close existing index
    
    } else       // don't have exisiting
     this.indexdocs(file);    
    
   }
   /**
    * if create=true, create a new index, else refresh old index.
    * @param create
    */
public void run(boolean create) {
     try {
        string index = &quot;index&quot;;
        file root = null;
   if (this.indexfilepath!=null) {    // index file path
     index = this.indexfilepath;
   }
      if (this.docspath==null){
         system.out.println(&quot;root directory is not set&quot;);
         return;
        }
        root = new file(this.docspath);
        date start = new date();
        /**
         * not create then maintenance
         */
        if (!create) {      // delete stale docs
         this.deleting = true;
         this.indexdocs(root, index, create);
        }

        writer = new indexwriter(index, new standardanalyzer(), create);
        writer.maxfieldlength = 1000000;

        this.indexdocs(root, index, create);    // add new docs

        system.out.println(&quot;optimizing index...&quot;);
        writer.optimize();
        writer.close();

        date end = new date();

        system.out.print(end.gettime() - start.gettime());
        system.out.println(&quot; total milliseconds&quot;);
      } catch (exception e) {
        system.out.println(&quot; caught a &quot; + e.getclass() +
      &quot;\n with message: &quot; + e.getmessage());
      }
  return;
}

/**
  * @return returns the indexfilepath.
  */
public string getindexfilepath() {
  return indexfilepath;
}
/**
  * @param indexfilepath the indexfilepath to set.
  */
public void setindexfilepath(string property1) {
  this.indexfilepath = property1;
}
/**
  * @return returns the docspath.
  */
public string getdocspath() {
  return docspath;
}
/**
  * @param docspath the docspath to set.
  */
public void setdocspath(string property1) {
  this.docspath = property1;
}

/**
  * test
  * @param args
  */
public static void main(string&#91;&#93; args){
  indexhtml ih=new indexhtml();
  ih.setdocspath(&quot;d:\\myproject\\colimas\\clms-doc2\\html&quot;);
  ih.setindexfilepath(&quot;d:\\myproject\\colimas\\index&quot;);
  ih.run(true);
}
}


运行后会在索引目录下生成 3 个文件:_3i8.cfs、deletable、segments。

搜索文件类:

/*
* created on 2005/07/28
*
* todo to change the template for this generated file go to
* window - preferences - java - code style - code templates
*/
package com.nova.colimas.search.query;

/**
 * Plain value holder for one search hit: its title, file path and URL.
 * Every property is null until the corresponding setter is called.
 *
 * @author tyrone
 */
public class hitshtmldoc {

    /** Title of the matched document, or null. */
    private String title;

    /** File path of the matched document, or null. */
    private String path;

    /** URL of the matched document, or null. */
    private String url;

    /**
     * @return the url, or null if none was set
     */
    public String geturl() {
        return url;
    }

    /**
     * @param property1 the url to set
     */
    public void seturl(String property1) {
        this.url = property1;
    }

    /**
     * @return the path, or null if none was set
     */
    public String getpath() {
        return path;
    }

    /**
     * @param property1 the path to set
     */
    public void setpath(String property1) {
        this.path = property1;
    }

    /**
     * @return the title, or null if none was set
     */
    public String gettitle() {
        return title;
    }

    /**
     * @param property1 the title to set
     */
    public void settitle(String property1) {
        this.title = property1;
    }
}



import org.apache.lucene.analysis.analyzer;
import org.apache.lucene.analysis.standard.standardanalyzer;
import org.apache.lucene.document.document;
import org.apache.lucene.search.searcher;
import org.apache.lucene.search.indexsearcher;
import org.apache.lucene.search.query;
import org.apache.lucene.search.hits;
import org.apache.lucene.queryparser.queryparser;
/**
* @author tyrone
*
* todo to change the template for this generated type comment go to
* window - preferences - java - code style - code templates
*/
public class searchfiles {

   private hits hits;

   public hits gethits(){
    return hits;
   }

   public hitshtmldoc&#91;&#93; run(string indexfilepath,string line){
    hitshtmldoc&#91;&#93; hitdocs;
    try {
     searcher searcher = new indexsearcher(indexfilepath);
     analyzer analyzer = new standardanalyzer();
     query query = queryparser.parse(line, &quot;contents&quot;, analyzer);
     system.out.println(&quot;searching for: &quot; + query.tostring(&quot;contents&quot;));
     this.hits = searcher.search(query);
     if (this.hits.length()==0) return null;
     system.out.println(this.hits.length() + &quot; total matching documents&quot;);
     hitdocs=new hitshtmldoc&#91;this.hits.length()&#93;;
     for (int i = 0; i < hits.length(); i++) {
      document doc = this.hits.doc(i);
      string path = doc.get(&quot;path&quot;);
      if (path != null) {
       hitdocs&#91;i&#93;.setpath(path);
      } else {
       string url=doc.get(&quot;url&quot;);
       if (url != null) {
        hitdocs&#91;i&#93;=new hitshtmldoc();
        hitdocs&#91;i&#93;.seturl(url);
        string title=doc.get(&quot;title&quot;);
        if (title!=null)
         hitdocs&#91;i&#93;.settitle(title);
       } else {
        system.out.println(i + &quot;. &quot; + &quot;no path nor url for this document&quot;);
       }
      }
      
     }  
     searcher.close();
     return hitdocs;
    }catch(exception e){
     system.out.println(&quot; caught a &quot; + e.getclass() +
       &quot;\n with message: &quot; + e.getmessage());    
    }
    return null;
   }
   /**
    * test
    * args=queries
    * @author tyrone
    *
    */
   public static void main(string&#91;&#93; args){
    searchfiles se=new searchfiles();
    string query=&quot;&quot;;
    hitshtmldoc&#91;&#93; hitsdoc;
    for (int i=0;i<args.length;i++)
     query=query+args&#91;i&#93;+&quot; &quot;;
    hitsdoc=se.run(&quot;d:\\myproject\\colimas\\index&quot;,query);
    if (hitsdoc==null){
     system.out.println(&quot;nothing&quot;);
     return;
    }
    for (int l=0;l<hitsdoc.length;l++){
     system.out.println(&quot;url:&quot;+hitsdoc&#91;l&#93;.geturl());
     system.out.println(&quot;path:&quot;+hitsdoc&#91;l&#93;.getpath());
     system.out.println(&quot;title:&quot;+hitsdoc&#91;l&#93;.gettitle());
    }
   }

}




注意事项

1. 引用 Lucene 调试(debug)你的应用程序时,虽然并不需要下面的 jar 包,但每次都会提示 URLClassPath.class 异常;为方便起见,还是建议下载这些 jar 包:
relaxngdatatype.jar
commons-beanutils.jar
commons-collections.jar
commons-digester.jar
commons-logging.jar
commons-validator.jar
jakarta-oro.jar
struts-legacy.jar

2. 存放 index 文件的目录里不能有其他目录;如果有,建立索引时会试图删除它们或报错。


 


关键字 本文所属关键字

相关 与本文相关文章

分类 所有文章关键字导航

源码编程相关

Java   Asp   PHP   .Net   XML   C/C++   CGI   VB   Jsp   J2ee   J2se   J2me   EJB   Servlet   Tomcat   Resin   Struts   Weblogic   Eclipse   ANT   GUI   JMS   Web service   IDEA   WebSphere   Hibernate   Spring   Jboss   Applet   Swing   Socket   Javamail   Perl   Ajax   P2P   安全   模式   框架   测试   开源   游戏

SQL数据库相关

My-SQL   Ms-SQL   Access   DB2   Oracle   Sybase   SQLserver   索引   存储过程   加密   数据库   分页   视图  

手机无线相关

3G   Wap   CDMA   GPRS   GSM   IVR   彩信   短信   无线   增值业务

网页设计制作相关

HTML   CSS   网页配色   网页特效   Javascript   VBscript   Dreamweaver   Frontpage   JS   Web   网站设计

网站建设推广相关

建站经验   网站优化   网站排名   推广   Alexa

操作系统/服务器相关

Windows XP   Windows 2000   Windows 2003   Windows Me   Windows 9.x   Linux   UNIX   注册表   操作系统   服务器   应用服务器

图形图像多媒体相关

Photoshop   Fireworks   Flash   Coreldraw   Illustrator   Freehand   Photoimpact   多媒体   图形图像

标准 网站致力的规范

Valid CSS!

无不良内容,无不良广告,无恶意代码

Valid XHTML 1.0 Transitional

creativecommons