我修改了 Lucene demo 包中的 indexhtml 类,使其可以被其他 Java 类调用。
indexhtml类
import org.apache.lucene.analysis.standard.standardanalyzer;
import org.apache.lucene.document.document;
import org.apache.lucene.index.indexreader;
import org.apache.lucene.index.indexwriter;
import org.apache.lucene.index.term;
import org.apache.lucene.index.termenum;
import java.io.file;
import java.util.date;
import java.util.arrays;
//还需调用demo的其他类。
import org.apache.lucene.demo;
/**
* create html file index for searching
* @author tyrone
*
*/
public class indexhtml {
private string docspath=null;
/**
* the path for index file;
*/
private string indexfilepath=null;
/**
* true during deletion pass
*/
private boolean deleting = false;
/**
* existing index
*/
private indexreader reader;
/**
* new index being built
*/
private indexwriter writer;
/**
* document id iterator
*/
private termenum uiditer;
private void indexdocs(file file)throws exception {
if (file.isdirectory()) { // if a directory
string[] files = file.list(); // list its files
arrays.sort(files); // sort the files
for (int i = 0; i < files.length; i++) // recursively index them
this.indexdocs(new file(file, files[i]));
} else if (file.getpath().endswith(".html") // index .html files
file.getpath().endswith(".htm") // index .htm files
file.getpath().endswith(".txt")) { // index .txt files
if (this.uiditer != null) {
string uid = htmldocument.uid(file); // construct uid for doc
while (uiditer.term() != null && uiditer.term().field() == "uid" &&
uiditer.term().text().compareto(uid) < 0) {
if (deleting) { // delete stale docs
system.out.println("deleting " +
htmldocument.uid2url(uiditer.term().text()));
reader.delete(uiditer.term());
}
uiditer.next();
}
if (uiditer.term() != null && uiditer.term().field() == "uid" &&
uiditer.term().text().compareto(uid) == 0) {
uiditer.next(); // keep matching docs
} else if (!deleting) { // add new docs
document doc = htmldocument.document(file);
system.out.println("adding " + doc.get("url"));
writer.adddocument(doc);
}
} else { // creating a new index
document doc = htmldocument.document(file);
system.out.println("adding " + doc.get("url"));
writer.adddocument(doc); // add docs unconditionally
}
}
return;
}
/**
* walk directory hierarchy in uid order, while keeping uid iterator from
* existing index in sync. mismatches indicate one of:
* (a) old documents to be deleted;
* (b) unchanged documents, to be left alone;
* or (c) new documents, to be indexed.
*/
private void indexdocs(file file, string index, boolean create)
throws exception {
if (!create) { // incrementally update
reader = indexreader.open(index); // open existing index
uiditer = reader.terms(new term("uid", "")); // init uid iterator
this.indexdocs(file);
if (deleting) { // delete rest of stale docs
while (uiditer.term() != null && uiditer.term().field() == "uid") {
system.out.println("deleting " +
htmldocument.uid2url(uiditer.term().text()));
reader.delete(uiditer.term());
uiditer.next();
}
deleting = false;
}
uiditer.close(); // close uid iterator
reader.close(); // close existing index
} else // don't have exisiting
this.indexdocs(file);
}
/**
* if create=true, create a new index, else refresh old index.
* @param create
*/
public void run(boolean create) {
try {
string index = "index";
file root = null;
if (this.indexfilepath!=null) { // index file path
index = this.indexfilepath;
}
if (this.docspath==null){
system.out.println("root directory is not set");
return;
}
root = new file(this.docspath);
date start = new date();
/**
* not create then maintenance
*/
if (!create) { // delete stale docs
this.deleting = true;
this.indexdocs(root, index, create);
}
writer = new indexwriter(index, new standardanalyzer(), create);
writer.maxfieldlength = 1000000;
this.indexdocs(root, index, create); // add new docs
system.out.println("optimizing index...");
writer.optimize();
writer.close();
date end = new date();
system.out.print(end.gettime() - start.gettime());
system.out.println(" total milliseconds");
} catch (exception e) {
system.out.println(" caught a " + e.getclass() +
"\n with message: " + e.getmessage());
}
return;
}
/**
* @return returns the indexfilepath.
*/
public string getindexfilepath() {
return indexfilepath;
}
/**
* @param indexfilepath the indexfilepath to set.
*/
public void setindexfilepath(string property1) {
this.indexfilepath = property1;
}
/**
* @return returns the docspath.
*/
public string getdocspath() {
return docspath;
}
/**
* @param docspath the docspath to set.
*/
public void setdocspath(string property1) {
this.docspath = property1;
}
/**
* test
* @param args
*/
public static void main(string[] args){
indexhtml ih=new indexhtml();
ih.setdocspath("d:\\myproject\\colimas\\clms-doc2\\html");
ih.setindexfilepath("d:\\myproject\\colimas\\index");
ih.run(true);
}
}
/*
* created on 2005/07/28
*
* todo to change the template for this generated file go to
* window - preferences - java - code style - code templates
*/
package com.nova.colimas.search.query;
/**
 * Plain value holder for the title, path and URL of one document.
 * All three properties start out as null until their setter is called.
 *
 * @author tyrone
 */
public class hitshtmldoc {

    private String title;
    private String path;
    private String url;

    /**
     * @return the URL, or null if none was set
     */
    public String geturl() {
        return url;
    }

    /**
     * @param property1 the URL to set
     */
    public void seturl(String property1) {
        this.url = property1;
    }

    /**
     * @return the path, or null if none was set
     */
    public String getpath() {
        return path;
    }

    /**
     * @param property1 the path to set
     */
    public void setpath(String property1) {
        this.path = property1;
    }

    /**
     * @return the title, or null if none was set
     */
    public String gettitle() {
        return title;
    }

    /**
     * @param property1 the title to set
     */
    public void settitle(String property1) {
        this.title = property1;
    }
}
import org.apache.lucene.analysis.analyzer;
import org.apache.lucene.analysis.standard.standardanalyzer;
import org.apache.lucene.document.document;
import org.apache.lucene.search.searcher;
import org.apache.lucene.search.indexsearcher;
import org.apache.lucene.search.query;
import org.apache.lucene.search.hits;
import org.apache.lucene.queryparser.queryparser;
/**
* @author tyrone
*
* todo to change the template for this generated type comment go to
* window - preferences - java - code style - code templates
*/
public class searchfiles {
private hits hits;
public hits gethits(){
return hits;
}
public hitshtmldoc[] run(string indexfilepath,string line){
hitshtmldoc[] hitdocs;
try {
searcher searcher = new indexsearcher(indexfilepath);
analyzer analyzer = new standardanalyzer();
query query = queryparser.parse(line, "contents", analyzer);
system.out.println("searching for: " + query.tostring("contents"));
this.hits = searcher.search(query);
if (this.hits.length()==0) return null;
system.out.println(this.hits.length() + " total matching documents");
hitdocs=new hitshtmldoc[this.hits.length()];
for (int i = 0; i < hits.length(); i++) {
document doc = this.hits.doc(i);
string path = doc.get("path");
if (path != null) {
hitdocs[i].setpath(path);
} else {
string url=doc.get("url");
if (url != null) {
hitdocs[i]=new hitshtmldoc();
hitdocs[i].seturl(url);
string title=doc.get("title");
if (title!=null)
hitdocs[i].settitle(title);
} else {
system.out.println(i + ". " + "no path nor url for this document");
}
}
}
searcher.close();
return hitdocs;
}catch(exception e){
system.out.println(" caught a " + e.getclass() +
"\n with message: " + e.getmessage());
}
return null;
}
/**
* test
* args=queries
* @author tyrone
*
*/
public static void main(string[] args){
searchfiles se=new searchfiles();
string query="";
hitshtmldoc[] hitsdoc;
for (int i=0;i<args.length;i++)
query=query+args[i]+" ";
hitsdoc=se.run("d:\\myproject\\colimas\\index",query);
if (hitsdoc==null){
system.out.println("nothing");
return;
}
for (int l=0;l<hitsdoc.length;l++){
system.out.println("url:"+hitsdoc[l].geturl());
system.out.println("path:"+hitsdoc[l].getpath());
system.out.println("title:"+hitsdoc[l].gettitle());
}
}
}
Java Asp PHP .Net XML C/C++ CGI VB Jsp J2ee J2se J2me EJB Servlet Tomcat Resin Struts Weblogic Eclipse ANT GUI JMS Web servise IDEA Webphere Hibernate Spring Jboss Applet Swing Socket Javamail Perl Ajax P2P 安全 模式 框架 测试 开源 游戏
Windows XP Windows 2000 Windows 2003 Windows Me Windows 9.x Linux UNIX 注册表 操作系统 服务器 应用服务器