Full-text search of Word document content with Lucene

Creating the index (Lucene 3.0+)

package test;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
 * Creates the index (Lucene 3.0+).
 * @author Administrator
 *
 */
public class LuceneTest {

	/**
	 * @param args
	 * @throws IOException 
	 */
	public static void main(String[] args) throws IOException {
		// Directory where the index files are stored
		String indexDir = "D:\\indexDir";
		// Directory containing the Word files to be indexed
		String dateDir = "D:\\dateDir";
		IndexWriter indexWriter = null;
		// Create the Directory object that points at the index location
		Directory dir = new SimpleFSDirectory(new File(indexDir));
		// Create the IndexWriter. Arguments: the Directory, the analyzer, a boolean indicating
		// whether to create a new index (false appends to an existing one), and the maximum
		// field length. MaxFieldLength.LIMITED caps the number of indexed terms per field
		// (e.g. new MaxFieldLength(2) keeps only the first two terms); UNLIMITED indexes everything.
		indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED);
		File[] files = new File(dateDir).listFiles();
		for (int i = 0; i < files.length; i++) {
			Document doc = new Document();
			InputStream in = new FileInputStream(files[i]);
			// Extract the plain text from the .doc file with POI's WordExtractor
			WordExtractor w = new WordExtractor(in);
			// Create Field objects and add them to the Document
			doc.add(new Field("contents", w.getText(), Field.Store.YES, Field.Index.ANALYZED));
			doc.add(new Field("filename", files[i].getName(),
								Field.Store.YES, Field.Index.NOT_ANALYZED));
			doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES, Field.Index.NOT_ANALYZED));
			// Add the Document to the IndexWriter
			indexWriter.addDocument(doc);
			// Close the input stream once the document has been indexed
			in.close();
		}
		// Report how many documents the IndexWriter now contains
		System.out.println("numDocs: " + indexWriter.numDocs());
		indexWriter.close();
		
	}

}
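
The WordExtractor used above only reads the older binary .doc format (HWPF, provided by poi-scratchpad). Since the poi-ooxml jars are also listed below, .docx files could be handled with XWPFWordExtractor instead. The following is a minimal sketch, assuming a hypothetical helper class and an extension-based check that are not part of the original code:

package test;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Hypothetical helper that extracts plain text from .doc or .docx files.
 */
public class WordTextExtractor {

	public static String extractText(File file) throws IOException {
		InputStream in = new FileInputStream(file);
		try {
			if (file.getName().toLowerCase().endsWith(".docx")) {
				// .docx (OOXML) files go through XWPFDocument / XWPFWordExtractor (poi-ooxml)
				return new XWPFWordExtractor(new XWPFDocument(in)).getText();
			}
			// .doc (binary) files go through WordExtractor, as in the indexer above
			return new WordExtractor(in).getText();
		} finally {
			in.close();
		}
	}
}
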
Searching the index (Lucene 3.0+)

package test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the index (Lucene 3.0+).
 * @author Administrator
 *
 */
public class LuceneSearch {

	public static void main(String[] args) throws IOException, ParseException {
		// Directory where the index files are stored
		String indexDir = "D:\\indexDir";
		Directory dir = new SimpleFSDirectory(new File(indexDir));
		// Create the IndexSearcher; unlike IndexWriter, it only needs the index directory
		IndexSearcher indexSearch = new IndexSearcher(dir);
		// Create the QueryParser. Arguments: the Lucene version, the default field to search,
		// and the analyzer used to parse the query string
		QueryParser queryParser = new QueryParser(Version.LUCENE_30,
				"contents", new StandardAnalyzer(Version.LUCENE_30));
		// Parse the query string into a Query object
		Query query = queryParser.parse("高鐵");
		// Run the search; the returned TopDocs holds a scoreDocs[] array with the matching document ids
		TopDocs hits = indexSearch.search(query, 10);
		// hits.totalHits is the total number of matching documents
		System.out.println("Found " + hits.totalHits + " result(s)");
		// Loop over hits.scoreDocs, rebuild each Document with indexSearch.doc(), and read its stored fields
		for (int i = 0; i < hits.scoreDocs.length; i++) {
			ScoreDoc sdoc = hits.scoreDocs[i];
			Document doc = indexSearch.doc(sdoc.doc);
			System.out.println(doc.get("filename"));			
		}		
		indexSearch.close();
	}
}
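
The QueryParser above searches only the "contents" field. With Lucene 3.0's MultiFieldQueryParser the same query string can be parsed against several fields at once; the sketch below is only an illustration of that API (note that "filename" was indexed NOT_ANALYZED above, so it will only match exact filename terms):

package test;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

/**
 * Sketch: build a query that matches either the document text or the file name.
 */
public class MultiFieldSearchExample {

	public static Query buildQuery(String text) throws ParseException {
		MultiFieldQueryParser parser = new MultiFieldQueryParser(
				Version.LUCENE_30,
				new String[] { "contents", "filename" },
				new StandardAnalyzer(Version.LUCENE_30));
		return parser.parse(text);
	}
}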

JAR files used:

lucene-core-3.0.3.jar

poi-3.7-20101029.jar

poi-ooxml-3.7-20101029.jar

poi-ooxml-schemas-3.7-20101029.jar

poi-scratchpad-3.7-20101029.jar


