Lucene是一個全文檢索引擎工具包,貌似挺好用。某些時候我們需要對數據庫做全表掃描來篩選數據,如果數據量龐大,往往要等待很長時間,這對用戶來說是很不友好的。這時Lucene就可以派上用場。
Lucene首先將預檢索資源封裝成document對象,然後根據你自定義的字段建立索引,這其實和數據庫的行爲類似。
1. 文檔字符化
2. 對查詢關鍵字分詞
3. 建立索引
4. 搜索
程序運行環境JDK1.6,主要使用的幾個jar包:
具體實例代碼如下:
CreateIndex.java (根據指定文件創建索引庫)
package lucene;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import jeasy.analysis.MMAnalyzer;
/**
 * Builds a Lucene index from the files found directly under
 * {@code Constants.filePath} and stores it at {@code Constants.indexPath}.
 *
 * <p>Each regular file becomes one document with two fields: {@code path}
 * (stored, not analyzed) and {@code colContent} (stored, analyzed).
 */
public class CreateIndex {

    /**
     * Creates the index library.
     *
     * <p>Failures are reported on {@code System.err} and are not propagated
     * to the caller.
     */
    public static void initIndex() {
        File fileDir = new File(Constants.indexPath);
        IndexWriter indexWriter = null;
        try {
            Analyzer analyzer = new MMAnalyzer(); // je-analysis Chinese word segmenter
            // The boolean argument "true" controls whether the index is created anew.
            indexWriter = new IndexWriter(fileDir, analyzer, true, MaxFieldLength.LIMITED);
            for (File file : getFiles()) {
                // Sub-directories cannot be opened as a FileInputStream; skip them
                // instead of indexing an empty document for each one.
                if (!file.isFile()) {
                    continue;
                }
                indexWriter.addDocument(fileToText(file));
            }
            System.out.println("創建索引庫成功!");
        } catch (Exception e) {
            System.err.println("不能正確創建索引庫");
            // Surface the cause; the message above alone is not diagnosable.
            e.printStackTrace();
            // If index creation fails, ask for the index directory to be removed
            // so that it is recreated on the next run.
            // NOTE(review): java.io.File deletion only removes EMPTY directories,
            // so a directory that already holds index files is probably left in
            // place - confirm whether a recursive delete is wanted here.
            fileDir.deleteOnExit();
        } finally {
            try {
                if (indexWriter != null) {
                    indexWriter.close();
                }
            } catch (Exception e) {
                System.err.println("不能關閉indexWriter");
            }
        }
    }

    /**
     * Wraps a file in a Lucene {@link Document}.
     *
     * @param file the file to convert
     * @return a document holding the file's absolute path and its text content
     * @throws IOException declared for callers; the content read itself reports
     *         its own errors (see {@link #getContent(File)})
     */
    private static Document fileToText(File file) throws IOException {
        Document document = new Document();
        document.add(new Field("path", file.getAbsolutePath(), Store.YES, Index.NOT_ANALYZED));
        document.add(new Field("colContent", getContent(file), Store.YES, Index.ANALYZED));
        return document;
    }

    /**
     * Reads a file and returns its content as text.
     *
     * <p>All bytes are collected first and decoded once, using the platform
     * default charset. Decoding chunk by chunk would corrupt multi-byte
     * characters that straddle a chunk boundary, and decoding the whole
     * buffer after a short read would append stale bytes.
     *
     * @param file the file to read
     * @return the decoded content; if reading fails, the stack trace is printed
     *         and whatever was read before the failure (possibly an empty
     *         string) is returned
     */
    public static String getContent(File file) {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            byte[] buffer = new byte[1024];
            int read;
            while ((read = in.read(buffer)) != -1) {
                // Only the first "read" bytes of the buffer are valid.
                collected.write(buffer, 0, read);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return new String(collected.toByteArray());
    }

    /**
     * Lists the entries directly under {@code Constants.filePath}.
     *
     * @return the directory entries, or an empty list when the path does not
     *         denote a readable directory ({@code listFiles()} returns
     *         {@code null} in that case)
     */
    public static List<File> getFiles() {
        File[] entries = new File(Constants.filePath).listFiles();
        if (entries == null) {
            return Arrays.asList(new File[0]);
        }
        return Arrays.asList(entries);
    }

    /**
     * Command-line entry point: builds the index.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        initIndex();
    }
}
IndexSearch.java (根據關鍵詞進行搜索)
package lucene;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import jeasy.analysis.MMAnalyzer;
/**
 * Searches the index stored at {@code Constants.indexPath} for a keyword in
 * the {@code colContent} field and prints the matching documents.
 */
public class IndexSearch {

    /** Analyzer used to parse keywords (je-analysis Chinese word segmenter). */
    private static final Analyzer analyzer = new MMAnalyzer();

    /**
     * Command-line entry point: runs one sample search.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened
     */
    public static void main(String[] args) throws Exception {
        IndexSearch search = new IndexSearch();
        search.search("姓名");
    }

    /**
     * Searches the index for {@code keyWord} and prints the path and content of
     * each hit to {@code System.out}; the search call requests 10 results.
     *
     * <p>If the keyword is {@code null} or cannot be parsed, the failure is
     * reported on {@code System.err} and the method returns without opening
     * the index; previously the search went ahead with a {@code null} query.
     *
     * @param keyWord the keyword (query string) to look for
     * @throws Exception if the index searcher cannot be opened
     */
    public void search(String keyWord) throws Exception {
        if (keyWord == null) {
            System.err.println("關鍵詞解析失敗!");
            return;
        }

        QueryParser queryParser = new MultiFieldQueryParser(new String[]{"colContent"}, analyzer);
        Query query;
        try {
            // Turn the keyword into a Query object the index can understand.
            query = queryParser.parse(keyWord);
        } catch (ParseException e) {
            System.err.println("關鍵詞解析失敗!");
            e.printStackTrace();
            // Without a query there is nothing to search for.
            return;
        }

        // Opened outside the try below so that open failures still propagate
        // to the caller, as before.
        IndexSearcher indexSearcher = new IndexSearcher(Constants.indexPath);
        try {
            TopDocs topDocs = indexSearcher.search(query, null, 10);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("共找到匹配文件: " + scoreDocs.length + "個");
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                System.out.println("文件路徑:" + document.getField("path").stringValue());
                System.out.println("內容:" + document.getField("colContent").stringValue());
                System.out.println("++++++++++++++++++++++++++++++");
            }
        } catch (IOException e) {
            System.err.println("索引庫查詢失敗");
            e.printStackTrace();
        } finally {
            try {
                indexSearcher.close();
            } catch (Exception e) {
                System.err.println("不能關閉indexSearcher連接");
            }
        }
    }
}
一些文件
程序執行結果:
注意:我這裏是分兩個類分別運行的,首先運行CreateIndex類建立索引庫,然後執行IndexSearch進行搜索。更多內容還在慢慢研究中。