最近開始學習Lucene,java版本的lucene,入門還是比較快的,網上也很多關於lucene的學習資料。我在這裏總結一下。
1. 搭建Lucene的開發環境:在classpath中添加lucene-core-2.9.1.jar包,在csdn可以下載到該jar包
2.加入庖丁解牛jar包
我用是庖丁解牛 2.0.0版本
下載地址:http://code.google.com/p/paoding/downloads/list
SVN地址:http://paoding.googlecode.com/svn/trunk/paoding-analysis/
裏面有lucene-highlighter-2.2.0.jar, paoding-analysis.jar , commons-logging.jar , 相關的包要加入到classpath中
3. 在Lucene中加入庖丁解牛,配置文件
---------------------------------------------------------------
paoding-dic-home.properties
#values are "system-env" or "this";
#if value is "this" , using the paoding.dic.home as dicHome if configed!
#paoding.dic.home.config-fisrt=system-env
paoding.dic.home.config-fisrt=this
#dictionary home (directory)
#"classpath:xxx" means dictionary home is in classpath.
#e.g "classpath:dic" means dictionaries are in "classes/dic" directory or any other classpath directory
#下載庖丁包有該字典文件,加入到項目的根目錄(注意:.properties 文件的註釋必須用 # 開頭,// 不是合法註釋)
paoding.dic.home=/paoding/dic
#seconds for dic modification detection
paoding.dic.detector.interval=60
---------------------------------------------------------------
paoding-knives.properties 文件配置
paoding.knife.class.letterKnife=net.paoding.analysis.knife.LetterKnife
paoding.knife.class.numberKnife=net.paoding.analysis.knife.NumberKnife
paoding.knife.class.cjkKnife=net.paoding.analysis.knife.CJKKnife
---------------------------------------------------------------
附上代碼:
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.examples.gettingstarted.BoldFormatter;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Demo: index a directory of GBK-encoded text files with the Paoding Chinese
 * analyzer (Lucene 2.9 API) and run a highlighted search over them.
 */
public class LuceneChinese {
    // Directory containing the source text files to index
    private static final String DATA_DIR = "C://unicode";
    // Directory where the Lucene index is written
    private static final String INDEX_DIR = "C://dir";
    // Name of the field that holds the document body
    private static final String FIELD_NAME = "content";

    public static void main(String[] args) throws Exception {
        createIndex();
        search("");
    }

    /**
     * Builds the index: analyzes every readable file under {@code DATA_DIR}
     * with PaodingChineseAnalyzer and writes the result to {@code INDEX_DIR}.
     */
    public static void createIndex() {
        System.out.println("-------------------建立索引開始-----------------------");
        long timeStart = System.currentTimeMillis();
        try {
            // PaodingChineseAnalyzer extends PaodingAnalyzer and overrides
            // tokenStream() to add stop-word and lower-case filtering.
            Analyzer analyzer = new PaodingChineseAnalyzer(new File("E://stopwords.txt"));
            IndexWriter writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
                    analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            try {
                // Recursively index every .txt file under the data directory.
                indexDoc(writer, new File(DATA_DIR));
                // Merge segments; speeds up subsequent searches.
                writer.optimize();
            } finally {
                // Always release the index write lock, even if indexing failed.
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        long timeEnd = System.currentTimeMillis();
        System.out.println("-------------------建立索引耗時: " + (timeEnd - timeStart) + " 毫秒-----------------------");
    }

    /**
     * Searches the index for {@code queryString} and prints highlighted
     * fragments. Prompts on stdin when the argument is null or empty.
     *
     * @param queryString the query text; may be null/empty to trigger a prompt
     * @throws IOException    on index access failure
     * @throws ParseException if the query cannot be parsed
     */
    public static void search(String queryString)
            throws IOException, ParseException {
        // Prompt for a keyword when none was supplied.
        // (was: queryString == "" — reference comparison, always false for
        // strings read at runtime)
        if (queryString == null || queryString.length() == 0) {
            System.out.print("Search for:");
            InputStreamReader in = new InputStreamReader(System.in);
            BufferedReader reader = new BufferedReader(in);
            queryString = reader.readLine();
            if (queryString == null || queryString.length() == 0) {
                System.exit(0);
            }
        }
        long timeStart = System.currentTimeMillis();
        // Open the index for reading.
        Directory directory = FSDirectory.open(new File(INDEX_DIR));
        // PaodingChineseAnalyzer extends PaodingAnalyzer (see below).
        Analyzer analyzer = new PaodingChineseAnalyzer();
        IndexReader reader = IndexReader.open(directory, true);
        Searcher searcher = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
            Query query = parser.parse(queryString);
            query = query.rewrite(reader);
            Hits hits = searcher.search(query);
            // Highlight tag formatter; the default would be <b></b>.
            BoldFormatter formatter = new BoldFormatter();
            // Highlighter with the chosen format and a query-based scorer.
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
            // Fragment size for the highlighted snippets.
            highlighter.setTextFragmenter(new SimpleFragmenter(50));
            System.out.println("共搜索到: " + hits.length() + " 條資源");
            System.out.println("---------------------------------------------");
            for (int i = 0; i < hits.length(); i++) {
                String text = hits.doc(i).get(FIELD_NAME);
                String path = hits.doc(i).get("path");
                int maxNumFragmentsRequired = 5;
                String fragmentSeparator = "...";
                // Rebuild the token stream from the stored term vector so the
                // text does not have to be re-analyzed for highlighting.
                TermPositionVector tpv = (TermPositionVector) reader
                        .getTermFreqVector(hits.id(i), FIELD_NAME);
                TokenStream tokenStream = TokenSources.getTokenStream(tpv);
                String result = highlighter.getBestFragments(tokenStream, text,
                        maxNumFragmentsRequired, fragmentSeparator);
                // was "/n" — a literal slash-n, not a newline escape
                System.out.println("\n文件路徑:" + path);
                System.out.println("\n" + result);
            }
            System.out.println("共搜索到: " + hits.length() + " 條資源");
        } finally {
            // Release index resources even when parsing/highlighting fails.
            searcher.close();
            reader.close();
        }
        long timeEnd = System.currentTimeMillis();
        System.out.println("-------------------查詢耗時: " + (timeEnd - timeStart) + " 毫秒-----------------------");
    }

    /**
     * Recursively indexes {@code root}: directories are walked, regular files
     * are read as GBK text and added to the index.
     *
     * @param writer the open IndexWriter
     * @param root   file or directory to index
     */
    private static void indexDoc(IndexWriter writer, File root) {
        // Skip anything we cannot read.
        if (!root.canRead()) {
            return;
        }
        if (root.isDirectory()) {
            File[] files = root.listFiles();
            // listFiles() returns null on an I/O error, not an empty array.
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    // Recurse into subdirectories and files.
                    indexDoc(writer, files[i]);
                }
            }
            return;
        }
        InputStream in = null;
        try {
            in = new FileInputStream(root);
            // Read the whole file. The original used available() + a single
            // read(), which is only a size hint and may return a short read.
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int n;
            while ((n = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, n);
            }
            String content = new String(buffer.toByteArray(), "GBK");
            // One Lucene document per file.
            Document d = new Document();
            // Body: stored, tokenized, with term vectors so the highlighter
            // can rebuild the token stream without re-analysis.
            d.add(new Field(FIELD_NAME, content, Field.Store.YES,
                    Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            // Path: stored but not tokenized.
            d.add(new Field("path", root.getAbsolutePath(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.addDocument(d);
            System.out.println("add file: " + root.getAbsolutePath());
        } catch (FileNotFoundException e) {
            System.out.println("file not found, ignored.");
            e.printStackTrace();
        } catch (IOException e) {
            // was silently swallowed; at least report which file failed
            System.out.println("failed to index " + root.getAbsolutePath() + ": " + e);
        } finally {
            // Close the stream the original leaked.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }
}
PaodingChineseAnalyzer 代碼
import java.io.File;
import java.io.Reader;
import java.util.Set;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.analyzer.PaodingTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
/**
 * PaodingAnalyzer subclass that adds stop-word removal and lower-casing on
 * top of Paoding's Chinese segmentation.
 */
public class PaodingChineseAnalyzer extends PaodingAnalyzer {
    // Built-in Chinese stop words used when no external list is supplied.
    private static String[] stopWords = {
            "www", "的", "和", "與", "時", "在",
            "是", "被", "所", "那", "這", "有",
            "將", "會", "爲", "對", "了", "過",
            "去" };
    private Set stopSet;

    /** Uses the built-in stop-word list. */
    public PaodingChineseAnalyzer() {
        stopSet = StopFilter.makeStopSet(stopWords);
    }

    /** Uses a caller-supplied stop-word array. */
    public PaodingChineseAnalyzer(String[] stopWords) {
        stopSet = StopFilter.makeStopSet(stopWords);
    }

    /**
     * Loads stop words from an external file (one word per line).
     * Falls back to the built-in list if the file cannot be read, so that
     * tokenStream() never receives a null stop set.
     */
    public PaodingChineseAnalyzer(File stopwordsFile) {
        try {
            stopSet = WordlistLoader.getWordSet(stopwordsFile);
        } catch (Exception e) {
            e.printStackTrace();
            // was: stopSet left null on failure, which makes StopFilter
            // throw an NPE the first time tokenStream() is called
            stopSet = StopFilter.makeStopSet(stopWords);
        }
    }

    /**
     * Tokenizes with Paoding, then filters out stop words and lower-cases
     * the remaining tokens. Lucene offers further filters that could be
     * chained here as well.
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new PaodingTokenizer(reader, getKnife(), createTokenCollector());
        result = new StopFilter(result, stopSet);
        result = new LowerCaseFilter(result);
        return result;
    }
}
由於時間問題,說明和註釋不是很清楚,如有什麼疑問,大家可以留言討論,相互學習。