基於Lucene的同義詞分詞器

package org.lucene.util;


import java.io.Reader;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;


import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;


/**
 * 同義詞分詞器
 * @author 
 *
 */
public class MySameAnalyzer extends Analyzer {
private SamewordContext samewordContext;

public MySameAnalyzer(SamewordContext swc) {
samewordContext = swc;
}


/**
public TokenStream tokenStream(String fieldName, Reader reader) {
Dictionary dic = Dictionary.getInstance("D:\\tools\\javaTools\\lucene\\mmseg4j-1.8.5\\data");
return new MySameTokenFilter(
new MMSegTokenizer(new MaxWordSeg(dic), reader),samewordContext);
}
    */


@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
//獲取中文分詞器MMSeg的詞庫
Dictionary dic = Dictionary.getInstance();
//創建Tokenizer
Tokenizer  tokenizer=new MMSegTokenizer(new MaxWordSeg(dic), reader);
//創建TokenStream,使用自定義的同義詞過濾器
TokenStream ts= new MySameTokenFilter(tokenizer,samewordContext);
//創建TokenStreamComponents
TokenStreamComponents  tscs=new TokenStreamComponents(tokenizer,ts);
return tscs;
}


}


package org.lucene.util;


import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;




/**
 * Synonym-expansion token filter.
 *
 * <p>For every token produced by the upstream stream, looks up synonyms in
 * the supplied {@link SamewordContext}. When synonyms exist, the token's
 * attribute state is captured and each synonym is emitted as an additional
 * token with a position increment of 0, so it occupies the same position as
 * the original token in the index.
 */
public class MySameTokenFilter extends TokenFilter {
// Term text attribute of the shared attribute source.
private CharTermAttribute cta = null;
// Position-increment attribute; set to 0 for injected synonyms.
private PositionIncrementAttribute pia = null;
// Captured attribute state of the last real token, restored before each
// synonym is emitted so offsets/type match the original token.
private AttributeSource.State current;
// Synonyms still pending emission for the current token.
private Stack<String> sames = null;
// Lookup source for synonyms.
private SamewordContext samewordContext;




/**
 * @param input upstream token stream to wrap
 * @param samewordContext synonym lookup source
 */
protected MySameTokenFilter(TokenStream input,SamewordContext samewordContext) {
super(input);
cta = this.addAttribute(CharTermAttribute.class);
pia = this.addAttribute(PositionIncrementAttribute.class);
sames = new Stack<String>();
this.samewordContext = samewordContext;
}


@Override
public boolean incrementToken() throws IOException {
if(sames.size()>0) {
// Pop the next pending synonym.
String str = sames.pop();
// Restore the saved state so the synonym inherits the original
// token's attributes (offsets etc.), then overwrite the term text.
restoreState(current);
cta.setEmpty();
cta.append(str);
// Position increment 0 places the synonym at the same position
// as the token it expands.
pia.setPositionIncrement(0);
return true;
}
boolean inct=this.input.incrementToken();
if(!inct) return false;
if(addSames(cta.toString())) {
// The token has synonyms: capture its state now so later calls
// can restore it while emitting each synonym.
current = captureState();
}
return true;
}

/**
 * Pushes all synonyms of the given word onto the pending stack.
 * @param name current token text to look up
 * @return {@code true} when a synonym list was found for {@code name}
 */
private boolean addSames(String name) {
String[] sws = samewordContext.getSamewords(name);
if(sws!=null) {
for(String str:sws) {
sames.push(str);
}
return true;
}
return false;
}









}



package org.lucene.util;


/**
 * Lookup contract for synonym expansion.
 */
public interface SamewordContext {

/**
 * Returns the synonyms registered for the given word.
 *
 * @param name the word to look up
 * @return the synonym list, or {@code null} when no synonyms are known
 */
String[] getSamewords(String name);
}




package org.lucene.util;


import java.util.HashMap;
import java.util.Map;


/**
 * In-memory {@link SamewordContext} backed by a fixed, hard-coded
 * synonym table.
 */
public class SimpleSamewordContext implements SamewordContext {

// Synonym table: word -> synonyms. Private and final so callers cannot
// replace the map (the original field was package-private and mutable).
private final Map<String,String[]> maps = new HashMap<String,String[]>();

public SimpleSamewordContext() {
// Seed the synonym table.
maps.put("數據",new String[]{"信息","知識","智慧"});
maps.put("當今",new String[]{"現在","目前","當前","眼下"});
maps.put("採用",new String[]{"使用","利用"});
}


/**
 * Returns the synonyms of the given word.
 *
 * @param name the word to look up
 * @return a copy of the synonym list, or {@code null} when absent
 */
@Override
public String[] getSamewords(String name) {
String[] sws = maps.get(name);
// Defensive copy: the original returned the internal array, letting
// callers mutate the shared table in place.
return sws == null ? null : sws.clone();
}


}






@Test
public void test05()  {
try {
Analyzer a2 = new MySameAnalyzer(new SimpleSamewordContext());
String txt = "系統建設採用代表當今雲計算、大數據和互聯網主流併成熟的技術進行架構設計,相應的軟件開發和產品選型應充分考慮未來發展方向,同時保證平臺在技術先進和可靠性。";
Directory dir = new RAMDirectory();
//Directory dir =FSDirectory.open(new File("indexs\\index04"));
IndexWriter writer = new IndexWriter(dir,new IndexWriterConfig(Version.LUCENE_4_9, a2));
Document doc = new Document();
doc.add(new  TextField("content",txt,Field.Store.YES));
writer.addDocument(doc);
writer.close();
IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
//QueryParser  parser=new QueryParser(Version.LUCENE_4_9,"content",a2);
//Query query=parser.parse("眼下");
Query query=new TermQuery(new Term("content","眼下"));
TopDocs tds = searcher.search(query,10);
System.out.println(tds.scoreDocs.length);
Document d = searcher.doc(tds.scoreDocs[0].doc);
System.out.println("content="+d.get("content"));
AnalyzerUtils.displayAllTokenInfo(txt, a2);
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章