- solr5.5.4
http://mirror.bit.edu.cn/apache/lucene/solr/
- ansj
https://github.com/NLPchina/ansj_seg
下載ansj源碼,在ansj_lucene5_plug中添加org.ansj.solr.AnsjTokenizerFactory
package org.ansj.solr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.ansj.lucene.util.AnsjTokenizer;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Tokenizer factory that plugs the ansj segmenter into a Solr/Lucene analyzer chain.
 *
 * <p>Arguments read from the {@code <tokenizer>} element:
 * <ul>
 *   <li>{@code isQuery} (boolean, default {@code true}) - when {@code true} the tokenizer is
 *       built on {@link ToAnalysis}; when {@code false} it is built on
 *       {@link IndexAnalysis}.</li>
 *   <li>{@code pstemming} (boolean, default {@code false}) - parsed and stored, but not
 *       consulted anywhere else in this class.</li>
 *   <li>{@code stopwords} (optional) - path of a UTF-8 text file with one stop word per
 *       line. Despite the field name {@code stopwordsDir}, the value is opened as a
 *       single file, not as a directory.</li>
 * </ul>
 */
public class AnsjTokenizerFactory extends TokenizerFactory {
    public final Logger logger = LoggerFactory.getLogger(getClass());
    boolean pstemming;
    boolean isQuery;
    private String stopwordsDir;
    /** Stop-word recognizers handed to every tokenizer created by this factory. */
    public List<StopRecognition> filters;

    /**
     * Reads the factory arguments and loads the optional stop-word file.
     *
     * @param args attribute map of the {@code <tokenizer>} element
     */
    public AnsjTokenizerFactory(Map<String, String> args) {
        super(args);
        filters = new ArrayList<StopRecognition>();
        // Return value intentionally unused; call kept from the original implementation.
        // NOTE(review): presumably a leftover match-version check - confirm before removing.
        getLuceneMatchVersion();
        isQuery = getBoolean(args, "isQuery", true);
        pstemming = getBoolean(args, "pstemming", false);
        stopwordsDir = get(args, "stopwords");
        addStopwords(stopwordsDir);
    }

    /**
     * Loads the stop-word file into a new {@link StopRecognition} and appends it to
     * {@link #filters}.
     *
     * <p>Loading is best-effort: a missing or unreadable file is logged as a warning
     * (with the underlying exception) and the factory continues without that filter.
     * If reading fails part-way, the partially populated filter is not registered.
     *
     * @param dir path of the stop-word file, or {@code null} when none is configured
     */
    private void addStopwords(String dir) {
        if (dir == null) {
            logger.info("no stopwords dir");
            return;
        }
        logger.info("stopwords: " + dir);
        File file = new File(dir);
        // try-with-resources guarantees the stream is closed even when readLine() fails;
        // the previous explicit close() was skipped on any IOException.
        try (FileInputStream in = new FileInputStream(file);
             BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            StopRecognition stopFilter = new StopRecognition();
            String word;
            while ((word = br.readLine()) != null) {
                stopFilter.insertStopWords(word);
            }
            filters.add(stopFilter);
        } catch (FileNotFoundException e) {
            // Keep the cause so a wrong path in the schema is diagnosable from the log.
            logger.warn("No stopword file found", e);
        } catch (IOException e) {
            logger.warn("stopword file io exception", e);
        }
    }

    /**
     * Creates a tokenizer for query-time or index-time analysis depending on
     * {@code isQuery}.
     *
     * @param factory attribute factory supplied by Lucene; not used by this implementation
     * @return an {@link AnsjTokenizer} over {@link ToAnalysis} (query) or
     *         {@link IndexAnalysis} (index), sharing this factory's stop-word filters
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        if (isQuery) {
            return new AnsjTokenizer(new ToAnalysis(), filters, null);
        }
        return new AnsjTokenizer(new IndexAnalysis(), filters, null);
    }
}
打包編譯得到ansj_lucene5_plug-5.1.2.0.jar
將下邊軟件包移動到solr-5.5.4\server\solr-webapp\webapp\WEB-INF\lib下:
http://pan.baidu.com/s/1qY8Ycn6 密碼:xj74
分詞配置文件(library.properties)放到/solr/server/resources目錄下。
修改schema
<!-- Text field type whose index-time and query-time analyzers both tokenize with
     org.ansj.solr.AnsjTokenizerFactory. -->
<fieldType name="text_ansj" class="solr.TextField" positionIncrementGap="100">
<!-- Index time: isQuery="false" makes the factory build its tokenizer on IndexAnalysis;
     stopwords is the path of the stop-word file the factory reads as UTF-8,
     one word per line (here an absolute, machine-specific Windows path). -->
<analyzer type="index">
<tokenizer class="org.ansj.solr.AnsjTokenizerFactory" isQuery="false" stopwords="D:/solr-5.5.4/server/library/stopwords.txt"/>
</analyzer>
<!-- Query time: no attributes are given, so the factory defaults apply: isQuery=true
     (tokenizer built on ToAnalysis) and no stop-word file is loaded.
     NOTE(review): stop words are therefore filtered at index time only; confirm this
     asymmetry is intended. -->
<analyzer type="query">
<tokenizer
class="org.ansj.solr.AnsjTokenizerFactory"/>
</analyzer>
</fieldType>