1、solr導入到eclipse
下載solr-5.4.1-src.tgz,官網地址http://www.apache.org/dyn/closer.lua/lucene/solr/5.4.1
解壓solr-5.4.1-src.tgz到D:\project\java\solr-5.4.1目錄,在目錄的命令行下輸入ant eclipse,然後進入漫長的等待過程,中間需要從網上下載很多依賴包。
編譯時,可能會報Ivy could not be found in your ant classpath,去ivy官網(http://ant.apache.org/ivy/download.cgi)下載ivy.jar並放到ant的lib目錄(ANT_HOME/lib或~/.ant/lib)下即可,也可以直接執行ant ivy-bootstrap讓構建腳本自動安裝。
直到出現BUILD SUCCESSFUL,使用eclipse導入。
打開org.apache.solr.client.solrj.StartSolrJetty,設置solr.solr.home
/**
 * Launches the Solr web application inside an embedded Jetty server so that it can be
 * started and debugged from the IDE.
 */
public class StartSolrJetty
{
    /** Idle timeout of one hour, generous enough to sit on a breakpoint while debugging. */
    private static final int IDLE_TIMEOUT_MILLIS = 1000 * 60 * 60;

    /** Interval, in milliseconds, between two checks of standard input. */
    private static final long POLL_INTERVAL_MILLIS = 5000;

    /**
     * Starts Jetty on port 8983 with Solr mounted at {@code /solr}, then polls standard
     * input and shuts the server down once input becomes available.
     *
     * @param args ignored
     */
    public static void main( String[] args )
    {
        System.setProperty("solr.solr.home", "solr/example/solr");

        Server jetty = new Server();
        jetty.setConnectors(new Connector[] { newHttpConnector(jetty) });
        jetty.setHandler(newSolrWebapp(jetty));

        try {
            System.out.println(">>> STARTING EMBEDDED JETTY SERVER, PRESS ANY KEY TO STOP");
            jetty.start();
            for (;;) {
                if (System.in.available() != 0) {
                    break;
                }
                Thread.sleep(POLL_INTERVAL_MILLIS);
            }
            jetty.stop();
            jetty.join();
        }
        catch (Exception e) {
            e.printStackTrace();
            System.exit(100);
        }
    }

    /** Builds the HTTP connector, using timeout options that make debugging easier. */
    private static ServerConnector newHttpConnector( Server jetty )
    {
        ServerConnector http = new ServerConnector(jetty, new HttpConnectionFactory());
        http.setIdleTimeout(IDLE_TIMEOUT_MILLIS);
        http.setSoLingerTime(-1);
        http.setPort(8983);
        return http;
    }

    /** Builds the web application context that serves Solr from the exploded webapp directory. */
    private static WebAppContext newSolrWebapp( Server jetty )
    {
        WebAppContext solrWebapp = new WebAppContext();
        solrWebapp.setServer(jetty);
        solrWebapp.setContextPath("/solr");
        solrWebapp.setWar("solr/webapp/web");
        return solrWebapp;
    }
}
2、實時更新詞庫
本文使用Jcseg這個中文分詞庫。查看org.lionsoul.jcseg.analyzer.v5x.JcsegTokenizerFactory的源碼可知,詞庫數據保存在ADictionary dic這個變量中:
/**
 * Tokenizer factory that creates Jcseg tokenizers.
 *
 * <p>The factory owns one task config and one dictionary; both are handed to every
 * tokenizer it creates, so changes made to the dictionary returned by {@link #getDict()}
 * are seen by tokenizers created from this factory.
 */
public class JcsegTokenizerFactory extends TokenizerFactory
{
    private int mode;
    private JcsegTaskConfig config = null;
    private ADictionary dic = null;     // dictionary passed to every tokenizer created here

    /**
     * Reads the optional {@code mode} argument to select the segment mode for jcseg,
     * then initializes the task config and the default dictionary.
     *
     * @param args factory arguments; {@code mode} may be {@code simple}, {@code detect}
     *             or anything else (including absent) for the complex mode
     */
    public JcsegTokenizerFactory(Map<String, String> args)
    {
        super(args);
        mode = parseMode(args.get("mode"));

        //initialize the task config and the dictionary
        config = new JcsegTaskConfig();
        dic = DictionaryFactory.createDefaultDictionary(config);
    }

    /**
     * Maps the textual mode to the matching JcsegTaskConfig constant.
     * The comparison ignores case without going through the default locale, so the
     * result does not depend on the JVM's locale settings.
     */
    private static int parseMode( String modeName )
    {
        if ( "simple".equalsIgnoreCase(modeName) ) {
            return JcsegTaskConfig.SIMPLE_MODE;
        }
        if ( "detect".equalsIgnoreCase(modeName) ) {
            return JcsegTaskConfig.DETECT_MODE;
        }
        // null or unrecognized values fall back to the complex mode
        return JcsegTaskConfig.COMPLEX_MODE;
    }

    public void setConfig( JcsegTaskConfig config )
    {
        this.config = config;
    }

    public void setDict( ADictionary dic )
    {
        this.dic = dic;
    }

    public JcsegTaskConfig getTaskConfig()
    {
        return config;
    }

    public ADictionary getDict()
    {
        return dic;
    }

    /**
     * Creates a tokenizer using this factory's mode, config and dictionary.
     *
     * @param factory attribute factory supplied by the caller (not used here)
     * @return a new tokenizer, never {@code null}
     * @throws IllegalStateException if the tokenizer cannot be constructed; the original
     *         exception is kept as the cause instead of returning {@code null}
     */
    @Override
    public Tokenizer create( AttributeFactory factory )
    {
        try {
            return new JcsegTokenizer(mode, config, dic);
        } catch (JcsegException e) {
            throw new IllegalStateException("Failed to create the Jcseg tokenizer", e);
        } catch (IOException e) {
            throw new IllegalStateException("Failed to create the Jcseg tokenizer", e);
        }
    }
}
繼續查看org.apache.solr.handler.FieldAnalysisRequestHandler的代碼,可知通過SolrQueryRequest能夠獲取到TokenizerFactory。
只需要取得JcsegTokenizerFactory對應的實例,就能取得dic,再通過add和remove方法實時更新詞庫:
dic.add(ILexicon.CJK_WORD, word, IWord.T_CJK_WORD);
dic.remove(ILexicon.CJK_WORD, word);
自定義實現request handler,在request handler裏面通過SolrQueryRequest依次取得IndexSchema->FieldType->IndexAnalyzer->TokenizerFactory,最終取得dic實例來操作詞庫。內存中的詞庫更新後,還需要保存到本地詞庫文件中,避免重啓後丟失新增的詞。實現代碼如下:
package com.penngo.solr;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.StringUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.lionsoul.jcseg.analyzer.v5x.JcsegTokenizerFactory;
import org.lionsoul.jcseg.tokenizer.core.ADictionary;
import org.lionsoul.jcseg.tokenizer.core.ILexicon;
import org.lionsoul.jcseg.tokenizer.core.IWord;
import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig;
import com.fasterxml.jackson.databind.ObjectMapper;
public class TestHandler extends RequestHandlerBase{
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
SolrParams params = req.getParams();
System.out.println("params=======" + params);
JcsegTaskConfig config = new JcsegTaskConfig();
String addDatas = params.get("add");
Map<String,Object> dataResult = new HashMap<String,Object>();
ObjectMapper mapper = new ObjectMapper();
String lexiconPath = config.getLexiconPath()[0];
String fileLex = lexiconPath + "/lex-penngo.lex";
IndexSchema indexSchema = req.getSchema();
FieldType filetype = indexSchema.getFieldTypeByName("textComplex");
Analyzer analyzer = filetype.getIndexAnalyzer();
TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
if (tfac instanceof JcsegTokenizerFactory) {
JcsegTokenizerFactory jtf = (JcsegTokenizerFactory) tfac;
ADictionary dic = jtf.getDict();
if (dic != null) {
if (StringUtils.isEmpty(addDatas) == false) {
FileOutputStream fos = new FileOutputStream(new File(fileLex), true);
OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
BufferedWriter bw = new BufferedWriter(osw);
ArrayList<List<String>> wordList = mapper.readValue(addDatas, ArrayList.class);
for(List<String> word: wordList){
String name = word.get(0);
String type = word.get(1);
String pinyin = word.get(2);
String syn = word.get(3);
IWord iword = dic.get(ILexicon.CJK_WORD, name);
// 如果不存在,則添加到詞庫
if (iword == null) {
dic.add(ILexicon.CJK_WORD, name, IWord.T_CJK_WORD);
iword = dic.get(ILexicon.CJK_WORD, name);
iword.addPartSpeech(type);
iword.setPinyin(pinyin);
String[] syns = syn.split(",");
for (String s : syns) {
iword.addSyn(s);
}
StringBuffer sff = new StringBuffer();
sff.append(name).append("/").append(type).append("/").append(pinyin).append("/").append(syn);
// 把分詞添加到詞庫文件lex-penngo.lex中
bw.write(sff.toString());
bw.newLine();
}
}
bw.close();
osw.close();
fos.close();
}
dataResult.put("status", "ok");
}
}
rsp.add("response", dataResult);
}
public String getDescription() {
return null;
}
}
在solrconfig.xml中添加配置:
<!-- Registers com.penngo.solr.TestHandler under the /test path; responses default to indented JSON. -->
<requestHandler name="/test" class="com.penngo.solr.TestHandler">
<lst name="defaults">
<str name="wt">json</str>
<str name="indent">true</str>
</lst>
</requestHandler>
以"公衆號"這個詞來測試
jcseg自帶詞庫分詞結果
客戶端通過接口添加該詞後的分詞結果,調用接口的php代碼如下:
<?php
// Client script: asks the custom /test handler of the "news" core to add words
// to the dictionary.
$url = "http://localhost:8983/solr/news/test";

// Each entry is: word, part of speech, pinyin, comma-separated synonyms.
$data = array(
    array("公衆號", "n", "gong zhong hao", "大衆號,社會號,penngo")
);

// The entries are sent as a JSON array in the "add" query parameter.
$url = $url . "?add=" . urlencode(json_encode($data));
echo $url . "\n";

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);
$result = curl_exec($ch);

if ($result === false) {
    // curl_exec() returns false on failure; report the reason instead of
    // printing an empty result.
    $error = curl_error($ch);
    curl_close($ch);
    echo "Request failed: " . $error . "\n";
    exit(1);
}

curl_close($ch);
print_r($result);
?>