solr5.5.4擴展ansj_lucene5

  1. solr5.5.4
    http://mirror.bit.edu.cn/apache/lucene/solr/
  2. ansj
    https://github.com/NLPchina/ansj_seg
    下載ansj源碼,在ansj_lucene5_plug中添加org.ansj.solr.AnsjTokenizerFactory

    這裏寫圖片描述

package org.ansj.solr;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.ansj.lucene.util.AnsjTokenizer;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Solr/Lucene {@link TokenizerFactory} that creates {@link AnsjTokenizer} instances.
 *
 * <p>Recognised configuration arguments:
 * <ul>
 *   <li>{@code isQuery} (boolean, default {@code true}) - when {@code true} the tokenizer is
 *       built on {@link ToAnalysis}, otherwise on {@link IndexAnalysis}.</li>
 *   <li>{@code pstemming} (boolean, default {@code false}) - parsed and stored, but not used
 *       anywhere in this class.</li>
 *   <li>{@code stopwords} (optional) - path to a UTF-8 text file with one stop word per line.</li>
 * </ul>
 */
public class AnsjTokenizerFactory extends TokenizerFactory {
    public final Logger logger = LoggerFactory.getLogger(getClass());

    boolean pstemming;
    boolean isQuery;
    private String stopwordsDir;
    public List<StopRecognition> filters;

    /**
     * Builds the factory from the arguments declared on the {@code <tokenizer>} element.
     *
     * @param args configuration arguments; the keys read here are {@code isQuery},
     *             {@code pstemming} and {@code stopwords}
     */
    public AnsjTokenizerFactory(Map<String, String> args) {
        super(args);
        filters = new ArrayList<StopRecognition>();
        // NOTE(review): the return value is discarded; kept from the original in case the
        // call is relied on for validation - confirm against the Lucene version in use.
        getLuceneMatchVersion();
        isQuery = getBoolean(args, "isQuery", true);
        pstemming = getBoolean(args, "pstemming", false);
        stopwordsDir = get(args, "stopwords");
        addStopwords(stopwordsDir);
    }

    /**
     * Loads the stop word file and registers it as one {@link StopRecognition} in
     * {@link #filters}. Every line of the file is inserted as a stop word.
     *
     * <p>Loading is best-effort: if the file is missing or cannot be read completely, a
     * warning is logged and no filter is added, so the factory still works without stop words.
     *
     * @param dir path of the stop word file, or {@code null} when none is configured
     */
    private void addStopwords(String dir) {
        if (dir == null) {
            logger.info("no stopwords dir");
            return;
        }

        logger.info("stopwords: {}", dir);
        File file = new File(dir);
        // try-with-resources closes the reader (and the underlying stream) even when
        // reading fails part-way through; the original only closed it on success.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            StopRecognition stopFilter = new StopRecognition();
            String word;
            while ((word = br.readLine()) != null) {
                stopFilter.insertStopWords(word);
            }
            // Register the filter only after the whole file was read successfully.
            filters.add(stopFilter);
        } catch (FileNotFoundException e) {
            logger.warn("No stopword file found: {}", file.getAbsolutePath());
        } catch (IOException e) {
            logger.warn("stopword file io exception: {}", file.getAbsolutePath(), e);
        }
    }

    /**
     * Creates a tokenizer for query-time or index-time analysis depending on {@code isQuery}.
     *
     * @param factory attribute factory supplied by Lucene; not passed on to
     *                {@link AnsjTokenizer} by this implementation
     * @return a new {@link AnsjTokenizer} sharing this factory's stop word filters
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        if (isQuery) {
            // query
            return new AnsjTokenizer(new ToAnalysis(), filters, null);
        }
        // index
        return new AnsjTokenizer(new IndexAnalysis(), filters, null);
    }
}

打包編譯得到ansj_lucene5_plug-5.1.2.0.jar
將下邊軟件包移動到solr-5.5.4\server\solr-webapp\webapp\WEB-INF\lib下,
http://pan.baidu.com/s/1qY8Ycn6 密碼:xj74
這裏寫圖片描述
分詞配置文件(library.properties)放到/solr/server/resources目錄下。
修改schema

<!-- Field type that tokenizes text with the ansj tokenizer factory. -->
<fieldType name="text_ansj" class="solr.TextField" positionIncrementGap="100">
  <!-- Index time: isQuery="false" and a stop word file is supplied. -->
  <analyzer type="index">
    <tokenizer class="org.ansj.solr.AnsjTokenizerFactory"
               isQuery="false"
               stopwords="D:/solr-5.5.4/server/library/stopwords.txt"/>
  </analyzer>
  <!-- Query time: no attributes are set, so the factory's defaults apply. -->
  <analyzer type="query">
    <tokenizer class="org.ansj.solr.AnsjTokenizerFactory"/>
  </analyzer>
</fieldType>
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章