Adding PaodingAnalyzer 2.0 to Lucene 2.9, with Stopword Support

I recently started learning Lucene (the Java version). It is fairly quick to pick up, and there are plenty of learning resources online; this post summarizes my notes.

1. Set up the Lucene development environment: add lucene-core-2.9.1.jar to the classpath (the jar can be downloaded from CSDN).

2. Add the Paoding (庖丁解牛) jars

I am using Paoding 2.0.0.

Download: http://code.google.com/p/paoding/downloads/list
SVN: http://paoding.googlecode.com/svn/trunk/paoding-analysis/

The distribution includes lucene-highlighter-2.2.0.jar, paoding-analysis.jar, and commons-logging.jar; add these jars to the classpath.

3. Configure Paoding for Lucene via its two configuration files.

 

---------------------------------------------------------------

paoding-dic-home.properties 

 

#values are "system-env" or "this";
#if value is "this" , using the paoding.dic.home as dicHome if configed!
#paoding.dic.home.config-fisrt=system-env


paoding.dic.home.config-fisrt=this

 

#dictionary home (directory)
#"classpath:xxx" means dictionary home is in classpath.
#e.g "classpath:dic" means dictionaries are in "classes/dic" directory or any other classpath directory

# the Paoding download includes this dictionary folder; add it to the project root
paoding.dic.home=/paoding/dic

 

#seconds for dic modification detection


paoding.dic.detector.interval=60

---------------------------------------------------------------

paoding-knives.properties configuration

 

paoding.knife.class.letterKnife=net.paoding.analysis.knife.LetterKnife
paoding.knife.class.numberKnife=net.paoding.analysis.knife.NumberKnife
paoding.knife.class.cjkKnife=net.paoding.analysis.knife.CJKKnife

---------------------------------------------------------------
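Before moving on to the indexing code, it is worth checking that the dictionary configuration actually loads. Below is a minimal smoke-test sketch (the class name, field name, and sample sentence are arbitrary, not from the original setup); constructing PaodingAnalyzer fails fast if paoding-dic-home.properties or the dictionaries cannot be located. It reads tokens through the pre-2.9 TokenStream API, which Paoding's tokenizer implements directly.

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class PaodingSmokeTest {
    public static void main(String[] args) throws Exception {
        // fails here if the dictionary home is misconfigured
        PaodingAnalyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("content", new StringReader("中文分詞測試"));
        // pre-2.9 TokenStream API (deprecated in 2.9 but still available)
        for (Token t = ts.next(); t != null; t = ts.next()) {
            System.out.println(t.term());
        }
    }
}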

 

 

The full code:

 

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import net.paoding.analysis.examples.gettingstarted.BoldFormatter;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class LuceneChinese {

 

 // folder containing the text files to index
 private static final String DATA_DIR = "C://unicode";

 // folder where the index is stored
 private static final String INDEX_DIR = "C://dir";

 // name of the indexed content field
 private static final String FIELD_NAME = "content";
 
 public static void main(String[] args) throws Exception {
  createIndex();
  search("");
 }
 /**
  * Create the index.
  */
 public static void createIndex() {
  System.out.println("------------------- indexing started -----------------------");
  long timeStart = System.currentTimeMillis();
  try {
   // PaodingChineseAnalyzer extends PaodingAnalyzer and overrides tokenStream()
   // to add stopword filtering (here loading stopwords from an external file)
   Analyzer analyzer = new PaodingChineseAnalyzer(new File("E://stopwords.txt"));
   IndexWriter writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
     analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
   // recursively index every file under the given directory
   indexDoc(writer, new File(DATA_DIR));
   // optimize the index to speed up subsequent searches
   writer.optimize();
   writer.close();
  } catch (IOException e) {
   e.printStackTrace();
  }
  long timeEnd = System.currentTimeMillis();
  System.out.println("------------------- indexing took: " + (timeEnd - timeStart) + " ms -----------------------");
 }
 /**
  * Search the index.
  * @param queryString the query; if empty, one is read from stdin
  * @throws IOException
  * @throws ParseException
  */
 public static void search(String queryString)
   throws IOException, ParseException {

  // prompt for a keyword if none was passed in
  if (queryString == null || queryString.isEmpty()) {
   System.out.print("Search for:");
   InputStreamReader in = new InputStreamReader(System.in);
   BufferedReader reader = new BufferedReader(in);
   queryString = reader.readLine();
   if (queryString == null || queryString.isEmpty()) {
    System.exit(0);
   }
  }

  long timeStart = System.currentTimeMillis();

  // open the index
  Directory directory = FSDirectory.open(new File(INDEX_DIR));
  // PaodingChineseAnalyzer extends PaodingAnalyzer (source below)
  Analyzer analyzer = new PaodingChineseAnalyzer();
  IndexReader reader = IndexReader.open(directory, true);
  QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
  Query query = parser.parse(queryString);
  // create the searcher
  Searcher searcher = new IndexSearcher(directory);
  query = query.rewrite(reader);
  Hits hits = searcher.search(query);

  // highlight tags; the default is <b></b>
  // SimpleHTMLFormatter shf = new SimpleHTMLFormatter("<span style=\"color:red\">", "</span>");
  BoldFormatter formatter = new BoldFormatter();
  // build the highlighter from a formatter and a query scorer
  Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
  // fragmenter that splits the text into 50-character fragments
  highlighter.setTextFragmenter(new SimpleFragmenter(50));

  System.out.println("Found " + hits.length() + " matching documents");
  System.out.println("---------------------------------------------");
  for (int i = 0; i < hits.length(); i++) {
   String text = hits.doc(i).get(FIELD_NAME);
   String path = hits.doc(i).get("path");
   int maxNumFragmentsRequired = 5;
   String fragmentSeparator = "...";
   TermPositionVector tpv = (TermPositionVector) reader
     .getTermFreqVector(hits.id(i), FIELD_NAME);
   TokenStream tokenStream = TokenSources.getTokenStream(tpv);
   String result = highlighter.getBestFragments(tokenStream, text,
     maxNumFragmentsRequired, fragmentSeparator);
   System.out.println("\nFile path: " + path);
   System.out.println("\n" + result);
  }
  reader.close();
  searcher.close();
  long timeEnd = System.currentTimeMillis();
  System.out.println("------------------- search took: " + (timeEnd - timeStart) + " ms -----------------------");
 }
 /**
  * Recursively index the given directory.
  *
  * @param writer
  *            IndexWriter
  * @param root
  *            the directory (or file) to index
  */
    private static void indexDoc(IndexWriter writer, File root) {
        // skip files we cannot read
        if (root.canRead()) {
            if (root.isDirectory()) {
                File[] files = root.listFiles();
                if (files != null) {
                    for (int i = 0; i < files.length; i++) {
                        // recurse into subdirectories
                        indexDoc(writer, files[i]);
                    }
                }
            } else {
                try {
                    // read the file's text content (assumes GBK-encoded files)
                    InputStream in = new FileInputStream(root);
                    byte[] b = new byte[in.available()];
                    in.read(b);
                    in.close();
                    String content = new String(b, "GBK");
                    // create a Lucene document
                    Document d = new Document();
                    // add the content: stored, analyzed, with term vectors
                    // (positions and offsets are needed by the highlighter)
                    d.add(new Field(FIELD_NAME, content, Field.Store.YES,
                            Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
                    // also store the path, without analyzing it
                    d.add(new Field("path", root.getAbsolutePath(),
                            Field.Store.YES, Field.Index.NOT_ANALYZED));
                    // write the document to the index
                    writer.addDocument(d);
                    System.out.println("add file: " + root.getAbsolutePath());
                } catch (FileNotFoundException e) {
                    System.out.println("file not found, ignored.");
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

}
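A side note on the search() method above: the Hits class it uses is deprecated in Lucene 2.9 and was removed in 3.0. For reference, here is a minimal sketch of the same lookup via the TopDocs API (the class and method names are mine; INDEX_DIR and the stored "path" field are as above):

import java.io.File;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class TopDocsSearch {
    // same lookup as search() above, but via the non-deprecated TopDocs API
    public static void printTopHits(Query query, String indexDir) throws Exception {
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(indexDir)), true);
        TopDocs topDocs = searcher.search(query, 10); // fetch the top 10 hits
        System.out.println("Total hits: " + topDocs.totalHits);
        for (ScoreDoc sd : topDocs.scoreDocs) {
            // "path" was stored (not analyzed) at index time
            System.out.println(searcher.doc(sd.doc).get("path"));
        }
        searcher.close();
    }
}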

 

PaodingChineseAnalyzer code:

 

 

import java.io.File;
import java.io.Reader;
import java.util.Set;

import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.analyzer.PaodingTokenizer;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;

public class PaodingChineseAnalyzer extends PaodingAnalyzer {

 // default stopword list, used when no external file is supplied
 private static String[] stopWords = {
   "www", "的", "和", "與", "時", "在",
   "是", "被", "所", "那", "這", "有",
   "將", "會", "爲", "對", "了", "過",
   "去" };

 private Set stopSet;

 public PaodingChineseAnalyzer() {
  stopSet = StopFilter.makeStopSet(stopWords);
 }

 public PaodingChineseAnalyzer(String[] stopWords) {
  stopSet = StopFilter.makeStopSet(stopWords);
 }

 // load stopwords from an external file (one word per line)
 public PaodingChineseAnalyzer(File stopwordsFile) {
  try {
   stopSet = WordlistLoader.getWordSet(stopwordsFile);
  } catch (Exception e) {
   e.printStackTrace();
  }
 }

 // tokenize with Paoding, then chain token filters; Lucene ships many other
 // filters that can be added here in the same way
 public final TokenStream tokenStream(String fieldName, Reader reader) {
  TokenStream result = new PaodingTokenizer(reader, getKnife(), createTokenCollector());
  // lowercase before stop filtering so entries like "www" match case-insensitively
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, stopSet);
  return result;
 }
}
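To verify the stopword filtering, compare this analyzer's output with the plain PaodingAnalyzer on a sentence containing stopwords. A minimal sketch (the sentence is arbitrary, and whether a stopword such as 的 appears as a standalone token in the unfiltered output depends on how Paoding segments it):

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class StopwordDemo {
    public static void main(String[] args) throws Exception {
        String text = "今天的天氣是晴天";
        print(new PaodingAnalyzer(), text);        // unfiltered
        print(new PaodingChineseAnalyzer(), text); // 的 and 是 filtered out
    }

    private static void print(Analyzer analyzer, String text) throws Exception {
        TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
        StringBuilder sb = new StringBuilder();
        // pre-2.9 TokenStream API, as in the smoke test earlier
        for (Token t = ts.next(); t != null; t = ts.next()) {
            sb.append(t.term()).append(' ');
        }
        System.out.println(sb);
    }
}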

 

Due to time constraints the explanations and comments are brief; if anything is unclear, feel free to leave a comment and we can learn from each other.

 

 
