分詞的基本原理:
1、分詞是用來對文本按語言特徵按算法進行過濾、分組處理的一種技術。
2、分詞的對象是文本,而不是圖像動畫腳本等等。
3、分詞的方式就是過濾和分組。
4、過濾主要把文本中那些沒有實際意義的字或詞過濾掉。
5、分組就是按照“分詞數據庫”內已添加好的詞,進行匹配。
下面來看Lucene分詞器的使用
package com.qianyan.analyzer;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Demonstrates the classic Lucene analyzers by tokenizing a sample
 * sentence and printing each resulting term (Lucene 2.x Token API).
 */
public class TestAnalyzer {

    public static void main(String[] args) throws IOException {
        // StandardAnalyzer: grammar-based tokenizer that also drops stop words.
        Analyzer analyzer = new StandardAnalyzer();
        //Analyzer analyzer = new SimpleAnalyzer();     // splits on non-letters; drops whitespace and symbols
        //Analyzer analyzer = new WhitespaceAnalyzer(); // splits on whitespace only
        //Analyzer analyzer = new ChineseAnalyzer();    // Lucene's Chinese analyzer: one token per character, symbols dropped
        //Analyzer analyzer = new CJKAnalyzer();        // CJK: overlapping two-character Chinese tokens; English handled like StandardAnalyzer

        String text = "this is test lucene analyzer class!";
        TokenStream stream = analyzer.tokenStream("", new StringReader(text));

        // Reuse one Token instance across iterations, as the 2.x API intends.
        Token reusableToken = new Token();
        while (null != stream.next(reusableToken)) {
            System.out.println(reusableToken.term());
        }
    }
}
對於初學者,我們只需要掌握這些經典的分詞器就足夠了。
但在實際的開發過程中,滿足我們需要的,是一些基於lucene分詞之上的第三方中文分詞包,在這裏我們只介紹“庖丁分詞包”,命名借鑑了“庖丁解牛”這個成語。
庖丁解牛,我國古代成語,出自《莊子》,比喻經過反覆實踐,掌握了事物的客觀規律,做事得心應手,運用自如。
下載網址鏈接:http://code.google.com/p/paoding/
解壓後我們需要對項目添加2個jar包,解壓目錄下的paoding-analysis.jar 和lib下的 commons-logging.jar 。另把dic文件夾複製到我們的項目src目錄下。
package com.qianyan.analyzer;

import java.io.IOException;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Demonstrates the third-party Paoding ("庖丁") Chinese analyzer by
 * tokenizing a Chinese sentence and printing each term.
 */
public class TestPaodingAnalyzer {

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new PaodingAnalyzer();
        String input = "我愛北京天安門!";
        TokenStream ts = analyzer.tokenStream("", new StringReader(input));

        // BUGFIX: pass the reusable Token rather than null. In the Lucene 2.x
        // API, TokenStream.next(Token) clears and refills the supplied
        // instance, so most implementations throw NullPointerException when
        // handed null.
        Token token = new Token();
        while (null != (token = ts.next(token))) {
            System.out.println(token.term());
        }
    }
}
大家通過這個例子可以看到,paoding分詞器相當的強大,它的語法在此不過多介紹,有興趣的朋友可以看解壓後的中文操作手冊。
下面來看下實際中運用
首先根據paoding分詞器建立索引:
package com.qianyan.index;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds a small index with the Paoding analyzer. Each array position i
 * describes one person, stored as one document with four analyzed fields.
 */
public class TestPaodingIndex {

    public static void main(String[] args) throws IOException {
        String[] ids = {"1", "2", "3", "4"};
        String[] names = {"張三", "李四", "李五", "趙六"};
        String[] addresses = {"居住在北京", "南京", "北京海淀", "南寧"};
        String[] birthdays = {"19820720", "19840203", "19770409", "19830130"};

        Analyzer analyzer = new PaodingAnalyzer();
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        // true -> create a fresh index, overwriting any existing one.
        IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

        for (int i = 0; i < ids.length; i++) {
            Document document = new Document();
            document.add(new Field("id", ids[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("name", names[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("address", addresses[i], Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("birthday", birthdays[i], Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(document);
        }

        writer.optimize(); // merge segments for faster searching (Lucene 2.x)
        writer.close();
        // FIX: release the Directory handle as well, matching the other
        // examples in this tutorial; the original leaked it.
        dir.close();
    }
}
然後來看簡單的檢索類
package com.qianyan.search;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Searches the Paoding-built index, demonstrating term, prefix and
 * wildcard queries. Prints score and stored fields for each hit.
 */
public class TestPaodingSearch {

    public static void main(String[] args) throws IOException {
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);

        /*Term term = new Term("address", "北京");
        TermQuery query = new TermQuery(term);
        */ // exact term match
        /*Term term = new Term("name", "張");
        PrefixQuery query = new PrefixQuery(term);*/ // names starting with 張

        // Wildcard query: matches names beginning with 李.
        Term term = new Term("name", "李*");
        WildcardQuery query = new WildcardQuery(term);

        TopDocs topDocs = searcher.search(query, 100); // top 100 hits
        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.print(hits[i].score + " ");
            System.out.print(doc.get("id") + " ");
            System.out.print(doc.get("name") + " ");
            System.out.print(doc.get("address") + " ");
            System.out.println(doc.get("birthday") + " ");
        }

        searcher.close();
        dir.close();
    }
}
下面是來看QueryParser檢索類
package com.qianyan.search;

import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Demonstrates QueryParser over the Paoding-built index: free-text,
 * range, fuzzy, boolean and wildcard query syntax.
 */
public class TestQueryParser {

    public static void main(String[] args) throws IOException, ParseException {
        Analyzer analyzer = new PaodingAnalyzer();
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);

        // "address" is the default field for terms without a field prefix.
        QueryParser parser = new QueryParser("address", analyzer);
        Query query = parser.parse("北京");
        //Query query = parser.parse("birthday:[19820720 TO 19840203]"); // range: [] inclusive, {} exclusive; TO separates the bounds
        //Query query = parser.parse("張~");  // fuzzy search (~), not a prefix search
        //Query query = parser.parse("上海 北京");
        //Query query = parser.parse("(居住 or 北京) and 海淀");
        //Query query = parser.parse("上海 北京 AND NOT name:李四");
        //Query query = parser.parse("name:李*"); // wildcard/prefix on the name field

        // Collect at most 100 hits (Lucene 2.x collector API).
        TopDocCollector collector = new TopDocCollector(100);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            //System.out.println(hits[i].score);
            System.out.print(doc.get("id") + " ");
            System.out.print(doc.get("name") + " ");
            System.out.print(doc.get("address") + " ");
            System.out.println(doc.get("birthday") + " ");
        }

        searcher.close();
        dir.close();
    }
}
下面我們來學習Paoding對文件的索引
package com.qianyan.file;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Indexes every file under a data directory with the Paoding analyzer:
 * one document per file, with the file name and its full text content.
 */
public class TestFileIndex {

    public static void main(String[] args) throws IOException {
        String dataDir = "E:\\lucenedata";
        String indexDir = "E:\\luceneindex";
        File[] files = new File(dataDir).listFiles();

        Analyzer analyzer = new PaodingAnalyzer();
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

        for (int i = 0; i < files.length; i++) {
            // Read the whole file into memory, preserving line breaks.
            // NOTE(review): uses the platform default charset — confirm the
            // data files match it, or pass an explicit charset here.
            StringBuilder contents = new StringBuilder();
            FileInputStream fs = new FileInputStream(files[i].getCanonicalPath());
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs));
            try {
                String line = reader.readLine();
                while (null != line) {
                    contents.append(line).append("\n");
                    line = reader.readLine();
                }
            } finally {
                // BUGFIX: closing the reader also closes the wrapped stream.
                // The original closed the FileInputStream BEFORE the reader
                // (wrong order) and leaked both if readLine() threw.
                reader.close();
            }

            Document document = new Document();
            document.add(new Field("fileName", files[i].getName(), Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("contents", contents.toString(), Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(document);
        }

        writer.close();
        dir.close();
    }
}
然後是對之前索引的檢索類:
package com.qianyan.file;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Searches the file index built by TestFileIndex with a simple term
 * query on the file name, printing score, name and content of each hit.
 */
public class TestSearch {

    public static void main(String[] args) throws IOException {
        String indexDir = "E:/luceneindex";
        Directory dir = FSDirectory.getDirectory(indexDir);
        IndexSearcher searcher = new IndexSearcher(dir);

        // Exact-term query against the analyzed "fileName" field.
        TermQuery query = new TermQuery(new Term("fileName", "星期"));
        TopDocs results = searcher.search(query, 100); // top 100 hits
        ScoreDoc[] hits = results.scoreDocs;

        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.print(hits[i].score + " ");
            System.out.println(doc.get("fileName") + " ");
            System.out.println(doc.get("contents") + " ");
        }

        searcher.close();
        dir.close();
    }
}