一、概念介紹
目前系統中存在着大量的報文信息,每條報文的數據量較小,大概2000-3000字節左右,但是總體報文的條數較多。這些報文信息屬於非結構化數據,目前查詢這些海量非結構化數據的速度較慢,而通過全文檢索技術能高效地管理這些非結構化數據。
全文檢索技術是指計算機索引程序通過掃描文章中的每一個詞,對每一個詞建立一個索引,指明該詞在文章中出現的次數和位置,當用戶查詢時,檢索程序就根據事先建立的索引進行查找,並將查找的結果反饋給用戶的檢索方式。這個過程類似於通過字典中的檢索字表查字的過程。
二、全文檢索的實現過程
根據全文檢索的定義可以看出全文檢索大體分兩個過程,索引創建和搜索索引。
索引創建:將數據提取信息,創建索引的過程。
搜索索引:得到用戶的查詢請求,搜索創建的索引,然後返回結果的過程。
索引創建:
1)有一系列被索引文件
2)被索引文件經過語法分析和語言處理形成一系列詞
3)經過索引創建形成詞典和反向索引表
4)通過索引存儲將索引寫入硬盤
搜索索引
1)用戶輸入查詢語句。
2)對查詢語句經過語法分析和語言分析得到一系列詞
3)通過語法分析得到一個查詢樹
4)通過索引存儲將索引讀入到內存
5)利用查詢樹搜索索引,從而得到每個詞(Term)的文檔鏈表,對文檔鏈表進行交,差,並得到結果文檔
6)將搜索到的結果文檔對查詢的相關性進行排序
7)返回查詢結果給用戶
三、簡單的代碼
HelloWorld 代碼
package com.liuzm.lucene.day1;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class HelloWorld {
    // Sample documents used as the data source for the index.
    private String doc1 = "hello java";
    private String doc2 = "hello java world";
    private String doc3 = "hello lucene world";
    // Filesystem directory where the index is stored.
    private String path = "D:/DevelopTools/eclipse4ee/workspace/lucene/firstIndex";
    private Version matchVersion = Version.LUCENE_47;

    /**
     * Creates the index library.
     *
     * Directory : abstraction over the storage that holds a Lucene index
     * Analyzer  : lexical analyzer; decides how text is split into terms
     * Document  : the basic stored object in Lucene, comparable to a table row
     * Field     : one named value of a document, comparable to a column
     *
     * NOTE(review): the method name keeps the original "creatIndex" spelling
     * (missing 'e') so any existing references to it keep working.
     *
     * @throws Exception if the index directory cannot be opened or written
     */
    @Test
    public void creatIndex() throws Exception {
        // Output directory for the index.
        Directory directory = FSDirectory.open(new File(path));
        // Analyzer used to tokenize field values.
        Analyzer analyzer = new StandardAnalyzer(matchVersion);
        // Configuration for the index writer.
        IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
        // OpenMode.CREATE drops any existing index so every run starts fresh.
        config.setOpenMode(OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(directory, config);
        try {
            // Manually configured field type, equivalent to TextField + Store.YES.
            FieldType fieldType = new FieldType();
            fieldType.setIndexed(true);   // field is searchable
            fieldType.setStored(true);    // raw value kept in the stored-fields area
            fieldType.setTokenized(true); // value is analyzed into individual terms

            Document document1 = new Document();
            document1.add(new Field("title", "doc1", fieldType));
            document1.add(new Field("content", doc1, fieldType));
            indexWriter.addDocument(document1);

            // TextField(..., Store.YES) is the shorthand for the same settings.
            Document document2 = new Document();
            document2.add(new TextField("title", "doc2", Store.YES));
            document2.add(new TextField("content", doc2, Store.YES));
            indexWriter.addDocument(document2);

            Document document3 = new Document();
            document3.add(new TextField("title", "doc3", Store.YES));
            document3.add(new TextField("content", doc3, Store.YES));
            indexWriter.addDocument(document3);
        } finally {
            // close() commits pending documents and releases the write lock,
            // even when one of the addDocument calls above failed.
            indexWriter.close();
        }
    }

    /**
     * Searches the index created by {@link #creatIndex()} for the term
     * "hello" in the "content" field and prints the matching documents.
     *
     * @throws Exception if the index cannot be opened or the query is invalid
     */
    @Test
    public void searchIndex() throws Exception {
        // Query keyword.
        String queryStr = "hello";
        // Directory the index is read from.
        Directory directory = FSDirectory.open(new File(path));
        // Must analyze queries the same way the indexed text was analyzed.
        Analyzer analyzer = new StandardAnalyzer(matchVersion);
        DirectoryReader directoryReader = DirectoryReader.open(directory);
        try {
            IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
            // Parses the user query string against the "content" field.
            QueryParser parser = new QueryParser(matchVersion, "content", analyzer);
            Query query = parser.parse(queryStr);
            // Retrieve at most the top 100 matching documents.
            TopDocs topDocs = indexSearcher.search(query, 100);
            // Total number of hits.
            System.out.println(topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs) {
                // Lucene-internal document id.
                int docId = scoreDoc.doc;
                Document document = indexSearcher.doc(docId);
                System.out.println(document);
                System.out.println("docId: " + docId);
                System.out.println("title: " + document.get("title"));
                System.out.println("content : " + document.get("content"));
                System.out.println();
            }
        } finally {
            // The original never closed the reader, leaking file handles.
            directoryReader.close();
        }
    }
}
對於索引的CRUD
package com.liuzm.lucene.day1;
import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
public class IndexCRUD {
    // Sample documents used as the data source for the index.
    private String doc1 = "hello java";
    private String doc2 = "hello java world";
    private String doc3 = "hello lucene world";
    // Filesystem directory where the index is stored.
    private String path = "D:/DevelopTools/eclipse4ee/workspace/lucene/CRUDIndex";
    private Version matchVersion = Version.LUCENE_47;

    /**
     * Creates the index library with three documents, each carrying an
     * "id" field so it can later be updated or deleted by id.
     *
     * NOTE(review): the method name keeps the original "creatIndex" spelling
     * (missing 'e') so any existing references to it keep working.
     *
     * @throws Exception if the index directory cannot be opened or written
     */
    @Test
    public void creatIndex() throws Exception {
        // Output directory for the index.
        Directory directory = FSDirectory.open(new File(path));
        // SmartChineseAnalyzer handles mixed Chinese/English text.
        Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
        IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
        // OpenMode.CREATE drops any existing index so every run starts fresh.
        config.setOpenMode(OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(directory, config);
        try {
            Document document1 = new Document();
            document1.add(new TextField("id", "1", Store.YES));
            document1.add(new TextField("title", "doc1", Store.YES));
            document1.add(new TextField("content", doc1, Store.YES));
            indexWriter.addDocument(document1);

            Document document2 = new Document();
            document2.add(new TextField("id", "2", Store.YES));
            document2.add(new TextField("title", "doc2", Store.YES));
            document2.add(new TextField("content", doc2, Store.YES));
            indexWriter.addDocument(document2);

            Document document3 = new Document();
            document3.add(new TextField("id", "3", Store.YES));
            document3.add(new TextField("title", "doc3", Store.YES));
            document3.add(new TextField("content", doc3, Store.YES));
            indexWriter.addDocument(document3);
        } finally {
            // close() commits pending documents and releases the write lock.
            indexWriter.close();
        }
    }

    /**
     * Deletes documents from the index, here by running a query against
     * the "id" field (an alternative to deleting by Term, shown commented).
     *
     * @throws Exception if the index cannot be opened or the query is invalid
     */
    @Test
    public void deleteIndex() throws Exception {
        Directory directory = FSDirectory.open(new File(path));
        Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
        IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
        IndexWriter indexWriter = new IndexWriter(directory, config);
        try {
            // Alternative: delete by exact term.
            // indexWriter.deleteDocuments(new Term("id","2"));

            // Delete every document matching the parsed query id:3.
            QueryParser parser = new QueryParser(matchVersion, "id", analyzer);
            Query query = parser.parse("3");
            indexWriter.deleteDocuments(query);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Updates the document whose "id" term is "2". Lucene implements
     * update as delete-then-add, so the replacement document must carry
     * all of its fields, not just the changed ones.
     *
     * @throws Exception if the index cannot be opened or written
     */
    @Test
    public void updateIndex() throws Exception {
        Directory directory = FSDirectory.open(new File(path));
        Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
        IndexWriterConfig config = new IndexWriterConfig(matchVersion, analyzer);
        IndexWriter indexWriter = new IndexWriter(directory, config);
        try {
            // Replacement document for id 2.
            Document document = new Document();
            document.add(new TextField("id", "2", Store.YES));
            document.add(new TextField("title", "doc2", Store.YES));
            document.add(new TextField("content", "hello更改後的doc2", Store.YES));
            // document.add(new TextField("content", "更改後的doc2", Store.YES));
            // NOTE(review): the original author observed the content could not
            // start with a Chinese character — likely an analyzer effect; verify.
            indexWriter.updateDocument(new Term("id", "2"), document);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Searches the "content" field for "hello" and prints every hit.
     *
     * @throws Exception if the index cannot be opened or the query is invalid
     */
    @Test
    public void searchIndex() throws Exception {
        // Query keyword.
        String QueryStr = "hello";
        Directory directory = FSDirectory.open(new File(path));
        Analyzer analyzer = new SmartChineseAnalyzer(matchVersion);
        IndexReader indexReader = IndexReader.open(directory);
        try {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            QueryParser queryParser = new QueryParser(matchVersion, "content", analyzer);
            Query query = queryParser.parse(QueryStr);
            // Retrieve at most the top 100 matching documents.
            TopDocs topDocs = indexSearcher.search(query, 100);
            System.out.println(topDocs.totalHits);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs) {
                int docId = scoreDoc.doc;
                Document doc = indexSearcher.doc(docId);
                System.out.println("docId :" + docId);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title : " + doc.get("title"));
                System.out.println("content: " + doc.get("content"));
                System.out.println();
            }
        } finally {
            // The original never closed the reader, leaking file handles.
            indexReader.close();
        }
    }
}
分詞器測試
package com.liuzm.lucene.day1;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.junit.Test;
/**
* 分詞器
* @author Administrator
*
*/
/**
 * Compares the tokenization behavior of several Lucene analyzers on
 * English and Chinese sample text.
 */
public class AnalyzerTest {
    private String en = "ho my lady gaga";
    private String cn = "迅雷不及掩耳盜鈴兒響叮當仁不讓";
    private String str = "源代碼教育FullText框架的學習,哈哈";
    private Version matchVersion = Version.LUCENE_47;

    /**
     * Tokenizes {@code str} with the given analyzer and prints each token.
     *
     * Follows the TokenStream contract: reset() before consuming,
     * end() after the last incrementToken(), close() always.
     * The original version skipped end()/close() and swallowed IOException
     * with printStackTrace, which let failing tests appear to pass.
     *
     * @param analyzer the analyzer under test
     * @param str      the text to tokenize
     */
    public void testAnalyzer(Analyzer analyzer, String str) {
        try {
            // Obtain the token stream for the text; field name is arbitrary here.
            TokenStream tokenStream = analyzer.tokenStream("name", str);
            try {
                tokenStream.reset();
                // Each incrementToken() advances to the next token; printing the
                // stream shows its current attribute state.
                while (tokenStream.incrementToken()) {
                    System.out.println(tokenStream);
                }
                tokenStream.end();
            } finally {
                tokenStream.close();
            }
        } catch (IOException e) {
            // Fail loudly instead of silently passing the test.
            throw new RuntimeException("tokenization failed", e);
        }
    }

    @Test
    public void testStandardAnalyzer() throws Exception {
        testAnalyzer(new StandardAnalyzer(matchVersion), cn);
    }

    @Test
    public void testSimpleAnalyzer() throws Exception {
        testAnalyzer(new SimpleAnalyzer(matchVersion), cn);
    }

    @Test
    public void testChineseAnalyzer() throws Exception {
        testAnalyzer(new ChineseAnalyzer(), cn);
    }

    /**
     * Bigram (two-character) tokenization for CJK text.
     */
    @Test
    public void testCJKAnalyzer() throws Exception {
        testAnalyzer(new CJKAnalyzer(matchVersion), cn);
    }

    @Test
    public void testClassicAnalyzer() throws Exception {
        testAnalyzer(new ClassicAnalyzer(matchVersion), cn);
    }

    /**
     * Dictionary-based Chinese segmentation with an inline stop-word set.
     *
     * @throws Exception never; kept for signature compatibility
     */
    @Test
    public void testSmartCnAnalyzer1() throws Exception {
        CharArraySet stopWords = new CharArraySet(matchVersion, 10, true);
        stopWords.add("的");
        stopWords.add(",");
        testAnalyzer(new SmartChineseAnalyzer(matchVersion, stopWords), str);
    }

    /**
     * Dictionary-based Chinese segmentation with stop words loaded from
     * a classpath resource (one word per line, "//" starts a comment).
     *
     * @throws Exception if the stop-word resource cannot be read
     */
    @Test
    public void testSmartCnAnalyzer2() throws Exception {
        Reader reader = IOUtils.getDecodingReader(
                this.getClass().getClassLoader().getResourceAsStream("stopwords.txt"),
                IOUtils.CHARSET_UTF_8);
        CharArraySet stopWords;
        try {
            stopWords = CharArraySet.unmodifiableSet(
                    WordlistLoader.getWordSet(reader, "//", Version.LUCENE_47));
        } finally {
            // The original leaked the reader; closing twice is harmless even if
            // WordlistLoader already closed it internally — TODO confirm for 4.7.
            reader.close();
        }
        testAnalyzer(new SmartChineseAnalyzer(matchVersion, stopWords), str);
    }
}