Lucene是全文檢索引擎工具包,這裏要注意,它不是完整的搜索引擎,它只是一個工具包,下一篇要講的Elasticsearch纔是搜索引擎。Lucene也是Apache的。
博主是做java後臺開發的,平時不怎麼寫Web端代碼,所以這篇博客的代碼也是博主照着“java知識分享網”上面的一個博客系統來敲的,也算是“現炒現賣”,和大家一起學習。
第一步,要在pom.xml文件裏面添加Lucene依賴,如下。
<!-- Lucene core library. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>5.3.1</version>
</dependency>
<!-- Query parser module; the utility class imports org.apache.lucene.queryparser.classic.QueryParser. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>5.3.1</version>
</dependency>
<!-- Common analyzers; presumably needed for the StandardAnalyzer alternative mentioned in the utility class (confirm). -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>5.3.1</version>
</dependency>
<!-- Chinese analyzer module; the utility class imports org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>5.3.1</version>
</dependency>
<!-- Highlighter module; the utility class imports org.apache.lucene.search.highlight.*. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>5.3.1</version>
</dependency>
第二步,編寫Lucene工具類,如下。這是一個博客系統的Lucene工具類,實現了文檔的增刪改查,註釋我寫得非常詳細,大家應該都能看懂。
package com.zznode.lucene;
import java.io.StringReader;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.assertj.core.util.DateUtil;
import org.mockito.internal.util.StringUtil;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.zznode.entity.Blog;
import io.micrometer.core.instrument.util.StringEscapeUtils;
import lombok.extern.slf4j.Slf4j;
/**
 * Lucene index helper for blog posts: adds, updates, deletes and searches the
 * documents kept in the index directory configured by {@code lucene.indexDir}.
 *
 * <p>Every operation opens its own analyzer, directory and writer/reader inside
 * a try-with-resources block, so they are released even when the operation
 * fails, and no per-call state is kept in fields of this bean.
 *
 * @author Administrator
 */
@Component
public class BlogIndex {

    /** Pattern of the value stored in the {@code releaseDate} field. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    /** Maximum number of hits a search returns. */
    private static final int MAX_HITS = 100;

    /** Length of the plain excerpt used when no highlighted fragment is available. */
    private static final int FALLBACK_SUMMARY_LENGTH = 200;

    @Value("${lucene.indexDir}")
    private String indexDir;

    /**
     * Opens the directory that holds the index files.
     *
     * @return a directory the caller must close
     * @throws Exception if the directory cannot be opened
     */
    private Directory openDirectory() throws Exception {
        return FSDirectory.open(Paths.get(indexDir));
    }

    /**
     * Creates an index writer on the given directory that analyzes text with the
     * given analyzer.
     *
     * @param directory index location
     * @param analyzer  analyzer injected into the writer configuration
     * @return a writer the caller must close
     * @throws Exception if the writer cannot be created
     */
    private IndexWriter openWriter(Directory directory, SmartChineseAnalyzer analyzer) throws Exception {
        return new IndexWriter(directory, new IndexWriterConfig(analyzer));
    }

    /**
     * Builds the index document for a blog. A document can be thought of as one
     * row of a table with the four columns id, title, releaseDate and content.
     *
     * <p>The {@code releaseDate} field holds the date on which the document is
     * built, formatted with {@link #DATE_PATTERN}; it is not read from the blog.
     *
     * @param blog blog to convert
     * @return the document to index
     */
    private Document toDocument(Blog blog) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call
        // instead of sharing one across requests.
        String today = new SimpleDateFormat(DATE_PATTERN).format(new Date());
        Document doc = new Document();
        doc.add(new StringField("id", String.valueOf(blog.getId()), Field.Store.YES));
        doc.add(new TextField("title", blog.getTitle(), Field.Store.YES));
        doc.add(new StringField("releaseDate", today, Field.Store.YES));
        doc.add(new TextField("content", blog.getContentNoTag(), Field.Store.YES));
        return doc;
    }

    /**
     * Adds a blog to the index.
     *
     * @param blog blog to index
     * @throws Exception if the index cannot be opened or written
     */
    public void addIndex(Blog blog) throws Exception {
        // Resources close in reverse order: writer first, then directory, then analyzer.
        try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
                Directory directory = openDirectory();
                IndexWriter writer = openWriter(directory, analyzer)) {
            writer.addDocument(toDocument(blog));
        }
    }

    /**
     * Removes the document whose {@code id} field equals the given blog id.
     *
     * @param blogId id of the blog, as stored in the {@code id} field
     * @throws Exception if the index cannot be opened or written
     */
    public void deleteIndex(String blogId) throws Exception {
        try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
                Directory directory = openDirectory();
                IndexWriter writer = openWriter(directory, analyzer)) {
            writer.deleteDocuments(new Term("id", blogId));
            // Force the deleted documents to be merged away.
            writer.forceMergeDeletes();
            writer.commit();
        }
    }

    /**
     * Replaces the indexed document of a blog, matched by its {@code id} field,
     * with a freshly built one.
     *
     * @param blog blog whose document is replaced
     * @throws Exception if the index cannot be opened or written
     */
    public void updateIndex(Blog blog) throws Exception {
        try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
                Directory directory = openDirectory();
                IndexWriter writer = openWriter(directory, analyzer)) {
            writer.updateDocument(new Term("id", String.valueOf(blog.getId())), toDocument(blog));
        }
    }

    /**
     * Searches the {@code title} and {@code content} fields for the given query
     * and returns at most {@link #MAX_HITS} blogs with highlighted title and
     * content.
     *
     * @param q query string in Lucene query-parser syntax; a {@code null} or
     *          blank value yields an empty result
     * @return matching blogs in the order returned by the searcher, never {@code null}
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public List<Blog> searchBlog(String q) throws Exception {
        List<Blog> blogList = new LinkedList<Blog>();
        if (StringUtils.isBlank(q)) {
            // Nothing to search for; avoid handing an empty query to the parser.
            return blogList;
        }
        try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
                Directory directory = openDirectory();
                IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);

            Query titleQuery = new QueryParser("title", analyzer).parse(q);
            Query contentQuery = new QueryParser("content", analyzer).parse(q);

            // Combine both queries. BooleanClause.Occur describes how a clause takes part:
            // MUST: has to match, MUST_NOT: must not match, SHOULD: may match.
            BooleanQuery.Builder titleOrContent = new BooleanQuery.Builder();
            titleOrContent.add(titleQuery, BooleanClause.Occur.SHOULD);
            titleOrContent.add(contentQuery, BooleanClause.Occur.SHOULD);

            TopDocs hits = searcher.search(titleOrContent.build(), MAX_HITS);

            Highlighter highlighter = newHighlighter(titleQuery);
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                blogList.add(toBlog(searcher.doc(scoreDoc.doc), analyzer, highlighter));
            }
        }
        return blogList;
    }

    /**
     * Creates the highlighter that produces the result summaries, i.e. the text
     * fragments that contain the searched keywords, with the matches wrapped in
     * bold red markup.
     *
     * @param query query whose terms are scored and highlighted
     * @return the configured highlighter
     */
    private Highlighter newHighlighter(Query query) {
        QueryScorer scorer = new QueryScorer(query);
        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
        Highlighter highlighter = new Highlighter(formatter, scorer);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter;
    }

    /**
     * Converts a stored document into a blog, preferring the highlighted best
     * fragment of title and content over the plain stored text.
     *
     * @param doc         stored document of a hit
     * @param analyzer    analyzer used to tokenize the stored text
     * @param highlighter highlighter used to pick the best fragment
     * @return the blog to show in the result list
     * @throws Exception if highlighting fails
     */
    private Blog toBlog(Document doc, SmartChineseAnalyzer analyzer, Highlighter highlighter) throws Exception {
        Blog blog = new Blog();
        blog.setId(Integer.parseInt(doc.get("id")));
        blog.setReleaseDateStr(doc.get("releaseDate"));

        String title = doc.get("title");
        if (title != null) {
            String highlightedTitle = bestFragment(highlighter, analyzer, "title", title);
            blog.setTitle(StringUtils.isBlank(highlightedTitle) ? title : highlightedTitle);
        }

        // Escape the stored content before highlighting, e.g. <div></div> becomes
        // &lt;div&gt;&lt;/div&gt;, so the tags added by the highlighter stay unescaped.
        // NOTE(review): the file imports io.micrometer's StringEscapeUtils; confirm it
        // really offers escapeHtml - commons-lang's StringEscapeUtils may have been intended.
        String content = StringEscapeUtils.escapeHtml(doc.get("content"));
        if (content != null) {
            String highlightedContent = bestFragment(highlighter, analyzer, "content", content);
            if (StringUtils.isBlank(highlightedContent)) {
                // No fragment matched: fall back to the beginning of the content.
                blog.setContent(content.length() <= FALLBACK_SUMMARY_LENGTH
                        ? content
                        : content.substring(0, FALLBACK_SUMMARY_LENGTH));
            } else {
                blog.setContent(highlightedContent);
            }
        }
        return blog;
    }

    /**
     * Tokenizes the text of a field and returns its best highlighted fragment.
     *
     * @param highlighter highlighter to use
     * @param analyzer    analyzer that turns the text into a token stream
     * @param field       name of the field the text belongs to
     * @param text        text to highlight
     * @return the value returned by {@code Highlighter.getBestFragment} for the text
     * @throws Exception if tokenizing or highlighting fails
     */
    private String bestFragment(Highlighter highlighter, SmartChineseAnalyzer analyzer, String field, String text)
            throws Exception {
        TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text));
        return highlighter.getBestFragment(tokenStream, text);
    }
}
下面,我們看一個controller的save方法的例子,代碼如下。在添加或修改博客信息入庫之後,也要相應的在lucene索引實例裏面添加或修改一個文檔。
/**
 * Saves a blog and mirrors the change into the Lucene index.
 *
 * <p>A blog without an id is added, otherwise it is updated. The index is only
 * touched when the service reports at least one affected record, so a failed
 * database write does not leave a stray document in the index. The outcome is
 * written to the response as a JSON object with a boolean {@code success} entry.
 *
 * @param blog     blog bound from the request
 * @param response response the JSON result is written to
 * @return always {@code null}; the response body is written directly
 * @throws Exception if the service, the index or the response write fails
 */
@RequestMapping("/save")
public String save(Blog blog, HttpServletResponse response) throws Exception {
    // A missing id means this is a new record.
    boolean creating = blog.getId() == null;
    int resultTotal = creating ? blogService.add(blog) : blogService.update(blog);
    boolean success = resultTotal > 0;

    if (success) {
        if (creating) {
            // Add a document for the new blog to the Lucene index.
            blogIndex.addIndex(blog);
        } else {
            // Replace the blog's document in the Lucene index.
            blogIndex.updateIndex(blog);
        }
    }

    JSONObject result = new JSONObject();
    result.put("success", success);
    ResponseUtil.write(response, result);
    return null;
}
我們再看一個controller的delete方法的例子,代碼如下。在從數據庫裏面刪除博客信息之後,也要相應的在lucene索引實例裏面刪除這些文檔。
/**
 * Deletes the blogs with the given ids from the database and removes their
 * documents from the Lucene index.
 *
 * <p>Surrounding whitespace of each id is ignored and empty entries are
 * skipped. The JSON result reports {@code success} as {@code true} when at
 * least one id was processed, and {@code false} when the parameter is missing
 * or contains no id.
 *
 * @param ids      comma-separated blog ids; may be {@code null}
 * @param response response the JSON result is written to
 * @return always {@code null}; the response body is written directly
 * @throws Exception if an id is not a number, or the service, the index or the
 *                   response write fails
 */
@RequestMapping("/delete")
public String delete(@RequestParam(value = "ids", required = false) String ids, HttpServletResponse response)
        throws Exception {
    boolean deletedAny = false;
    // The parameter is optional, so it may be absent.
    if (ids != null) {
        for (String rawId : ids.split(",")) {
            String id = rawId.trim();
            if (id.isEmpty()) {
                continue;
            }
            blogService.delete(Integer.parseInt(id));
            // Remove the matching document from the Lucene index; the trimmed id is
            // passed so it equals the indexed term exactly.
            blogIndex.deleteIndex(id);
            deletedAny = true;
        }
    }

    JSONObject result = new JSONObject();
    result.put("success", deletedAny);
    ResponseUtil.write(response, result);
    return null;
}
最後我們再看一個controller的search方法的例子,代碼如下。
/**
 * Searches the Lucene index for a keyword and renders one page of the result.
 *
 * <p>The page number defaults to 1 when the parameter is missing, blank, not a
 * number or smaller than 1. A page beyond the last one yields an empty list
 * instead of an exception.
 *
 * @param q       search keyword
 * @param page    1-based page number as text; may be {@code null}
 * @param request current request, used to resolve the context path
 * @return the model and view of the result page
 * @throws Exception if the search fails
 */
@RequestMapping("/q")
public ModelAndView search(@RequestParam(value = "q", required = false) String q,
        @RequestParam(value = "page", required = false) String page, HttpServletRequest request) throws Exception {
    // Each page shows 3 records.
    final int pageSize = 3;

    int pageNo = 1;
    if (page != null && !page.trim().isEmpty()) {
        try {
            pageNo = Math.max(1, Integer.parseInt(page.trim()));
        } catch (NumberFormatException ignored) {
            // Not a number: keep the default first page.
        }
    }

    // Look the keyword up in the Lucene index.
    List<Blog> blogList = blogIndex.searchBlog(q);
    int total = blogList.size();

    // Clamp both bounds to the result size; long arithmetic guards against
    // overflow for very large page numbers.
    int fromIndex = (int) Math.min((long) (pageNo - 1) * pageSize, (long) total);
    int toIndex = Math.min(fromIndex + pageSize, total);

    ModelAndView mav = new ModelAndView();
    // NOTE(review): q is user input placed into the model as-is; confirm the view escapes it.
    mav.addObject("pageTitle", "搜索關鍵字'" + q + "'結果頁面_java開源博客系統");
    mav.addObject("mainPage", "foreground/blog/result.jsp");
    mav.addObject("blogList", blogList.subList(fromIndex, toIndex));
    mav.addObject("pageCode", this.genUpAndDownPageCode(pageNo, total, q, pageSize,
            request.getServletContext().getContextPath()));
    mav.addObject("q", q);
    mav.addObject("resultTotal", total);
    mav.setViewName("mainTemp");
    return mav;
}
至此,Lucene就介紹完了,博主覺得掌握這些基本就差不多夠了。以後項目中用到更深的東西的時候我們再研究。