I previously looked at Solr's full-text search, whose principles are fairly simple and quick to grasp. This time our project requires full-text search with Elasticsearch, which is said to be more powerful, though I had not studied it in depth before. I will skip the Elasticsearch deployment process and the Spring Boot integration steps here and go straight to my back-end query code.
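Since the integration step is skipped, here is a minimal sketch of how a JestClient bean might be wired up in Spring Boot, for completeness. The server URL and timeout values are placeholder assumptions, not the project's actual configuration; the query code itself follows after it.

import io.searchbox.client.JestClient;
import io.searchbox.client.JestClientFactory;
import io.searchbox.client.config.HttpClientConfig;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class JestConfig {

    @Bean
    public JestClient jestClient() {
        JestClientFactory factory = new JestClientFactory();
        // "http://localhost:9200" is a placeholder; point this at your cluster
        factory.setHttpClientConfig(new HttpClientConfig.Builder("http://localhost:9200")
                .multiThreaded(true)
                .readTimeout(5000)
                .build());
        return factory.getObject();
    }
}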
import com.pridecn.file.domain.EsFileInfo;
import com.pridecn.file.service.ElasticsearchService;
import io.searchbox.client.JestClient;
import io.searchbox.core.Search;
import io.searchbox.core.SearchResult;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

@Service
public class ElasticsearchServiceImpl implements ElasticsearchService {

    @Autowired
    JestClient jestClient;

    @Override
    public List<EsFileInfo> findPublishedFileByKeyWord(String keyWord, int pageNum, int pageSize) {
        // escape special characters in the keyword
        keyWord = QueryParser.escape(keyWord);
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        searchSourceBuilder.query(QueryBuilders.boolQuery()
                .should(QueryBuilders.matchQuery("FILE_NAME", keyWord).analyzer("ik_smart"))
                .should(QueryBuilders.matchQuery("attachment.content", keyWord).analyzer("ik_smart")));
        // set up highlighting
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field("FILE_NAME");          // highlight the file name
        highlightBuilder.field("attachment.content"); // highlight the attachment content
        highlightBuilder.preTags("<span style='color:red'>").postTags("</span>"); // highlight tags
        searchSourceBuilder.highlighter(highlightBuilder);
        // pagination: starting offset and page size
        searchSourceBuilder.from((pageNum - 1) * pageSize);
        searchSourceBuilder.size(pageSize);
        // target index
        Search search = new Search.Builder(searchSourceBuilder.toString())
                .addIndex("book")
                .build();
        List<EsFileInfo> list = new ArrayList<>();
        try {
            // execute the query
            SearchResult result = jestClient.execute(search);
            System.out.println("Total hits: " + result.getTotal() + ", raw response: " + result.getJsonObject());
            List<SearchResult.Hit<EsFileInfo, Void>> hits = result.getHits(EsFileInfo.class);
            for (SearchResult.Hit<EsFileInfo, Void> hit : hits) {
                EsFileInfo source = hit.source;
                // replace the plain fields with their highlighted versions when present
                Map<String, List<String>> highlight = hit.highlight;
                List<String> file_name = highlight.get("FILE_NAME");        // highlighted file name
                if (file_name != null) {
                    source.setFile_name(file_name.get(0));
                }
                List<String> content = highlight.get("attachment.content"); // highlighted content
                if (content != null) {
                    source.getEsDoc().setContent(content.get(0));
                }
                System.out.println("name: " + source.getFile_name());
                System.out.println("author: " + source.getEsDoc().getAuthor());
                System.out.println("content: " + source.getEsDoc().getContent());
                list.add(source);
            }
            return list;
        } catch (IOException e) {
            e.printStackTrace();
            return new ArrayList<>();
        }
    }

    @Override
    public int findPublishedCountByKeyWord(String keyWord) {
        // escape special characters in the keyword
        keyWord = QueryParser.escape(keyWord);
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        searchSourceBuilder.query(QueryBuilders.boolQuery()
                .should(QueryBuilders.queryStringQuery(keyWord).field("FILE_NAME"))
                .should(QueryBuilders.queryStringQuery(keyWord).field("attachment.content")));
        // fetch up to 10000 hits so the count below is not capped at the default
        // page size of 10; note it will still under-count beyond 10000 matches
        searchSourceBuilder.size(10000);
        // target index
        Search search = new Search.Builder(searchSourceBuilder.toString())
                .addIndex("book")
                .build();
        try {
            SearchResult result = jestClient.execute(search);
            System.out.println("Total hits: " + result.getTotal() + ", raw response: " + result.getJsonObject());
            List<SearchResult.Hit<EsFileInfo, Void>> hits = result.getHits(EsFileInfo.class);
            return hits.size();
        } catch (IOException e) {
            e.printStackTrace();
            return 0;
        }
    }
}
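For context, this is roughly how the service might be exposed over HTTP. The controller class, request path, and parameter defaults below are illustrative assumptions rather than the project's actual code:

import com.pridecn.file.domain.EsFileInfo;
import com.pridecn.file.service.ElasticsearchService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;

@RestController
public class FileSearchController { // hypothetical controller, for illustration only

    @Autowired
    ElasticsearchService elasticsearchService;

    // e.g. GET /search?keyWord=lucene&pageNum=1&pageSize=10
    @GetMapping("/search")
    public List<EsFileInfo> search(@RequestParam String keyWord,
                                   @RequestParam(defaultValue = "1") int pageNum,
                                   @RequestParam(defaultValue = "10") int pageSize) {
        return elasticsearchService.findPublishedFileByKeyWord(keyWord, pageNum, pageSize);
    }
}

The two entity classes the service deserializes hits into are shown next.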
package com.pridecn.file.domain;

import com.google.gson.annotations.SerializedName;

/**
 * Result class for the file information returned by an Elasticsearch query.
 */
public class EsFileInfo {

    @SerializedName("FILE_ID")
    private String file_id;

    @SerializedName("FILE_NAME")
    private String file_name;

    @SerializedName("FILE_SAVE_NAME")
    private String file_save_name;

    // the "attachment" field of the ES document is deserialized into this nested object
    @SerializedName("attachment")
    private EsDoc esDoc;

    public String getFile_id() {
        return file_id;
    }

    public void setFile_id(String file_id) {
        this.file_id = file_id;
    }

    public String getFile_name() {
        return file_name;
    }

    public void setFile_name(String file_name) {
        this.file_name = file_name;
    }

    public String getFile_save_name() {
        return file_save_name;
    }

    public void setFile_save_name(String file_save_name) {
        this.file_save_name = file_save_name;
    }

    public EsDoc getEsDoc() {
        return esDoc;
    }

    public void setEsDoc(EsDoc esDoc) {
        this.esDoc = esDoc;
    }
}
package com.pridecn.file.domain;

import com.google.gson.annotations.SerializedName;

/**
 * Attachment entity nested inside a file document.
 */
public class EsDoc {

    // the field names already match the JSON keys inside "attachment",
    // so no @SerializedName mapping is needed here
    // @SerializedName("attachment.author")
    private String author;
    // @SerializedName("attachment.content")
    private String content;
    // @SerializedName("attachment.date")
    private String date;

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }
}
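To illustrate how @SerializedName wires these POJOs to the ES document structure, here is a small sketch that feeds a hand-written sample _source JSON through Gson, the serializer Jest uses. The field values are made up for the example:

import com.google.gson.Gson;
import com.pridecn.file.domain.EsFileInfo;

public class EsFileInfoMappingDemo {
    public static void main(String[] args) {
        // hypothetical _source of one hit from the "book" index
        String json = "{"
                + "\"FILE_ID\":\"1\","
                + "\"FILE_NAME\":\"spec.docx\","
                + "\"FILE_SAVE_NAME\":\"a1b2c3.docx\","
                + "\"attachment\":{\"author\":\"alice\",\"content\":\"full text...\",\"date\":\"2019-01-01\"}"
                + "}";
        EsFileInfo info = new Gson().fromJson(json, EsFileInfo.class);
        // @SerializedName maps FILE_NAME -> file_name and attachment -> esDoc
        System.out.println(info.getFile_name());         // spec.docx
        System.out.println(info.getEsDoc().getAuthor()); // alice
    }
}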
How the various QueryBuilders methods differ with respect to analysis:
/**
 * The default standard analyzer's tokenization rules:
 * most punctuation is stripped and used to split the original value into
 * multiple terms, which are lowercased and placed in the token list.
 * For a not-analyzed field, the original value is placed in the token list as-is.
 *
 * matchQuery works like this: it first checks whether the field is analyzed;
 * if it is, the query string is analyzed first and the resulting terms are
 * matched against the tokens; if not, the query string is matched against the
 * tokens directly.
 *
 * Suppose id and id2 hold the same value; id uses the default analyzer, id2 is
 * not analyzed. Taking wwIF5-vP3J4l3GJ6VN3h as an example:
 * id's token list is [wwif5, vp3j4l3gj6vn3h]
 * id2's token list is [wwIF5-vP3J4l3GJ6VN3h]
 * The following results can be expected:
 * 1. matchQuery("id", "some string") matches whenever "some string" analyzes
 *    to at least one of [wwif5, vp3j4l3gj6vn3h], e.g. wwIF5-vP3J4l3GJ6VN3h,
 *    wwif5-vp3j4l3gj6vn3h, wwIF5, wwif5, wwIF5-6666, and so on.
 * 2. matchQuery("id2", "wwIF5-vP3J4l3GJ6VN3h") matches.
 *
 * Note: when creating an index, if "index": "not_analyzed" is not specified,
 * the field is tokenized with the default analyzer (you can of course specify
 * an analyzer explicitly). Opening
 * http://localhost:9200/_analyze?pretty&analyzer=standard&text=J4Kz1%26LbvjoQFE9gHC7H
 * in a browser shows that J4Kz1&LbvjoQFE9gHC7H is split into j4kz1 and
 * lbvjoqfe9ghc7h.
 *
 * termQuery, under the same tokenization rules, matches the tokens directly
 * without analyzing the query string. With the same id/id2 setup and the
 * example value wwIF5-vP3J4l3GJ6VN3h:
 * 1. termQuery("id", "wwif5") matches.
 * 2. termQuery("id", "vp3j4l3gj6vn3h") matches.
 * 3. termQuery("id2", "wwIF5-vP3J4l3GJ6VN3h") matches.
 */
In summary: a match query first parses and analyzes the query string and then searches with the resulting terms, whereas a term query searches for exactly what was entered, without analyzing it. QueryBuilders.queryStringQuery(keyWord).field("FILE_NAME") behaves similarly to matchQuery.
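A small sketch to make the contrast concrete. The field names and values reuse the id/id2 example above and are purely illustrative:

import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;

public class QueryStyleDemo {
    public static void main(String[] args) {
        // analyzed first, then matched against tokens: finds documents whose
        // "id" field produced the token wwif5
        QueryBuilder match = QueryBuilders.matchQuery("id", "wwIF5-6666");

        // matched verbatim against tokens: only succeeds if this exact token exists
        QueryBuilder term = QueryBuilders.termQuery("id2", "wwIF5-vP3J4l3GJ6VN3h");

        // query_string behaves like matchQuery in that the input is analyzed,
        // but it additionally supports Lucene query syntax
        QueryBuilder qs = QueryBuilders.queryStringQuery("wwif5").field("id");

        // each builder's toString() prints the query DSL JSON it would send
        System.out.println(match);
        System.out.println(term);
        System.out.println(qs);
    }
}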