背景:
最近做了一個App需要使用的搜索建議功能,效果就像我們在使用百度搜索的時候:
可以看到,每輸入一個字符就會發送一個請求;
這個功能看似很高大上,其實做一個簡單的實現還是很簡單的。
原理:
這個技術,主要有以下的技術點:
- Elasticsearch7.4.1(以下稱爲ES)推薦
- ik中文分詞插件
首先需要將數據存放到ES中,當然存放的時候需要做一些簡單的處理,需要將搜索的field做分詞放在列表中,存入到ES之中。由於數據爬取是採用的Python,所以在這裏貼Python代碼,數據採集是使用的JD的商品數據
def gen_suggest(index, info_tuple):
    """Build the completion-suggester payload from weighted text fields.

    Each ``(text, weight)`` pair in ``info_tuple`` is tokenized with the
    ``ik_max_word`` analyzer of the ``jd_product`` index. Tokens longer than
    one character become suggestion inputs that carry the pair's weight.

    Pairs are processed in order and every token is emitted at most once: a
    token already produced for an earlier pair is not repeated for a later one.

    Args:
        index: Currently unused; kept so existing callers keep working. The
            index name passed to the analyzer is hard-coded to "jd_product".
        info_tuple: Iterable of ``(text, weight)`` pairs. Pairs with empty
            text are skipped.

    Returns:
        A list of ``{"input": [...], "weight": weight}`` dicts, one for each
        pair that contributed at least one new token.
    """
    used_word = set()
    suggest = []
    for text, wight in info_tuple:
        if text:
            words = es.indices.analyze(
                index="jd_product",
                body={"analyzer": "ik_max_word", "text": "{0}".format(text)},
            )
            # Single-character tokens are dropped from the suggestions.
            analyzed_word = {r["token"] for r in words["tokens"] if len(r["token"]) > 1}
            print(analyzed_word)
            naw_words = analyzed_word - used_word
        else:
            naw_words = set()
        if naw_words:
            # Record what has been emitted; without this the subtraction above
            # never removes anything and later pairs repeat earlier tokens.
            used_word.update(naw_words)
            suggest.append({"input": list(naw_words), "weight": wight})
    print(suggest)
    return suggest
def save_ES(result):
    """Copy one crawled product record onto a new JD_Product and save it.

    Args:
        result: Mapping of the crawled record, keyed by the hyphenated names
            listed in ``field_map`` below.
    """
    # (attribute on the document, key in the crawled record), in the order
    # the attributes are assigned.
    field_map = (
        ("by_self", "by-self"),
        ("comment_cnt", "comment-cnt"),
        ("title", "title"),
        ("pid", "pid"),
        ("image_data_lazy_img", "image-data-lazy-img"),
        ("image_src", "image-src"),
        ("price", "price"),
        ("shop_name", "shop-name"),
    )
    jd = JD_Product()
    for attr_name, record_key in field_map:
        setattr(jd, attr_name, result[record_key])
    # Suggestion data: title tokens weighted 10, shop-name tokens weighted 7.
    weighted_fields = ((jd.title, 10), (jd.shop_name, 7))
    jd.suggest = gen_suggest(JD_Product, weighted_fields)
    jd.save()
此時存放在ES中的數據是這樣的:
{
"_index": "jd_product",
"_type": "_doc",
"_id": "opSOOXEBy66jXuB0CIVc",
"_version": 1,
"_seq_no": 15536,
"_primary_term": 1,
"found": true,
"_source": {
"by_self": "",
"comment_cnt": "2200+",
"title": "京東國際貝玲妃(Benefit)妝前乳/打底霜 毛孔遮蓋臉部底霜(反恐精英/反孔) 22ml 【部分滿199減100】護膚彩妝一站購全,點擊進入點擊進入",
"pid": "25715154185",
"image_data_lazy_img": "//img10.360buyimg.com/n7/jfs/t1/105249/25/17056/74155/5e84303eE9afd3891/fd91ef9acfdff12f.jpg",
"image_src": "//img10.360buyimg.com/n7/jfs/t1/105249/25/17056/74155/5e84303eE9afd3891/fd91ef9acfdff12f.jpg",
"price": "¥198.00",
"shop_name": "星線美妝海外專營店",
"suggest": [
{
"input": [
"22",
"點擊",
"22ml",
"精英",
"100",
"ml",
"反恐",
"進入",
"199",
"京東",
"遮蓋",
"反恐精英",
"打底",
"國際",
"一站",
"部分",
"毛孔",
"護膚",
"臉部",
"benefit"
],
"weight": 10
},
{
"input": [
"專營店",
"海外",
"專營"
],
"weight": 7
}
]
}
}
可以看到title已經被分詞了;這樣就可以實現API了。
實現:
suggest的API採用SpringBoot實現,由於ES版本比較高(7.4.1),所以使用原生的RestHighLevelClient來進行操作。
添加相關依賴:
<!-- Elasticsearch core library, pinned to 7.4.1 -->
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>7.4.1</version>
</dependency>
<!-- Elasticsearch high-level REST client, same version as the core library -->
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>elasticsearch-rest-high-level-client</artifactId>
<version>7.4.1</version>
</dependency>
此外需要在pom.xml中添加以下配置,避免出錯
<properties>
<java.version>1.8</java.version>
<!-- NOTE(review): presumably overrides the Elasticsearch version managed by the parent POM so that it matches the 7.4.1 client dependencies; confirm against the parent in use -->
<elasticsearch.version>7.4.1</elasticsearch.version>
</properties>
ESClient工廠:
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestClientBuilder;
import org.elasticsearch.client.RestHighLevelClient;
import java.io.IOException;
/**
 * Singleton holder that builds the Elasticsearch low-level and high-level REST
 * clients from statically configured connection settings.
 *
 * <p>Usage: call one of the {@code build(...)} methods to set the host and
 * limits, then {@link #init()} to create the clients, and {@link #close()} to
 * release them.
 */
public class ESClientSpringFactory {
    /** Connect timeout in milliseconds. */
    public static int CONNECT_TIMEOUT_MILLIS = 1000;
    /** Socket timeout in milliseconds. */
    public static int SOCKET_TIMEOUT_MILLIS = 30000;
    /** Timeout in milliseconds for obtaining a connection from the pool. */
    public static int CONNECTION_REQUEST_TIMEOUT_MILLIS = 500;
    /** Maximum number of connections per route. */
    public static int MAX_CONN_PER_ROUTE = 10;
    /** Maximum number of connections in total. */
    public static int MAX_CONN_TOTAL = 30;

    private static HttpHost HTTP_HOST;
    private static final ESClientSpringFactory esClientSpringFactory = new ESClientSpringFactory();

    private RestClientBuilder builder;
    private RestClient restClient;
    private RestHighLevelClient restHighLevelClient;

    private ESClientSpringFactory() {}

    /**
     * Stores the host and connection limits and returns the shared factory.
     *
     * @param httpHost           Elasticsearch host to connect to
     * @param maxConnectNum      maximum total connections
     * @param maxConnectPerRoute maximum connections per route
     * @return the singleton factory; {@link #init()} must still be called
     */
    public static ESClientSpringFactory build(HttpHost httpHost,
                                              Integer maxConnectNum, Integer maxConnectPerRoute) {
        HTTP_HOST = httpHost;
        MAX_CONN_TOTAL = maxConnectNum;
        MAX_CONN_PER_ROUTE = maxConnectPerRoute;
        return esClientSpringFactory;
    }

    /**
     * Stores the host, timeouts and connection limits and returns the shared factory.
     *
     * @param httpHost              Elasticsearch host to connect to
     * @param connectTimeOut        connect timeout in milliseconds
     * @param socketTimeOut         socket timeout in milliseconds
     * @param connectionRequestTime connection-request timeout in milliseconds
     * @param maxConnectNum         maximum total connections
     * @param maxConnectPerRoute    maximum connections per route
     * @return the singleton factory; {@link #init()} must still be called
     */
    public static ESClientSpringFactory build(HttpHost httpHost, Integer connectTimeOut, Integer socketTimeOut,
                                              Integer connectionRequestTime, Integer maxConnectNum,
                                              Integer maxConnectPerRoute) {
        HTTP_HOST = httpHost;
        CONNECT_TIMEOUT_MILLIS = connectTimeOut;
        SOCKET_TIMEOUT_MILLIS = socketTimeOut;
        CONNECTION_REQUEST_TIMEOUT_MILLIS = connectionRequestTime;
        MAX_CONN_TOTAL = maxConnectNum;
        MAX_CONN_PER_ROUTE = maxConnectPerRoute;
        return esClientSpringFactory;
    }

    /** Creates the clients from the currently stored settings. */
    public void init() {
        builder = RestClient.builder(HTTP_HOST);
        setConnectTimeOutConfig();
        setMutiConnectConfig();
        // The high-level client builds its own low-level client from the builder.
        // Reuse that instance instead of calling builder.build() a second time,
        // so a single HTTP client exists and close() releases all of it.
        restHighLevelClient = new RestHighLevelClient(builder);
        restClient = restHighLevelClient.getLowLevelClient();
        System.out.println("init factory");
    }

    /** Applies the connect, socket and connection-request timeouts to the builder. */
    public void setConnectTimeOutConfig() {
        builder.setRequestConfigCallback(requestConfigBuilder -> {
            requestConfigBuilder.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            requestConfigBuilder.setSocketTimeout(SOCKET_TIMEOUT_MILLIS);
            requestConfigBuilder.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MILLIS);
            return requestConfigBuilder;
        });
    }

    /** Applies the total and per-route connection limits to the async HTTP client. */
    public void setMutiConnectConfig() {
        builder.setHttpClientConfigCallback(httpClientBuilder -> {
            httpClientBuilder.setMaxConnTotal(MAX_CONN_TOTAL);
            httpClientBuilder.setMaxConnPerRoute(MAX_CONN_PER_ROUTE);
            return httpClientBuilder;
        });
    }

    /**
     * @return the low-level client, or {@code null} before {@link #init()}
     */
    public RestClient getClient() {
        return restClient;
    }

    /**
     * @return the high-level client, or {@code null} before {@link #init()}
     */
    public RestHighLevelClient getRhlClient() {
        return restHighLevelClient;
    }

    /** Closes the high-level client, which also closes the low-level client it owns. */
    public void close() {
        if (restHighLevelClient != null) {
            try {
                restHighLevelClient.close();
            } catch (IOException e) {
                // Best-effort shutdown: report the failure and carry on.
                e.printStackTrace();
            }
        }
        System.out.println("close client");
    }
}
ESConfig:
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Scope;
/**
 * Spring configuration that exposes the Elasticsearch host, the client
 * factory and both REST clients as beans, using values read from the
 * {@code elasticSearch.*} properties.
 */
@Configuration
@Slf4j
@ComponentScan(basePackageClasses=ESClientSpringFactory.class)
public class ESConfig {

    @Value("${elasticSearch.host}")
    private String esHost;

    @Value("${elasticSearch.port}")
    private int esPort;

    @Value("${elasticSearch.client.connectNum}")
    private Integer maxConnections;

    @Value("${elasticSearch.client.connectPerRoute}")
    private Integer maxConnectionsPerRoute;

    /** @return the Elasticsearch endpoint, addressed over plain HTTP */
    @Bean
    public HttpHost httpHost() {
        final String scheme = "http";
        return new HttpHost(esHost, esPort, scheme);
    }

    /**
     * @return the shared client factory; the bean declaration names
     *         {@code init} as its init method and {@code close} as its
     *         destroy method
     */
    @Bean(initMethod = "init", destroyMethod = "close")
    public ESClientSpringFactory getFactory() {
        final HttpHost target = httpHost();
        return ESClientSpringFactory.build(target, maxConnections, maxConnectionsPerRoute);
    }

    /** @return the low-level REST client held by the factory */
    @Bean
    @Scope("singleton")
    public RestClient getRestClient() {
        final ESClientSpringFactory factory = getFactory();
        return factory.getClient();
    }

    /** @return the high-level REST client held by the factory */
    @Bean
    @Scope("singleton")
    public RestHighLevelClient getRHLClient() {
        final ESClientSpringFactory factory = getFactory();
        return factory.getRhlClient();
    }
}
配置文件(properties格式)中的配置數據:
elasticSearch.host=ip地址
elasticSearch.port=9200
elasticSearch.client.connectNum=10
elasticSearch.client.connectPerRoute=50
編寫一個測試:
@Qualifier("getRHLClient")
@Autowired
RestHighLevelClient restHighLevelClient;
/**
 * Sends a completion-suggest request for a fixed prefix to the
 * {@code jd_product} index and prints the returned suggestions.
 *
 * @throws Exception if the search request fails
 */
@Test
public void getSuggest() throws Exception {
    String data = "手機";

    // Fuzzy options applied to the prefix lookup.
    FuzzyOptions fuzzy = FuzzyOptions.builder()
            .setFuzzyPrefixLength(1)
            .setFuzziness(0)
            .setFuzzyMinLength(3)
            .build();

    // Completion suggestion on the "suggest" field, registered as "my-suggest".
    CompletionSuggestionBuilder completionSuggestion =
            SuggestBuilders.completionSuggestion("suggest").prefix(data, fuzzy);
    SuggestBuilder suggestBuilder = new SuggestBuilder();
    suggestBuilder.addSuggestion("my-suggest", completionSuggestion);

    SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
    searchSourceBuilder.suggest(suggestBuilder);

    // Only return the listed source fields.
    String[] includes = {"shop_name", "title"};
    String[] excludes = {};
    searchSourceBuilder.fetchSource(includes, excludes);

    SearchRequest searchRequest = new SearchRequest("jd_product"); // index name
    searchRequest.source(searchSourceBuilder);

    SearchResponse response = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
    // The suggest section of the response holds the completion results.
    Suggest suggestions = response.getSuggest();
    System.out.println("suggestions = " + suggestions);
}
獲取到返回的數據: