SpringBoot整合Elasticsearch7.4.1實現建議搜索(Python爬取數據並存es)

背景:

最近做的一個App需要用到搜索建議的功能,效果就像我們在使用百度搜索的時候:

image-20200404205652051

可以看到,每輸入一個字符就會發送一個請求;

image-20200404205929411

這個功能看似很高大上,其實做一個簡單的實現還是很簡單的。

原理:

這個技術,主要有以下的技術點:

  • Elasticsearch7.4.1(以下稱爲ES)的搜索建議(suggest)功能
  • ik中文分詞插件

首先需要將數據存放到ES中。存放的時候需要做一些簡單的處理:將要搜索的field分詞後放在列表中,再存入ES。由於數據爬取採用的是Python,所以在這裏貼出Python代碼;採集的是JD的商品數據。

# 由傳過來的字符生成suggest的數組
def gen_suggest(index, info_tuple):
    """Build the completion-suggest entries for one document.

    Each ``(text, weight)`` pair in ``info_tuple`` is analyzed with the
    ``ik_max_word`` analyzer. Tokens longer than one character that were not
    already emitted for an earlier pair become the ``input`` list of one
    suggest entry carrying that pair's weight.

    Args:
        index: Accepted for interface compatibility but not used; the analyze
            call always targets the ``"jd_product"`` index.
        info_tuple: Iterable of ``(text, weight)`` pairs, in priority order.
            Pairs whose ``text`` is falsy contribute nothing.

    Returns:
        A list of ``{"input": [...], "weight": weight}`` dicts, one per pair
        that produced at least one new token.

    Note:
        Relies on a module-level ``es`` client that is defined outside this
        function.
    """
    used_word = set()
    suggest = []
    for text, weight in info_tuple:
        if text:
            words = es.indices.analyze(index="jd_product", body={"analyzer":"ik_max_word", "text": "{0}".format(text)})
            # Single-character tokens are dropped.
            analyzed_word = set([r["token"] for r in words["tokens"] if len(r["token"])>1])
            print(analyzed_word)
            new_words = analyzed_word - used_word
            # Record what has been emitted so a later (lower-priority) text
            # does not repeat the same token under a different weight.
            used_word |= new_words
        else:
            new_words = set()
        if new_words:
            suggest.append({"input": list(new_words),"weight":weight})
    print(suggest)
    return suggest


def save_ES(result):
    """Copy one scraped product record into a ``JD_Product`` and save it.

    Args:
        result: Mapping holding the scraped values under the hyphenated keys
            listed in ``field_map`` below. A missing key raises ``KeyError``.

    The ``suggest`` attribute is generated from the title (weight 10) and the
    shop name (weight 7) before the document is saved.
    """
    # (document attribute, key in the scraped record), in assignment order.
    field_map = (
        ("by_self", "by-self"),
        ("comment_cnt", "comment-cnt"),
        ("title", "title"),
        ("pid", "pid"),
        ("image_data_lazy_img", "image-data-lazy-img"),
        ("image_src", "image-src"),
        ("price", "price"),
        ("shop_name", "shop-name"),
    )
    product = JD_Product()
    for attr_name, record_key in field_map:
        setattr(product, attr_name, result[record_key])

    # Build the suggest data from the weighted text sources.
    weighted_sources = ((product.title, 10), (product.shop_name, 7))
    product.suggest = gen_suggest(JD_Product, weighted_sources)
    product.save()

此時存放在ES中的數據是這樣的:

{
	"_index": "jd_product",
	"_type": "_doc",
	"_id": "opSOOXEBy66jXuB0CIVc",
	"_version": 1,
	"_seq_no": 15536,
	"_primary_term": 1,
	"found": true,
	"_source": {
		"by_self": "",
		"comment_cnt": "2200+",
		"title": "京東國際貝玲妃(Benefit)妝前乳/打底霜 毛孔遮蓋臉部底霜(反恐精英/反孔) 22ml 【部分滿199減100】護膚彩妝一站購全,點擊進入點擊進入",
		"pid": "25715154185",
		"image_data_lazy_img": "//img10.360buyimg.com/n7/jfs/t1/105249/25/17056/74155/5e84303eE9afd3891/fd91ef9acfdff12f.jpg",
		"image_src": "//img10.360buyimg.com/n7/jfs/t1/105249/25/17056/74155/5e84303eE9afd3891/fd91ef9acfdff12f.jpg",
		"price": "¥198.00",
		"shop_name": "星線美妝海外專營店",
		"suggest": [
			{
				"input": [
					"22",
					"點擊",
					"22ml",
					"精英",
					"100",
					"ml",
					"反恐",
					"進入",
					"199",
					"京東",
					"遮蓋",
					"反恐精英",
					"打底",
					"國際",
					"一站",
					"部分",
					"毛孔",
					"護膚",
					"臉部",
					"benefit"
				],
				"weight": 10
			},
			{
				"input": [
					"專營店",
					"海外",
					"專營"
				],
				"weight": 7
			}
		]
	}
}

可以看到title已經分詞了;這樣就可以實現API了。

實現:

suggest的API採用SpringBoot實現,由於ES版本比較高(7.4.1),所以使用原生的RestHighLevelClient來進行操作。

添加相關依賴:

    <!-- Elasticsearch core library, pinned to 7.4.1 -->
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch</artifactId>
        <version>7.4.1</version>
    </dependency>
    <!-- High-level REST client (RestHighLevelClient), same version as the core library -->
    <dependency>
        <groupId>org.elasticsearch.client</groupId>
        <artifactId>elasticsearch-rest-high-level-client</artifactId>
        <version>7.4.1</version>
    </dependency>

此外需要在pom.xml中添加以下配置,覆蓋Spring Boot默認管理的Elasticsearch版本,避免因依賴版本不一致而出錯:

    <properties>
        <java.version>1.8</java.version>
        <!-- NOTE(review): presumably overrides the Elasticsearch version managed by the
             Spring Boot parent POM so all ES artifacts resolve to 7.4.1; confirm against
             the parent's dependency management. -->
        <elasticsearch.version>7.4.1</elasticsearch.version>
    </properties>

ESClient工廠:

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestClientBuilder;
import org.elasticsearch.client.RestHighLevelClient;

import java.io.IOException;

/**
 * Shared factory that builds and owns the Elasticsearch REST clients.
 *
 * <p>Usage: call one of the static {@code build(...)} methods to store the target host and the
 * connection limits, then {@link #init()} to create the clients, and {@link #close()} when done.
 * Every {@code build(...)} overload returns the same shared instance, and the settings are kept
 * in static fields, so a later {@code build(...)} call overwrites the earlier settings.
 */
public class ESClientSpringFactory {

    // Connection settings read by init(). Kept as public mutable statics so existing
    // code that reads or assigns them keeps working.
    public static int CONNECT_TIMEOUT_MILLIS = 1000;
    public static int SOCKET_TIMEOUT_MILLIS = 30000;
    public static int CONNECTION_REQUEST_TIMEOUT_MILLIS = 500;
    public static int MAX_CONN_PER_ROUTE = 10;
    public static int MAX_CONN_TOTAL = 30;

    private static HttpHost HTTP_HOST;
    private RestClientBuilder builder;
    private RestClient restClient;
    private RestHighLevelClient restHighLevelClient;

    private static ESClientSpringFactory esClientSpringFactory = new ESClientSpringFactory();

    private ESClientSpringFactory(){}

    /**
     * Stores the host and connection limits, keeping the current timeout settings.
     *
     * @param httpHost           Elasticsearch node to connect to
     * @param maxConnectNum      maximum total connections; must not be null (it is unboxed)
     * @param maxConnectPerRoute maximum connections per route; must not be null (it is unboxed)
     * @return the shared factory instance; call {@link #init()} before using its clients
     */
    public static ESClientSpringFactory build(HttpHost httpHost,
                                              Integer maxConnectNum, Integer maxConnectPerRoute){
        HTTP_HOST = httpHost;
        MAX_CONN_TOTAL = maxConnectNum;
        MAX_CONN_PER_ROUTE = maxConnectPerRoute;
        return  esClientSpringFactory;
    }

    /**
     * Stores the host, the three timeouts (milliseconds) and the connection limits.
     *
     * @param httpHost              Elasticsearch node to connect to
     * @param connectTimeOut        connect timeout in milliseconds
     * @param socketTimeOut         socket timeout in milliseconds
     * @param connectionRequestTime connection-request timeout in milliseconds
     * @param maxConnectNum         maximum total connections
     * @param maxConnectPerRoute    maximum connections per route
     * @return the shared factory instance; call {@link #init()} before using its clients
     */
    public static ESClientSpringFactory build(HttpHost httpHost,Integer connectTimeOut, Integer socketTimeOut,
                                              Integer connectionRequestTime,Integer maxConnectNum, Integer maxConnectPerRoute){
        HTTP_HOST = httpHost;
        CONNECT_TIMEOUT_MILLIS = connectTimeOut;
        SOCKET_TIMEOUT_MILLIS = socketTimeOut;
        CONNECTION_REQUEST_TIMEOUT_MILLIS = connectionRequestTime;
        MAX_CONN_TOTAL = maxConnectNum;
        MAX_CONN_PER_ROUTE = maxConnectPerRoute;
        return  esClientSpringFactory;
    }


    /**
     * Creates the clients from the stored settings.
     *
     * <p>Only one underlying client is created: the high-level client is built from the
     * builder, and {@link #getClient()} exposes that client's own low-level client. Calling
     * {@code builder.build()} in addition would create a second, independent low-level client,
     * and {@link #close()} could then release only one of the two.
     */
    public void init(){
        builder = RestClient.builder(HTTP_HOST);
        setConnectTimeOutConfig();
        setMutiConnectConfig();
        restHighLevelClient = new RestHighLevelClient(builder);
        restClient = restHighLevelClient.getLowLevelClient();
        System.out.println("init factory");
    }

    /** Applies the connect, socket and connection-request timeouts to the builder. */
    public void setConnectTimeOutConfig(){
        builder.setRequestConfigCallback(requestConfigBuilder -> {
            requestConfigBuilder.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            requestConfigBuilder.setSocketTimeout(SOCKET_TIMEOUT_MILLIS);
            requestConfigBuilder.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MILLIS);
            return requestConfigBuilder;
        });
    }

    /** Applies the total and per-route connection limits of the async HTTP client. */
    public void setMutiConnectConfig(){
        builder.setHttpClientConfigCallback(httpClientBuilder -> {
            httpClientBuilder.setMaxConnTotal(MAX_CONN_TOTAL);
            httpClientBuilder.setMaxConnPerRoute(MAX_CONN_PER_ROUTE);
            return httpClientBuilder;
        });
    }

    /** @return the low-level client, or {@code null} if {@link #init()} has not been called */
    public RestClient getClient(){
        return restClient;
    }

    /** @return the high-level client, or {@code null} if {@link #init()} has not been called */
    public RestHighLevelClient getRhlClient(){
        return restHighLevelClient;
    }

    /**
     * Closes the high-level client, which also closes the low-level client it wraps.
     * An {@link IOException} during close is printed and not rethrown; nothing is closed
     * if {@link #init()} was never called.
     */
    public void close() {
        if (restHighLevelClient != null) {
            try {
                restHighLevelClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println("close client");
    }
}


ESConfig:

import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Scope;


/**
 * Spring configuration that exposes the Elasticsearch host, the client factory and both
 * REST clients as beans. Connection settings come from the {@code elasticSearch.*} properties.
 *
 * <p>The bean method names are left untouched because they serve as the bean names.
 */
@Slf4j
@Configuration
@ComponentScan(basePackageClasses = ESClientSpringFactory.class)
public class ESConfig {

    /** Host name or IP of the Elasticsearch node. */
    @Value("${elasticSearch.host}")
    private String esHost;

    /** HTTP port of the Elasticsearch node. */
    @Value("${elasticSearch.port}")
    private int esPort;

    /** Passed to the factory as the maximum total number of connections. */
    @Value("${elasticSearch.client.connectNum}")
    private Integer maxTotalConnections;

    /** Passed to the factory as the maximum number of connections per route. */
    @Value("${elasticSearch.client.connectPerRoute}")
    private Integer maxConnectionsPerRoute;

    /** @return the Elasticsearch endpoint, always addressed over plain {@code http} */
    @Bean
    public HttpHost httpHost() {
        final String scheme = "http";
        return new HttpHost(esHost, esPort, scheme);
    }

    /**
     * @return the client factory; its {@code init} method is registered as the bean's
     *         init method and {@code close} as its destroy method
     */
    @Bean(initMethod = "init", destroyMethod = "close")
    public ESClientSpringFactory getFactory() {
        HttpHost target = httpHost();
        return ESClientSpringFactory.build(target, maxTotalConnections, maxConnectionsPerRoute);
    }

    /** @return the low-level REST client held by the factory */
    @Bean
    @Scope("singleton")
    public RestClient getRestClient() {
        ESClientSpringFactory factory = getFactory();
        return factory.getClient();
    }

    /** @return the high-level REST client held by the factory */
    @Bean
    @Scope("singleton")
    public RestHighLevelClient getRHLClient() {
        ESClientSpringFactory factory = getFactory();
        return factory.getRhlClient();
    }

}

配置文件中的配置數據(properties格式):

# Elasticsearch node address read by ESConfig; the value below is a placeholder for the real host
elasticSearch.host=ip地址
# HTTP port of the Elasticsearch node
elasticSearch.port=9200
# Passed to ESClientSpringFactory.build as maxConnectNum (maximum total connections)
elasticSearch.client.connectNum=10
# Passed to ESClientSpringFactory.build as maxConnectPerRoute (maximum connections per route)
# NOTE(review): 50 per route is larger than the total of 10 above - confirm the intended values
elasticSearch.client.connectPerRoute=50

編寫一個測試:

    // Injected by bean name: "getRHLClient" is the name of the ESConfig @Bean method
    // that returns the RestHighLevelClient.
    @Autowired
    @Qualifier("getRHLClient")
    RestHighLevelClient restHighLevelClient;

	@Test
    public void getSuggest() throws Exception {
        // Smoke test: asks the completion suggester on the "suggest" field of the
        // jd_product index for completions of a fixed prefix and prints the result.
        // It asserts nothing; it only checks that the round trip works.
        String data = "手機";

        // Fuzzy-matching options applied to the prefix lookup.
        FuzzyOptions fuzzy = FuzzyOptions.builder()
                .setFuzzyPrefixLength(1)
                .setFuzziness(0)
                .setFuzzyMinLength(3)
                .build();

        // Completion suggestion on the "suggest" field, registered under the name "my-suggest".
        CompletionSuggestionBuilder completionSuggestion =
                SuggestBuilders.completionSuggestion("suggest").prefix(data, fuzzy);
        SuggestBuilder suggestBuilder = new SuggestBuilder();
        suggestBuilder.addSuggestion("my-suggest", completionSuggestion);

        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        searchSourceBuilder.suggest(suggestBuilder);
        // Return only the listed _source fields.
        String[] includes = {"shop_name", "title"};
        String[] excludes = {};
        searchSourceBuilder.fetchSource(includes, excludes);

        SearchRequest searchRequest = new SearchRequest("jd_product"); // target index
        searchRequest.source(searchSourceBuilder);
        SearchResponse response = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

        // The suggest section of the response carries the completion results.
        Suggest suggestions = response.getSuggest();
        System.out.println("suggestions = " + suggestions);
    }

獲取到返回的數據:

image-20200404222003439

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章