elasticsearch使用中文分詞器和拼音分詞器,自定義分詞器
1. 到github 下載分詞器
上面有已經編譯好打好的包。下載後在es安裝目錄下的plugins/目錄下創建ik和pinyin兩個文件夾,把下載好的zip包解壓在裏面。重啓es就會生效了。github上readme.txt文件裏有使用說明。注意下載的時候下載版本對應的,比如我的es版本是5.6.16,下載分詞器的時候也要下載這個版本的。
ik 中文分詞器:https://github.com/medcl/elasticsearch-analysis-ik/releases
pinyin 拼音分詞器:https://github.com/medcl/elasticsearch-analysis-pinyin/releases
也可以下載源碼後,用mvn手動打包,但是特別慢,我打了個拼音包兩個多小時,可能和沒翻牆也有關係。
2. 使用分詞器
解壓後重啓es就可以使用了。分詞器是屬於索引的,所以測試分詞器的時候,要指定是哪個索引。
ik_smart: 會做最粗粒度的拆分,比如會將“中華人民共和國國歌”拆分爲“中華人民共和國,國歌”,適合 Phrase 查詢。
get http://localhost:9200/user_index/_analyze?analyzer=ik_smart&text=張三李四
返回
{
"tokens": [
{
"token": "張三李四",
"start_offset": 0,
"end_offset": 4,
"type": "CN_WORD",
"position": 0
}
]
}
ik_max_word: 會將文本做最細粒度的拆分,比如會將“中華人民共和國國歌”拆分爲“中華人民共和國,中華人民,中華,華人,人民共和國,人民,人,民,共和國,共和,和,國國,國歌”,會窮盡各種可能的組合,適合 Term Query;
get http://localhost:9200/user_index/_analyze?analyzer=ik_max_word&text=張三李四
返回
{
"tokens": [
{
"token": "張三李四",
"start_offset": 0,
"end_offset": 4,
"type": "CN_WORD",
"position": 0
},
{
"token": "張三",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 1
},
{
"token": "三",
"start_offset": 1,
"end_offset": 2,
"type": "TYPE_CNUM",
"position": 2
},
{
"token": "李四",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 3
},
{
"token": "四",
"start_offset": 3,
"end_offset": 4,
"type": "TYPE_CNUM",
"position": 4
}
]
}
get http://localhost:9200/user_index/_analyze?analyzer=pinyin&text=張三李四
返回
{
"tokens": [
{
"token": "zhang",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "zsls",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0
},
{
"token": "san",
"start_offset": 1,
"end_offset": 2,
"type": "word",
"position": 1
},
{
"token": "li",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 2
},
{
"token": "si",
"start_offset": 3,
"end_offset": 4,
"type": "word",
"position": 3
}
]
}
3. 自定義分詞器,ik+pinyin組合使用
ik中文分詞器,貌似沒有可以設置的屬性,直接用就行了。
拼音分詞器有許多可以設置的選項。可以自行定義。原本的拼音分詞器,只能分析出來全拼、首字母全拼、和每個字的完整拼音,不過這個每個字的完整拼音我覺得沒什麼作用,太細微。我想實現的功能是,可以讓中文分詞器分詞後的字詞,再被拼音分詞器分詞,就可以用下面的方式,tokenizer 使用 中文分詞器ik_max_word,最後的標記過濾器,再使用pinyin 分詞器過濾一遍就可以了。
{
"index": {
"number_of_replicas" : "0",
"number_of_shards" : "1",
"analysis": {
"analyzer": {
"ik_pinyin_analyzer": {
"tokenizer": "my_ik_pinyin",
"filter": "pinyin_first_letter_and_full_pinyin_filter"
},
"pinyin_analyzer": {
"tokenizer": "my_pinyin"
}
},
"tokenizer": {
"my_ik_pinyin": {
"type": "ik_max_word"
},
"my_pinyin": {
"type": "pinyin",
"keep_first_letter": true,
"keep_separate_first_letter": false,
"keep_full_pinyin": false,
"keep_joined_full_pinyin": true,
"keep_none_chinese": true,
"none_chinese_pinyin_tokenize": false,
"keep_none_chinese_in_joined_full_pinyin": true,
"keep_original": false,
"limit_first_letter_length": 16,
"lowercase": true,
"trim_whitespace": true,
"remove_duplicated_term": true
}
},
"filter": {
"pinyin_first_letter_and_full_pinyin_filter": {
"type": "pinyin",
"keep_first_letter": true,
"keep_separate_first_letter": false,
"keep_full_pinyin": false,
"keep_joined_full_pinyin": true,
"keep_none_chinese": true,
"none_chinese_pinyin_tokenize": false,
"keep_none_chinese_in_joined_full_pinyin": true,
"keep_original": false,
"limit_first_letter_length": 16,
"lowercase": true,
"trim_whitespace": true,
"remove_duplicated_term": true
}
}
}
}
}
我們測試一下
http://localhost:9200/drug_index/_analyze?analyzer=ik_pinyin_analyzer&text=阿莫西林膠囊
返回的結果就是漢字ik_max_word分詞後的結果,再按照拼音分詞的規則做了分析。
{
"tokens": [
{
"token": "amoxilin",
"start_offset": 0,
"end_offset": 4,
"type": "CN_WORD",
"position": 0
},
{
"token": "amxl",
"start_offset": 0,
"end_offset": 4,
"type": "CN_WORD",
"position": 0
},
{
"token": "moxi",
"start_offset": 1,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
},
{
"token": "mx",
"start_offset": 1,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
},
{
"token": "xilin",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 2
},
{
"token": "xl",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 2
},
{
"token": "jiaonang",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 3
},
{
"token": "jn",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 3
}
]
}
4. 代碼測試
package com.boot.es.model;
import lombok.Data;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;
import org.springframework.data.elasticsearch.annotations.InnerField;
import org.springframework.data.elasticsearch.annotations.MultiField;
import org.springframework.data.elasticsearch.annotations.Setting;
/**
 * Elasticsearch document mapped to index "drug_index" (type "drug").
 * Index settings (the custom ik/pinyin analyzers referenced below) are loaded
 * from settings.json on the classpath via @Setting.
 *
 * Author: susq
 * Date: 2019-06-30 10:12
 */
@Data
@Document(indexName = "drug_index", type = "drug")
@Setting(settingPath = "settings.json")
public class Drug {
// Document id.
@Id
private Long id;
// Price stored as a non-analyzed keyword (exact match only).
@Field(type = FieldType.Keyword)
private String price;
// Multi-field: the main "name" field is a raw keyword (exact match); the
// suffixed sub-fields name.ik, name.ik_pinyin and name.pinyin are analyzed
// variants used by the search tests below.
@MultiField(
mainField = @Field(type = FieldType.Keyword),
otherFields = {
@InnerField(type = FieldType.Text, suffix = "ik", analyzer = "ik_max_word", searchAnalyzer = "ik_max_word"),
@InnerField(type = FieldType.Text, suffix = "ik_pinyin", analyzer = "ik_pinyin_analyzer", searchAnalyzer = "ik_pinyin_analyzer"),
@InnerField(type = FieldType.Text, suffix = "pinyin", analyzer = "pinyin_analyzer", searchAnalyzer = "pinyin_analyzer")
}
)
private String name;
// Same multi-field layout as "name"; note the ik sub-field indexes with
// ik_max_word (fine-grained) but searches with ik_smart (coarse-grained).
@MultiField(
mainField = @Field(type = FieldType.Keyword),
otherFields = {
@InnerField(type = FieldType.Text, suffix = "ik", analyzer = "ik_max_word", searchAnalyzer = "ik_smart"),
@InnerField(type = FieldType.Text, suffix = "ik_pinyin", analyzer = "ik_pinyin_analyzer", searchAnalyzer = "ik_pinyin_analyzer"),
@InnerField(type = FieldType.Text, suffix = "pinyin", analyzer = "pinyin_analyzer", searchAnalyzer = "pinyin_analyzer")
}
)
private String effect;
}
/**
 * Indexes three sample drugs in one batch, reads every document back through
 * the repository, and logs the result.
 */
@Test
public void drugSaveTest() {
    Drug drug = new Drug();
    drug.setId(1L);
    drug.setName("阿莫西林膠囊");
    drug.setPrice("10");
    drug.setEffect("阿莫西林適用於敏感菌(不產β內酰胺酶菌株)所致的感染");

    Drug drug1 = new Drug();
    drug1.setId(3L);
    drug1.setName("阿莫西林");
    drug1.setPrice("10");
    drug1.setEffect("阿莫西林適用於敏感菌(不產β內酰胺酶菌株)所致的感染");

    Drug drug2 = new Drug();
    drug2.setId(2L);
    drug2.setName("999感冒靈顆粒");
    drug2.setPrice("20");
    drug2.setEffect("本品解熱鎮痛。用於感冒引起的頭痛,發熱,鼻塞,流涕,咽痛等");

    // Batch save, then reload everything to confirm the documents were indexed.
    drugRepository.saveAll(Lists.newArrayList(drug, drug1, drug2));
    List<Drug> drugs = Lists.newArrayList(drugRepository.findAll());
    // Fixed log-message typo: 「以保存」 -> 「已保存」 ("already saved").
    log.info("已保存的drugs: {}", drugs);
}
/**
 * Re-indexes the same three documents (same ids) to verify that saving an
 * existing id overwrites the document instead of failing.
 *
 * NOTE(review): this method was an exact duplicate of drugSaveTest — two
 * methods with the same signature in one class do not compile — so it has
 * been renamed to a distinct test.
 */
@Test
public void drugResaveTest() {
    Drug drug = new Drug();
    drug.setId(1L);
    drug.setName("阿莫西林膠囊");
    drug.setPrice("10");
    drug.setEffect("阿莫西林適用於敏感菌(不產β內酰胺酶菌株)所致的感染");

    Drug drug1 = new Drug();
    drug1.setId(3L);
    drug1.setName("阿莫西林");
    drug1.setPrice("10");
    drug1.setEffect("阿莫西林適用於敏感菌(不產β內酰胺酶菌株)所致的感染");

    Drug drug2 = new Drug();
    drug2.setId(2L);
    drug2.setName("999感冒靈顆粒");
    drug2.setPrice("20");
    drug2.setEffect("本品解熱鎮痛。用於感冒引起的頭痛,發熱,鼻塞,流涕,咽痛等");

    // Saving documents with existing ids upserts them; the reload should
    // still contain exactly these three drugs.
    drugRepository.saveAll(Lists.newArrayList(drug, drug1, drug2));
    List<Drug> drugs = Lists.newArrayList(drugRepository.findAll());
    log.info("已保存的drugs: {}", drugs);
}
/**
 * Searches on "name": the un-suffixed field is a Keyword, so a hit there is
 * an exact match and should score higher — it gets twice the boost of the
 * ik-analyzed sub-field match.
 *
 * NOTE(review): the original chained boost() on the BoolQueryBuilder itself,
 * which boosted the whole bool query (and the second call overwrote the
 * first). The boost belongs on each individual match query, exactly as
 * drugPinyinSearchTest already does.
 */
@Test
public void drugIkSearchTest() {
    NativeSearchQueryBuilder builder = new NativeSearchQueryBuilder();
    NativeSearchQuery query = builder.withQuery(QueryBuilders.boolQuery()
            .should(QueryBuilders.matchQuery("name", "阿莫西林").boost(2))
            .should(QueryBuilders.matchQuery("name.ik", "阿莫西林").boost(1)))
            .build();
    log.info("DSL:{}", query.getQuery().toString());
    Iterable<Drug> iterable = drugRepository.search(query);
    List<Drug> drugs = Lists.newArrayList(iterable);
    log.info("result: {}", drugs);
}
/**
 * Pinyin search on "name": the name.pinyin sub-field only produces the joined
 * full pinyin of the whole name plus the first-letter string, so a hit there
 * is effectively a whole-name match and gets double the boost of the
 * finer-grained name.ik_pinyin sub-field. Results are sorted by score.
 */
@Test
public void drugPinyinSearchTest() {
    String keyword = "阿莫西林";
    NativeSearchQueryBuilder searchBuilder = new NativeSearchQueryBuilder();
    NativeSearchQuery query = searchBuilder
            .withQuery(QueryBuilders.boolQuery()
                    .should(QueryBuilders.matchQuery("name.ik_pinyin", keyword).boost(1))
                    .should(QueryBuilders.matchQuery("name.pinyin", keyword).boost(2)))
            .withSort(SortBuilders.scoreSort())
            .build();
    log.info("DSL:{}", query.getQuery().toString());
    List<Drug> drugs = Lists.newArrayList(drugRepository.search(query));
    log.info("result: {}", drugs);
}