之前已經分別單獨介紹了中文分詞器(IK)和拼音分詞器的使用方式,本節重點介紹將中文分詞器和拼音分詞器結合使用的方式,即針對同一個字段既可以用中文分詞檢索,也可以用拼音檢索。
廢話不多說,直接上配置
PUT /pinyin-ik-test/
{
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "ik_max_word",
"filter":"my_filter"
}
},
"filter":{
"my_filter":{
"type": "pinyin",
"keep_separate_first_letter": false,
"keep_full_pinyin": true,
"keep_original": true,
"limit_first_letter_length": 16,
"lowercase": true,
"remove_duplicated_term": true
}
}
}
}
}
配置說明
analyzer的詳細配置這裏不做介紹,重點使用的三個配置項:type、tokenizer、filter
實現中文分詞和拼音結合的思路:通過IK分詞後得到的結果再經過 pinyin 的 filter 處理得到分詞的拼音,效果如下:
GET /pinyin-ik-test/_analyze
{
"text": ["劉德華是著名的歌星、影星、慈善家、明星"],
"analyzer": "my_analyzer"
}
{
"tokens": [
{
"token": "liu",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "劉德華",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "ldh",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "de",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
},
{
"token": "hua",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 2
},
{
"token": "shi",
"start_offset": 3,
"end_offset": 4,
"type": "CN_CHAR",
"position": 3
},
{
"token": "是",
"start_offset": 3,
"end_offset": 4,
"type": "CN_CHAR",
"position": 3
},
{
"token": "s",
"start_offset": 3,
"end_offset": 4,
"type": "CN_CHAR",
"position": 3
},
{
"token": "zhu",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 4
},
{
"token": "ming",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 5
},
{
"token": "著名",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 5
},
{
"token": "zm",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 5
},
{
"token": "de",
"start_offset": 6,
"end_offset": 7,
"type": "CN_CHAR",
"position": 6
},
{
"token": "的",
"start_offset": 6,
"end_offset": 7,
"type": "CN_CHAR",
"position": 6
},
{
"token": "d",
"start_offset": 6,
"end_offset": 7,
"type": "CN_CHAR",
"position": 6
},
{
"token": "ge",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 7
},
{
"token": "xing",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 8
},
{
"token": "歌星",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 8
},
{
"token": "gx",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 8
},
{
"token": "ying",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 9
},
{
"token": "xing",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 10
},
{
"token": "影星",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 10
},
{
"token": "yx",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 10
},
{
"token": "ci",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 11
},
{
"token": "shan",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 12
},
{
"token": "jia",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 13
},
{
"token": "慈善家",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 13
},
{
"token": "csj",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 13
},
{
"token": "ci",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 14
},
{
"token": "shan",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 15
},
{
"token": "慈善",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 15
},
{
"token": "cs",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 15
},
{
"token": "jia",
"start_offset": 15,
"end_offset": 16,
"type": "CN_CHAR",
"position": 16
},
{
"token": "家",
"start_offset": 15,
"end_offset": 16,
"type": "CN_CHAR",
"position": 16
},
{
"token": "j",
"start_offset": 15,
"end_offset": 16,
"type": "CN_CHAR",
"position": 16
},
{
"token": "ming",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 17
},
{
"token": "xing",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 18
},
{
"token": "明星",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 18
},
{
"token": "mx",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 18
}
]
}
配置index的mapping(爲 content 字段指定 my_analyzer)
POST /pinyin-ik-test/DOC/_mapping
{
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer"
}
}
}
效果測試
#寫入數據
POST /pinyin-ik-test/DOC/
{
"content":"劉德華是著名的歌星"
}
POST /pinyin-ik-test/DOC/
{
"content":"劉德華是著名的影星"
}
POST /pinyin-ik-test/DOC/
{
"content":"劉德華是著名的慈善家"
}
POST /pinyin-ik-test/DOC/
{
"content":"劉德華是著名的明星"
}
POST /pinyin-ik-test/DOC/
{
"content":"劉德華是著名的人"
}
#用中文 明星 檢索
POST /pinyin-ik-test/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "明星"
}
}
]
}
}
}
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.36661926,
"hits": [
{
"_index": "pinyin-ik-test",
"_type": "DOC",
"_id": "gl_u23IBeaMPz9g6tTIh",
"_score": 0.36661926,
"_source": {
"content": "劉德華是著名的明星"
}
}
]
}
}
#用拼音 mingxing 檢索,無任何結果:term 查詢不會對檢索詞再做分詞,而索引中只有 ming、xing、mx 等詞項,沒有連寫的 mingxing;原因是 keep_joined_full_pinyin 選項默認是false,沒有設置爲true
POST /pinyin-ik-test/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "mingxing"
}
}
]
}
}
}
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
#用拼音 mx 檢索
POST /pinyin-ik-test/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "mx"
}
}
]
}
}
}
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.36661926,
"hits": [
{
"_index": "pinyin-ik-test",
"_type": "DOC",
"_id": "gl_u23IBeaMPz9g6tTIh",
"_score": 0.36661926,
"_source": {
"content": "劉德華是著名的明星"
}
}
]
}
}
修改配置
將keep_joined_full_pinyin 設置爲true,再驗證一下用 mingxing 拼音檢索
#新建index pinyin-ik-test-3,不能在原有基礎上修改;與之前的配置相比,僅在 my_filter 中新增 keep_joined_full_pinyin 並設置爲true
PUT /pinyin-ik-test-3/
{
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "ik_max_word",
"filter":"my_filter"
}
},
"filter":{
"my_filter":{
"type": "pinyin",
"keep_separate_first_letter": false,
"keep_full_pinyin": true,
"keep_original": true,
"limit_first_letter_length": 16,
"lowercase": true,
"remove_duplicated_term": true,
"keep_joined_full_pinyin": true
}
}
}
}
}
#配置content字段
POST /pinyin-ik-test-3/DOC/_mapping
{
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer"
}
}
}
#寫入數據
POST /pinyin-ik-test-3/DOC/
{
"content":"劉德華是著名的明星"
}
#測試
POST /pinyin-ik-test-3/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "mingxing"
}
}
]
}
}
}
{
"took": 9,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.37939543,
"hits": [
{
"_index": "pinyin-ik-test-3",
"_type": "DOC",
"_id": "hl_823IBeaMPz9g63jJ1",
"_score": 0.37939543,
"_source": {
"content": "劉德華是著名的明星"
}
}
]
}
}