elasticsearch自定義分詞器--中文分詞器與拼音分詞器結合

之前已經分別單獨介紹了中文分詞器(IK)和拼音分詞器的使用方式,本節重點介紹將中文分詞器和拼音分詞器結合使用的方式,即針對同一個字段既可以用中文分詞檢索,也可以用拼音檢索。

廢話不多說,直接上配置

PUT /pinyin-ik-test/
{
  "index": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter":"my_filter"
        }
      },
      "filter":{
        "my_filter":{
          "type": "pinyin",
          "keep_separate_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": true,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "remove_duplicated_term": true
        }
      }
    }
  }
}

配置說明

analyzer 的詳細配置這裏不做介紹,重點使用三個配置項:type、tokenizer、filter

實現中文分詞和拼音結合的思路:通過 IK 分詞後得到的結果,再經過 pinyin 的 filter 處理,得到各個分詞的拼音,效果如下:

GET /pinyin-ik-test/_analyze
{
  "text": ["劉德華是著名的歌星、影星、慈善家、明星"],
  "analyzer": "my_analyzer"
}

{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "劉德華",
      "start_offset": 0,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 0,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "shi",
      "start_offset": 3,
      "end_offset": 4,
      "type": "CN_CHAR",
      "position": 3
    },
    {
      "token": "是",
      "start_offset": 3,
      "end_offset": 4,
      "type": "CN_CHAR",
      "position": 3
    },
    {
      "token": "s",
      "start_offset": 3,
      "end_offset": 4,
      "type": "CN_CHAR",
      "position": 3
    },
    {
      "token": "zhu",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 4
    },
    {
      "token": "ming",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "著名",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "zm",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "de",
      "start_offset": 6,
      "end_offset": 7,
      "type": "CN_CHAR",
      "position": 6
    },
    {
      "token": "的",
      "start_offset": 6,
      "end_offset": 7,
      "type": "CN_CHAR",
      "position": 6
    },
    {
      "token": "d",
      "start_offset": 6,
      "end_offset": 7,
      "type": "CN_CHAR",
      "position": 6
    },
    {
      "token": "ge",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "xing",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 8
    },
    {
      "token": "歌星",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 8
    },
    {
      "token": "gx",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 8
    },
    {
      "token": "ying",
      "start_offset": 10,
      "end_offset": 12,
      "type": "CN_WORD",
      "position": 9
    },
    {
      "token": "xing",
      "start_offset": 10,
      "end_offset": 12,
      "type": "CN_WORD",
      "position": 10
    },
    {
      "token": "影星",
      "start_offset": 10,
      "end_offset": 12,
      "type": "CN_WORD",
      "position": 10
    },
    {
      "token": "yx",
      "start_offset": 10,
      "end_offset": 12,
      "type": "CN_WORD",
      "position": 10
    },
    {
      "token": "ci",
      "start_offset": 13,
      "end_offset": 16,
      "type": "CN_WORD",
      "position": 11
    },
    {
      "token": "shan",
      "start_offset": 13,
      "end_offset": 16,
      "type": "CN_WORD",
      "position": 12
    },
    {
      "token": "jia",
      "start_offset": 13,
      "end_offset": 16,
      "type": "CN_WORD",
      "position": 13
    },
    {
      "token": "慈善家",
      "start_offset": 13,
      "end_offset": 16,
      "type": "CN_WORD",
      "position": 13
    },
    {
      "token": "csj",
      "start_offset": 13,
      "end_offset": 16,
      "type": "CN_WORD",
      "position": 13
    },
    {
      "token": "ci",
      "start_offset": 13,
      "end_offset": 15,
      "type": "CN_WORD",
      "position": 14
    },
    {
      "token": "shan",
      "start_offset": 13,
      "end_offset": 15,
      "type": "CN_WORD",
      "position": 15
    },
    {
      "token": "慈善",
      "start_offset": 13,
      "end_offset": 15,
      "type": "CN_WORD",
      "position": 15
    },
    {
      "token": "cs",
      "start_offset": 13,
      "end_offset": 15,
      "type": "CN_WORD",
      "position": 15
    },
    {
      "token": "jia",
      "start_offset": 15,
      "end_offset": 16,
      "type": "CN_CHAR",
      "position": 16
    },
    {
      "token": "家",
      "start_offset": 15,
      "end_offset": 16,
      "type": "CN_CHAR",
      "position": 16
    },
    {
      "token": "j",
      "start_offset": 15,
      "end_offset": 16,
      "type": "CN_CHAR",
      "position": 16
    },
    {
      "token": "ming",
      "start_offset": 17,
      "end_offset": 19,
      "type": "CN_WORD",
      "position": 17
    },
    {
      "token": "xing",
      "start_offset": 17,
      "end_offset": 19,
      "type": "CN_WORD",
      "position": 18
    },
    {
      "token": "明星",
      "start_offset": 17,
      "end_offset": 19,
      "type": "CN_WORD",
      "position": 18
    },
    {
      "token": "mx",
      "start_offset": 17,
      "end_offset": 19,
      "type": "CN_WORD",
      "position": 18
    }
  ]
}

配置 index 的 mapping(content 字段使用 my_analyzer)

POST /pinyin-ik-test/DOC/_mapping
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "my_analyzer"
    }
  }
}

效果測試

#寫入數據
POST /pinyin-ik-test/DOC/
{
  "content":"劉德華是著名的明星"
}

POST /pinyin-ik-test/DOC/
{
  "content":"劉德華是著名的影星"
}

POST /pinyin-ik-test/DOC/
{
  "content":"劉德華是著名的慈善家"
}

POST /pinyin-ik-test/DOC/
{
  "content":"劉德華是著名的歌星"
}

POST /pinyin-ik-test/DOC/
{
  "content":"劉德華是著名的人"
}

#用中文 明星 檢索
POST /pinyin-ik-test/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "明星"
          }
        }
      ]
    }
  }
}

{
  "took": 10,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.36661926,
    "hits": [
      {
        "_index": "pinyin-ik-test",
        "_type": "DOC",
        "_id": "gl_u23IBeaMPz9g6tTIh",
        "_score": 0.36661926,
        "_source": {
          "content": "劉德華是著名的明星"
        }
      }
    ]
  }
}

#用拼音 mingxing 檢索,無任何結果:term 查詢不會對檢索詞再做分詞,而索引中「明星」只產生了 ming、xing、明星、mx 這幾個 token,並沒有 mingxing。原因是沒有把 keep_joined_full_pinyin 選項設置爲 true(默認是 false)

POST /pinyin-ik-test/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "mingxing"
          }
        }
      ]
    }
  }
}

{
  "took": 11,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 0,
    "max_score": null,
    "hits": []
  }
}


#用拼音 mx 檢索
POST /pinyin-ik-test/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "mx"
          }
        }
      ]
    }
  }
}

{
  "took": 5,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.36661926,
    "hits": [
      {
        "_index": "pinyin-ik-test",
        "_type": "DOC",
        "_id": "gl_u23IBeaMPz9g6tTIh",
        "_score": 0.36661926,
        "_source": {
          "content": "劉德華是著名的明星"
        }
      }
    ]
  }
}

修改配置

將keep_joined_full_pinyin 設置爲true,再驗證一下用 mingxing 拼音檢索

#新建 index pinyin-ik-test-3(不能在原有 index 基礎上修改),並在 my_filter 中將 keep_joined_full_pinyin 設置爲 true

PUT /pinyin-ik-test-3/
{
  "index": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter":"my_filter"
        }
      },
      "filter":{
        "my_filter":{
          "type": "pinyin",
          "keep_separate_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": true,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "remove_duplicated_term": true,
          "keep_joined_full_pinyin": true
        }
      }
    }
  }
}

#配置content字段
POST /pinyin-ik-test-3/DOC/_mapping
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "my_analyzer"
    }
  }
}

#寫入數據
POST /pinyin-ik-test-3/DOC/
{
  "content":"劉德華是著名的明星"
}

#測試
POST /pinyin-ik-test-3/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "mingxing"
          }
        }
      ]
    }
  }
}

{
  "took": 9,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.37939543,
    "hits": [
      {
        "_index": "pinyin-ik-test-3",
        "_type": "DOC",
        "_id": "hl_823IBeaMPz9g63jJ1",
        "_score": 0.37939543,
        "_source": {
          "content": "劉德華是著名的明星"
        }
      }
    ]
  }
}


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章