elasticsearch自定義分詞器---拼音分詞器

拼音分詞器

之前詳細介紹了IK中文分詞器,本節詳細介紹拼音分詞器。拼音分詞是將中文轉化爲拼音,並提供可配置項供用戶自定義拼音檢索方式,如拼音首字母、全拼等。

下載地址

https://github.com/medcl/elasticsearch-analysis-pinyin

安裝方式

1.在GitHub頁面找到releases頁籤,通過上下頁找到與自己ES版本對應的發佈包,以6.2.2爲例,下載zip包。

2.上傳到ES安裝目錄的plugins文件夾,解壓並重命名解壓後的文件夾爲 pinyin(這步很重要)

詳細步驟可以參考IK中文分詞器

3.重啓es,如果重啓成功代表插件安裝成功

插件說明

該插件內置了 analyzer: pinyin、tokenizer: pinyin、token-filter: pinyin。下面用 _analyze 接口查看 pinyin analyzer 的分詞效果(示例中用到的 ik-pinyin 索引需先按後文「配置index」一節創建):

GET /ik-pinyin/_analyze
{
  "text": ["中華人民共和國人民大會堂"],
  "analyzer": "pinyin"
}

{
  "tokens": [
    {
      "token": "zhong",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 0
    },
    {
      "token": "zhrmghgrmdht",  #所有文字拼音的首字母
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 0
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 1
    },
    {
      "token": "ren",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 2
    },
    {
      "token": "min",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 3
    },
    {
      "token": "gong",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 4
    },
    {
      "token": "he",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 5
    },
    {
      "token": "guo",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 6
    },
    {
      "token": "ren",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 7
    },
    {
      "token": "min",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 8
    },
    {
      "token": "da",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 9
    },
    {
      "token": "hui",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 10
    },
    {
      "token": "tang",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 11
    }
  ]
}

 

配置index

1.創建index

PUT /ik-pinyin/
 
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "ik-pinyin"
}

2.設置mappings,這裏設置要做分詞檢索的字段,以content爲例

PUT /ik-pinyin/DOC/_mapping
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "pinyin"
    }
  }
}

{
  "acknowledged": true
}

3.測試

#寫數據
POST /ik-pinyin/DOC/
{
  "content":"人民共和國人民大會堂"
}

POST /ik-pinyin/DOC/
{
  "content":"中華人民共和國人民大會堂"
}

POST /ik-pinyin/DOC/
{
  "content":"人民大會堂"
}

#查看數據
POST /ik-pinyin/DOC/_search


{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 1,
    "hits": [
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "d1--23IBeaMPz9g6CzKs",
        "_score": 1,
        "_source": {
          "content": "中華人民共和國人民大會堂"
        }
      },
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "dl-923IBeaMPz9g6LzIu",
        "_score": 1,
        "_source": {
          "content": "人民共和國人民大會堂"
        }
      },
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "eF--23IBeaMPz9g6ODK2",
        "_score": 1,
        "_source": {
          "content": "人民大會堂"
        }
      }
    ]
  }
}

#使用中文檢索數據,檢索結果爲空(pinyin分詞器寫入索引的只有拼音詞條,而term查詢不會對檢索詞做分詞,所以中文詞「人民」匹配不到任何詞條)
POST /ik-pinyin/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "人民"
          }
        }
      ]
    }
  }
}

{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 0,
    "max_score": null,
    "hits": []    #未命中任何記錄
  }
}

#用拼音 guo 檢索
POST /ik-pinyin/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "guo"
          }
        }
      ]
    }
  }
}

{
  "took": 4,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0.2987943,
    "hits": [
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "dl-923IBeaMPz9g6LzIu",
        "_score": 0.2987943,
        "_source": {
          "content": "人民共和國人民大會堂"
        }
      },
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "d1--23IBeaMPz9g6CzKs",
        "_score": 0.29702917,
        "_source": {
          "content": "中華人民共和國人民大會堂"
        }
      }
    ]
  }
}

#使用zhrmghgrmdht檢索
POST /ik-pinyin/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "zhrmghgrmdht"
          }
        }
      ]
    }
  }
}

{
  "took": 9,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.29702917,
    "hits": [
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "d1--23IBeaMPz9g6CzKs",
        "_score": 0.29702917,
        "_source": {
          "content": "中華人民共和國人民大會堂"
        }
      }
    ]
  }
}

自定義配置

插件的GitHub項目頁面(見上文下載地址)給出了詳細的配置項及其說明和使用方式。

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章