elasticsearch自定义分词器---拼音分词器

拼音分词器

之前详细介绍了IK中文分词器,本节详细介绍拼音分词器。拼音分词是将中文转化为拼音,并提供可配置项供用户自定义拼音检索方式,如拼音首字母、全拼等。

下载地址

https://github.com/medcl/elasticsearch-analysis-pinyin

安装方式

1.在github页面找到releases页签,通过上下页找到自己ES的版本对应的发布包,以6.2.2为例,下载zip包。

2.上传到ES安装目录的plugins文件夹,解压并重命名解压后的文件夹为 pinyin(这步很重要)

详细步骤可以参考IK中文分词器

3.重启ES,如果重启成功则代表插件安装成功

插件说明

该插件内置了analyzer: pinyin , tokenizer: pinyin, token-filter: pinyin

GET /ik-pinyin/_analyze
{
  "text": ["中华人民共和国人民大会堂"],
  "analyzer": "pinyin"
}

{
  "tokens": [
    {
      "token": "zhong",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 0
    },
    {
      "token": "zhrmghgrmdht",  #所有文字拼音的首字母
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 0
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 1
    },
    {
      "token": "ren",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 2
    },
    {
      "token": "min",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 3
    },
    {
      "token": "gong",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 4
    },
    {
      "token": "he",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 5
    },
    {
      "token": "guo",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 6
    },
    {
      "token": "ren",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 7
    },
    {
      "token": "min",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 8
    },
    {
      "token": "da",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 9
    },
    {
      "token": "hui",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 10
    },
    {
      "token": "tang",
      "start_offset": 0,
      "end_offset": 0,
      "type": "word",
      "position": 11
    }
  ]
}

 

配置index

1.创建index

PUT /ik-pinyin/
 
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "ik-pinyin"
}

2.设置mappings,这里设置要做分词检索的字段,以content为例

PUT /ik-pinyin/DOC/_mapping
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "pinyin"
    }
  }
}

{
  "acknowledged": true
}

3.测试

#写数据
POST /ik-pinyin/DOC/
{
  "content":"人民共和国人民大会堂"
}

POST /ik-pinyin/DOC/
{
  "content":"中华人民共和国人民大会堂"
}

POST /ik-pinyin/DOC/
{
  "content":"人民大会堂"
}

#查看数据
POST /ik-pinyin/DOC/_search


{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 1,
    "hits": [
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "d1--23IBeaMPz9g6CzKs",
        "_score": 1,
        "_source": {
          "content": "中华人民共和国人民大会堂"
        }
      },
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "dl-923IBeaMPz9g6LzIu",
        "_score": 1,
        "_source": {
          "content": "人民共和国人民大会堂"
        }
      },
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "eF--23IBeaMPz9g6ODK2",
        "_score": 1,
        "_source": {
          "content": "人民大会堂"
        }
      }
    ]
  }
}

#使用中文检索数据,检索结果为空
#原因:content字段使用pinyin分词器,索引中保存的是拼音词条(如 ren、min),而term查询不会对检索词做分词,中文"人民"无法与任何词条匹配
POST /ik-pinyin/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "人民"
          }
        }
      ]
    }
  }
}

{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 0,
    "max_score": null,
    "hits": []    #未命中任何记录
  }
}

#用拼音 guo 检索
POST /ik-pinyin/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "guo"
          }
        }
      ]
    }
  }
}

{
  "took": 4,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 0.2987943,
    "hits": [
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "dl-923IBeaMPz9g6LzIu",
        "_score": 0.2987943,
        "_source": {
          "content": "人民共和国人民大会堂"
        }
      },
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "d1--23IBeaMPz9g6CzKs",
        "_score": 0.29702917,
        "_source": {
          "content": "中华人民共和国人民大会堂"
        }
      }
    ]
  }
}

#使用全部文字的拼音首字母 zhrmghgrmdht 检索,只命中内容完全对应的那一条记录
POST /ik-pinyin/DOC/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "content": "zhrmghgrmdht"
          }
        }
      ]
    }
  }
}

{
  "took": 9,
  "timed_out": false,
  "_shards": {
    "total": 3,
    "successful": 3,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.29702917,
    "hits": [
      {
        "_index": "ik-pinyin",
        "_type": "DOC",
        "_id": "d1--23IBeaMPz9g6CzKs",
        "_score": 0.29702917,
        "_source": {
          "content": "中华人民共和国人民大会堂"
        }
      }
    ]
  }
}

自定义配置

插件的GitHub页面(见上文下载地址)中给出了详细的配置项及其说明和使用方式,可据此自定义拼音分词的输出形式(如是否保留首字母、全拼等)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章