文章目錄
發現好久沒更新博客了,由於實習公司用到了ES,這兩天一直在研究ES,正好記錄一下學習的過程,方便以後查閱。後續可能會總結一下ES的原理,並對比分析一下Lucene。本節主要是對增刪改查等基本操作的記錄,以及通過python調用的實例。過年新開工,今年繼續加油,希望能夠斬獲秋招!
一、安裝及插件
二、基本概念
2.1 Index
下面的命令可以查看當前節點的所有 Index
curl -X GET 'http://localhost:9200/_cat/indices?v'
2.2 Type
下面的命令可以列出每個 Index 所包含的 Type。
$ curl 'localhost:9200/_mapping?pretty=true'
三、新建和刪除 Index
curl -X PUT 'localhost:9200/weather'
curl -X DELETE 'localhost:9200/weather'
四、中文分詞設置
$ curl -H "Content-Type: application/json" -X PUT 'localhost:9200/accounts' -d '
{
"mappings": {
"person": {
"properties": {
"user": {
"type": "text",
"analyzer": "smartcn",
"search_analyzer": "smartcn"
},
"title": {
"type": "text",
"analyzer": "smartcn",
"search_analyzer": "smartcn"
},
"desc": {
"type": "text",
"analyzer": "smartcn",
"search_analyzer": "smartcn"
}
}
}
}
}'
五、數據操作
5.1 新增記錄
$ curl -H "Content-Type: application/json" -X PUT 'localhost:9200/accounts/person/1' -d '
{
"user": "張三",
"title": "工程師",
"desc": "數據庫管理"
}'
$ curl -H "Content-Type: application/json" -X POST 'localhost:9200/accounts/person' -d '
{
"user": "李四",
"title": "工程師",
"desc": "系統管理"
}'
curl -X DELETE 'localhost:9200/accounts/person/1'
curl -H "Content-Type: application/json" -X PUT 'localhost:9200/accounts/person/1' -d '
{
"user" : "張三",
"title" : "工程師",
"desc" : "數據庫管理,軟件開發"
}'
六、數據查詢
6.1 返回所有記錄
使用 GET 方法,直接請求/Index/Type/_search,就會返回所有記錄。
curl 'localhost:9200/accounts/person/_search'
6.2 全文搜索
Elastic 的查詢非常特別,使用自己的查詢語法,要求 GET 請求帶有數據體。
$ curl -H "Content-Type: application/json" 'localhost:9200/accounts/person/_search' -d '
{
"query" : { "match" : { "desc" : "軟件" }},
"from": 1,
"size": 1
}'
上面代碼通過size字段指定每次只返回一條結果,還可以通過from字段指定位移(即從第幾條結果開始返回)。
6.3 邏輯運算
如果有多個搜索關鍵字, Elastic 認爲它們是or關係。
$ curl -H "Content-Type: application/json" 'localhost:9200/accounts/person/_search' -d '
{
"query" : { "match" : { "desc" : "軟件 系統" }}
}'
上面代碼搜索的是軟件 or 系統。
如果要執行多個關鍵詞的and搜索,必須使用布爾查詢。
$ curl -H "Content-Type: application/json" 'localhost:9200/accounts/person/_search' -d '
{
"query": {
"bool": {
"must": [
{ "match": { "desc": "軟件" } },
{ "match": { "desc": "系統" } }
]
}
}
}'
6.4 複雜查詢實例
- 查詢時間戳>某個時間並且shopId爲100000002和100000006的在SQL中是這樣的:
select * from shopsOrder where timestamp>1523671189000 and shopid in ("100000002","100000006")
- 在ES中就得這麼查(注意:gte 表示 >=,若要與上面 SQL 的 > 嚴格一致,應改用 gt):
POST:http://192.168.0.1:9200/shopsinfo/shopsOrder/_search
{
"size":20,
"query":{
"bool":{
"must":[
{
"range":{
"timestamp":{
"gte":1523671189000
}
}
},
{
"terms":{
"shopid":["100000002","100000006"]
}
}
]
}
}
}
- 統計的話ES是以aggs作爲參數,全稱應該叫做Aggregation,比如接着剛纔的查詢我想計算出結果的amount總額是多少就是類似SQL中的
select sum(amount)query_amount from shopsOrder where timestamp>1523671189000 and shopid in ("100000002","100000006")
- 在ES中就得這麼查
{
"aggs":{
"query_amount":{
"sum":{
"field":"amount"
}
}
},
"query":{
"bool":{
"must":[
{
"range":{
"timestamp":{
"gte":1523671189000
}
}
},
{
"terms":{
"shopid":["100000002","100000006"]
}
}
]
}
}
}
- 按天分組進行統計查詢,在SQL中的體現是這樣的:
select createdate,sum(amount)query_amount from shopsOrder where timestamp>1523671189000 and shopid in ("100000002","100000006")
group by createdate order by createdate
- 在ES中是這樣的:
{
"size":0,
"aggs":{
"orderDate":{
"terms":{
"field":"createdate",
"order":{
"_term":"asc"
}
},
"aggs":{
"query_amount":{
"sum":{
"field":"amount"
}
}
}
}
},
"query":{
"bool":{
"must":[
{
"range":{
"timestamp":{
"gte":1523671189000
}
}
},
{
"terms":{
"shopid":["100000002","100000006"]
}
}
]
}
}
}
- 查詢結果爲
......
"aggregations": {
"orderDate": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 99,
"buckets": [
......
{
"key": "20180415",
"doc_count": 8,
"query_amount": {
"value": 31632
}
},
{
"key": "20180417",
"doc_count": 3,
"query_amount": {
"value": 21401
}
},
{
"key": "20180418",
"doc_count": 2,
"query_amount": {
"value": 2333
}
}
......
]
}
}
- buckets中就是查詢的結果,key爲我按createdate分組後的值,doc_count類似count,query_amount爲sum後的值。至於我的參數裏面有一個size:0,是因爲我不需要具體的記錄(也就是hits),所以這裏傳0。
- 最後我們來個更複雜的:1、統計所有的總額;2、先按paymentType支付方式分組統計amount總額,並且每個支付方式中再按天分組統計每天的amount總額
{
"size":0,
"aggs":{
"amount":{
"sum":{
"field":"amount"
}
},
"paymenttype":{
"terms":{
"field":"paymentType"
},
"aggs":{
"query_amount":{
"sum":{
"field":"amount"
}
},
"payment_date":{
"terms":{
"field":"createdate"
},
"aggs":{
"query_amount":{
"sum":{
"field":"amount"
}
}
}
}
}
}
},
"query":{
"bool":{
"must":[
{
"range":{
"timestamp":{
"gte":1523671189000
}
}
},
{
"terms":{
"shopid":["100000002","100000006"]
}
}
]
}
}
}
- 查詢結果爲:
......
"amount": {
"value": 684854
},
"paymenttype":{
......
"buckets": [
{
"key": "wechatpay",
"doc_count": 73,
"amount": {
"value": 351142
},
"payment_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 25,
"buckets": [
......
{
"key": "20180415",
"doc_count": 6,
"amount": {
"value": 29032
}
},
{
"key": "20180425",
"doc_count": 6,
"amount": {
"value": 21592
}
}
......
]
}
},
{
"key": "alipay",
"doc_count": 67,
"amount": {
"value": 333712
},
"payment_date": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 23,
"buckets": [
......
{
"key": "20180506",
"doc_count": 8,
"amount": {
"value": 38280
}
},
{
"key": "20180426",
"doc_count": 6,
"amount": {
"value": 41052
}
}
......
]
}
}
]
}
七、python簡單調用,實現中文分詞檢索功能
# -*- coding:UTF-8 -*-
from elasticsearch import Elasticsearch
import pymysql
from utils.util_tools import *
from conf import tmp_dir
import os
from elasticsearch.helpers import bulk
class ElasticObj:
    """Convenience wrapper around an ``Elasticsearch`` client bound to one index/type.

    Every method operates on ``self.index_name`` / ``self.index_type`` unless
    noted otherwise. Most methods print the server response instead of
    returning it; only ``get_data_by_para`` returns a value.
    """

    def __init__(self, index_name, index_type, ip="127.0.0.1"):
        """Store the target index/type and create the client.

        :param index_name: name of the index
        :param index_type: document type, passed as ``doc_type`` / ``_type``
        :param ip: host handed to ``Elasticsearch([ip])``
        """
        self.index_name = index_name
        self.index_type = index_type
        # Unauthenticated client. For a cluster with user name / password use:
        # self.es = Elasticsearch([ip], http_auth=('elastic', 'password'), port=9200)
        self.es = Elasticsearch([ip])

    def create_index(self, index_mappings):
        """Create the index with ``index_mappings`` if it does not exist yet.

        :param index_mappings: request body handed to ``indices.create``
        """
        # Plain truthiness instead of ``is not True``: an identity check against
        # ``True`` misfires if the client returns a truthy non-bool response.
        if not self.es.indices.exists(index=self.index_name):
            res = self.es.indices.create(index=self.index_name, body=index_mappings, ignore=400)
            print(res)

    def delete_index(self):
        """Delete the whole index and print the response."""
        result = self.es.indices.delete(index=self.index_name)
        print(result)

    def bulk_index_data(self, in_list):
        """Store a batch of news-style records through the ``bulk`` helper.

        :param in_list: iterable of mappings providing the keys ``date``,
            ``source``, ``link``, ``keyword`` and ``title``
        """
        actions = [
            {
                "_index": self.index_name,
                "_type": self.index_type,
                # Explicit sequential ids starting at 1; omit "_id" to let
                # Elasticsearch generate one instead.
                "_id": doc_id,
                "_source": {
                    "date": line['date'],
                    "source": line['source'],
                    "link": line['link'],
                    "keyword": line['keyword'],
                    "title": line['title'],
                },
            }
            for doc_id, line in enumerate(in_list, start=1)
        ]
        # Send everything in one batch.
        success, _ = bulk(self.es, actions, index=self.index_name, raise_on_error=True)
        print('Performed %d actions' % success)

    def build_index_doc(self, in_file, body_data_keys, extra_val, sep=None):
        """Index one document per non-blank line of a UTF-8 text file.

        Each line is stripped and split on ``sep``; column ``n`` is stored
        under ``body_data_keys[n]``, then ``body_data_keys[-1]`` is set to
        ``extra_val`` (overwriting a column that mapped to that key).

        Any exception (unreadable file, more columns than keys, client
        error) is printed and ends the import; nothing is re-raised.

        :param in_file: path of the input file
        :param body_data_keys: field names, the last one receiving ``extra_val``
        :param extra_val: value stored under the last key for every document
        :param sep: separator for ``str.split``; ``None`` splits on whitespace
        """
        try:
            with open(in_file, encoding='utf-8') as fin:
                for line in fin:
                    stripped = line.strip()
                    if not stripped:
                        # A blank line would otherwise be indexed as a junk
                        # document carrying little more than ``extra_val``.
                        continue
                    row = stripped.split(sep)
                    body_data = {body_data_keys[idx]: col for idx, col in enumerate(row)}
                    body_data[body_data_keys[-1]] = extra_val
                    self.es.index(index=self.index_name, doc_type=self.index_type, body=body_data)
        except Exception as e:
            print(e)

    def build_index_db(self, body_data_keys):
        """Index every row of MySQL table ``sheet1`` as one document.

        Column ``n`` of each row is stored under ``body_data_keys[n]``.
        Errors while querying or indexing are printed, not raised; a failure
        to connect still propagates, as it happens before the ``try``.

        :param body_data_keys: field names, one per selected column
        """
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
        # NOTE(review): connection settings are hard-coded (root/root on localhost).
        db = pymysql.connect(host="localhost", user="root", password="root",
                             database="gongan", charset='utf8')
        try:
            with db.cursor() as cursor:
                cursor.execute("SELECT * FROM sheet1")
                for row in cursor.fetchall():
                    body_data = {body_data_keys[idx]: col for idx, col in enumerate(row)}
                    # NOTE(review): writes to the fixed index 'gongan' / type
                    # 'test-type', not self.index_name / self.index_type --
                    # confirm whether that is intended.
                    self.es.index(index='gongan', doc_type='test-type', body=body_data)
        except Exception as e:
            print("Error: unable to fecth data")
            # Show the cause as well so the failure can be diagnosed.
            print(e)
        finally:
            db.close()

    def delete_index_data(self, id):
        """Delete the document with the given id and print the response.

        :param id: document id
        """
        res = self.es.delete(index=self.index_name, doc_type=self.index_type, id=id)
        print(res)

    def get_data_by_id(self, id):
        """Fetch one document by id and print its ``_source``.

        :param id: document id
        """
        _searched = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
        print(_searched['_source'])
        print('--' * 50)

    def get_data_by_body(self, doc):
        """Run the query ``doc`` and print the news fields of every hit.

        Prints ``date``, ``source``, ``link``, ``keyword`` and ``title`` per
        hit; a field missing from a hit is printed as ``None``.

        :param doc: search request body
        """
        _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=doc)
        fields = ('date', 'source', 'link', 'keyword', 'title')
        for hit in _searched['hits']['hits']:
            source = hit['_source']
            # .get() so documents with a different schema do not raise KeyError.
            print(*(source.get(field) for field in fields))

    # Optional decorator, currently disabled: @excute_time_log
    def get_data_by_para(self, search_name, corpus_type, topk=10):
        """Return the ``standard_name`` of the top hits for a name within one corpus.

        The query requires both a ``match`` on ``standard_name`` and a
        ``term`` on ``corpus_type``.

        :param search_name: text matched against ``standard_name``
        :param corpus_type: exact value required for ``corpus_type``
        :param topk: value sent as the request ``size``
        :return: list of ``standard_name`` values, in the order returned
        """
        body_data = {
            "size": topk,
            "query": {
                "bool": {
                    "must": [
                        {"match": {"standard_name": search_name}},
                        {"term": {"corpus_type": corpus_type}},
                    ]
                }
            }
        }
        _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=body_data)
        return [item["_source"]["standard_name"] for item in _searched["hits"]["hits"]]
if __name__ == '__main__':
    # Demo driver: load three corpus files into the "test" index, then print
    # every document through a match_all query.

    # Names and mapping for the medical-corpus setup; only needed by the
    # commented-out create_index() call below.
    _index_name = "medical_corpus"
    _index_type = "doc_type_test"
    _index_mappings = {
        "mappings": {
            _index_type: {
                "properties": {
                    # "id": {
                    #     "type": "long",
                    #     "index": "false"
                    # },
                    "icd_code": {
                        "type": "keyword"
                    },
                    "standard_name": {
                        "type": "text",
                        # "analyzer": "standard",
                        # "search_analyzer": "standard"
                    },
                    "corpus_type": {
                        "type": "keyword"
                    }
                }
            }
        }
    }

    # The original built ElasticObj(_index_name, _index_type) first and then
    # immediately replaced it; only the instance that was actually used is kept.
    # obj = ElasticObj(_index_name, _index_type)
    obj = ElasticObj("test", "test_type")
    # obj.delete_index()
    # obj.create_index(_index_mappings)

    corpus_names = ["drug.txt", "treatment.txt", "material.txt"]
    body_data_keys = ['icd_code', 'standard_name', 'corpus_type']
    sep = None
    for corpus_name in corpus_names:
        in_file = os.path.join(tmp_dir, "corpus_bm25", corpus_name)
        # The file name without extension is stored as the corpus_type value.
        obj.build_index_doc(in_file, body_data_keys, corpus_name.split(".")[0], sep)

    # obj.get_data_by_id(1)
    # obj.delete_index_data(1)
    # obj.get_data_by_id(2)
    # search_name = "組織鉗"
    # corpus_type = "material"
    # print(obj.get_data_by_para(search_name, corpus_type, topk=20))

    # Example request bodies. Each has its own name so none silently
    # overwrites another; only doc0 is executed below.
    doc0 = {'query': {'match_all': {}}}
    doc_title_match = {
        "query": {
            "match": {
                "title": "電視"
            }
        }
    }
    doc1 = {
        "query": {
            "multi_match": {
                "query": "網",
                "fields": ["source", "title"]
            }
        }
    }
    doc = {
        "size": 10,
        "query": {
            "bool": {
                "must": [
                    {
                        "term": {
                            "title": "人民"
                        }
                    },
                    {
                        "terms": {
                            "source": ["慧聰網", "人民電視"]
                        }
                    }
                ]
            }
        }
    }
    obj.get_data_by_body(doc0)
# 測試用:
""" in_list
in_list = [
{"date": "2017-09-13",
"source": "慧聰網",
"link": "http://info.broadcast.hc360.com/2017/09/130859749974.shtml",
"keyword": "電視",
"title": "付費 電視 行業面臨的轉型和挑戰"
},
{"date": "2017-09-13",
"source": "中國文明網",
"link": "http://www.wenming.cn/xj_pd/yw/201709/t20170913_4421323.shtml",
"keyword": "電視",
"title": "電視 專題片《巡視利劍》廣獲好評:鐵腕反腐凝聚黨心民心"
},
{"date": "2017-09-13",
"source": "人民電視",
"link": "http://tv.people.com.cn/BIG5/n1/2017/0913/c67816-29533981.html",
"keyword": "電視",
"title": "中國第21批赴剛果(金)維和部隊啓程--人民 電視 --人民網"
},
{"date": "2017-09-13",
"source": "站長之家",
"link": "http://www.chinaz.com/news/2017/0913/804263.shtml",
"keyword": "電視",
"title": "電視 盒子 哪個牌子好? 吐血奉獻三大選購祕笈"
}
]
# obj.bulk_index_data(in_list)
"""
""" mapping
"serial": {
"type": "keyword", # keyword不會進行分詞,text會分詞
"index": "false" # 不建索引
},
# tags可以存json格式,訪問tags.content
"tags": {
"type": "object",
"properties": {
"content": {"type": "keyword", "index": True},
"dominant_color_name": {"type": "keyword", "index": True},
"skill": {"type": "keyword", "index": True},
}
},
"status": {
"type": "long",
"index": True
},
"createTime": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"updateTime": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
}
"""