Elasticsearch+python學習

爬蟲中建立module文件夾,用於存放elasticsearch基本數據操作命令(建表)

from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean,analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections

# Create the connection to the Elasticsearch server on localhost; this step is essential.
connections.create_connection(hosts=["localhost"])

# Define the document class: inherit from DocType and declare the data type of
# each field. The field types (string, integer, boolean, ...) are imported from
# elasticsearch_dsl.
class LagouType(DocType):
    """Document definition for a job posting stored in the ``lagou`` index.

    Fields declared as ``Text`` use the analyzer named ``ik_max_word``;
    ``Keyword`` fields carry no analyzer; ``crawl_time`` is a ``Date``.
    NOTE(review): ``ik_max_word`` presumably comes from an analysis plugin that
    must be installed on the server — confirm.
    """
    job_name = Text(analyzer="ik_max_word")
    company = Text(analyzer="ik_max_word")
    url = Keyword()
    job_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    city = Keyword()
    experience = Text(analyzer="ik_max_word")
    education = Text(analyzer="ik_max_word")
    job_type = Keyword()
    label = Text(analyzer="ik_max_word")
    job_benefit = Text(analyzer="ik_max_word")
    job_description = Text(analyzer="ik_max_word")
    addr = Text(analyzer="ik_max_word")
    publish_time = Text(analyzer="ik_max_word")
    crawl_time = Date()


    # Bind the document to its index and doc type. The nested class must be
    # named Meta; it supplies the index name and the type ("table") name.
    class Meta:
        index = "lagou"
        doc_type = "job"

if __name__ == "__main__":
    # Call init() to create the mappings for LagouType.
    LagouType.init()

在pipeline中定製與Elasticsearch連接

1.直接寫在pipeline中,但是爬取的item不一定存入elasticsearch中或某數據庫中,並且值內容不一,容易混亂,配置性低

# Written in the pipeline module
class Elasticsearch_pipeline(object):
    """Item pipeline that copies each item into a ``LagouType`` document and saves it.

    The original snippet had the ``class`` line and its methods at the same
    indentation level, which is not valid Python; the structure is restored here.
    """

    # Item keys copied one-to-one onto the document, in the original order.
    _FIELDS = (
        'job_name',
        'company',
        'url',
        'job_id',
        'salary',
        'city',
        'experience',
        'education',
        'job_type',
        'label',
        'job_benefit',
        'job_description',
        'addr',
        'publish_time',
        'crawl_time',
    )

    def __init__(self):
        """No pipeline-level state is needed."""

    def process_item(self, item, spider):
        """Store ``item`` through ``LagouType.save()`` and return it unchanged.

        Args:
            item: mapping-like item; every key in ``_FIELDS`` is read from it.
            spider: unused here.

        Returns:
            The same ``item`` object that was passed in.

        Raises:
            KeyError: if ``item`` lacks one of the expected keys (raised by the
                ``item[field]`` lookup for a plain mapping).
        """
        lagou = LagouType()
        for field in self._FIELDS:
            setattr(lagou, field, item[field])
        lagou.save()

        return item

2.在item中定製save_to_elasticsearch接口,並在pipeline中調用item方法,增強item的可配置性

    #item方法
    def save_to_elasticsearch(self):
        """Copy this item's fields onto a new ``LagouType`` document and save it."""
        # Keys are read from the item and set on the document in this order.
        field_names = (
            'job_name', 'company', 'url', 'job_id', 'salary',
            'city', 'experience', 'education', 'job_type', 'label',
            'job_benefit', 'job_description', 'addr', 'publish_time',
            'crawl_time',
        )
        # Instantiate the document class (not subclass it).
        document = LagouType()
        for field_name in field_names:
            setattr(document, field_name, self[field_name])
        document.save()

# Pipeline side: delegate persistence to the item itself
class Elasticsearch_pipeline(object):
    """Item pipeline that asks each item to store itself."""

    def __init__(self):
        """No pipeline-level state is needed."""

    def process_item(self, item, spider):
        """Invoke ``item.save_to_elasticsearch()`` and return the same item.

        ``spider`` is accepted but not used.
        """
        item.save_to_elasticsearch()
        return item

# Enable the item pipeline in settings.
# Maps the pipeline's import path to its order number. The original snippet was
# indented at top level (an IndentationError) and was pasted twice; it is kept once.
ITEM_PIPELINES = {
    'lagou_spider.pipelines.Elasticsearch_pipeline': 1,
}

記錄Elasticsearch與Python的各種查詢操作(基本與kibana中的elasticsearch操作相同可以照搬)

重點:需先導入 "from elasticsearch import Elasticsearch"

Elasticsearch的查詢API接口:

client = Elasticsearch()

response = client.search(select sentence……)

class Elasticsearch_Option:
    """Container class for the query examples; holds no state."""

    def __init__(self):
        """Nothing to initialise."""

#注意點1:注意大小寫,進行分詞分析時,elasticsearch的分詞器會自動把所有詞變成小寫


# match query: the value passed to "match" is analyzed (tokenized); documents
# matching the resulting tokens can be retrieved.
def match_option(self):
    """Run a ``match`` query on ``title`` against the ``lagou`` index.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.
    NOTE(review): ``title`` is not among the fields declared on ``LagouType``
    in this file — confirm the index mapping.

    Returns:
        Whatever ``client.search`` returns. The original assigned it to an
        unused local and dropped it.
    """
    client = Elasticsearch()
    query_body = {
        "query": {
            "match": {
                'title': 'C++後端工程師'
            }
        }
    }
    return client.search(index="lagou", body=query_body)

# term query: the value passed to "term" is NOT analyzed (no tokenization).
def term_option(self):
    """Run a ``term`` query on ``salary_min`` against the ``lagou`` index.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.

    Returns:
        Whatever ``client.search`` returns. The original assigned it to an
        unused local and dropped it.
    """
    client = Elasticsearch()
    query_body = {
        "query": {
            "term": {
                'salary_min': '2000000'
            }
        }
    }
    return client.search(index="lagou", body=query_body)

# terms query: accepts a list; documents matching any value in the list can be
# retrieved.
def terms_option(self):
    """Run a ``terms`` query on ``title`` against the ``lagou`` index.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.

    Returns:
        Whatever ``client.search`` returns. The original assigned it to an
        unused local and dropped it.
    """
    client = Elasticsearch()
    query_body = {
        "query": {
            "terms": {
                # Be very careful about letter case here.
                'title': ['python', 'java', 'c++']
            }
        }
    }
    return client.search(index="lagou", body=query_body)

# Usage of "from" and "size"
def from_size_option(self):
    """Run a ``match`` query on ``title`` with ``from`` 0 and ``size`` 4.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.

    Returns:
        Whatever ``client.search`` returns. The original assigned it to an
        unused local and dropped it.
    """
    client = Elasticsearch()
    query_body = {
        "query": {
            "match": {
                'title': '工程師'
            }
        },
        "from": 0,
        "size": 4
    }
    return client.search(index="lagou", body=query_body)

# match_all query
def match_all_option(self):
    """Run a ``match_all`` query against the ``lagou`` index.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.

    Returns:
        Whatever ``client.search`` returns. The original assigned it to an
        unused local and dropped it.
    """
    client = Elasticsearch()
    query_body = {
        "query": {
            "match_all": {}
        }
    }
    return client.search(index="lagou", body=query_body)

# match_phrase: phrase query
def match_phrase_option(self):
    """Run a ``match_phrase`` query on ``title`` and print each hit's ``_source``.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.
    """
    phrase_query = {
        "query": {
            "match_phrase": {
                "title": 'python研發工程師'
            }
        }
    }
    es = Elasticsearch()
    result = es.search(index="lagou", body=phrase_query)
    for hit in result['hits']['hits']:
        print(hit['_source'])


# multi_match query: one query condition searched across several columns (fields)
def multi_match_option(self):
    """Run a ``multi_match`` query over ``title`` and ``city`` and print each hit's ``_source``.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.
    """
    # A document is returned when any of the listed fields contains the
    # keyword given in "query".
    searched_fields = ['title', 'city']
    multi_query = {
        "query": {
            "multi_match": {
                "query": "深圳",
                "fields": searched_fields
            }
        }
    }
    es = Elasticsearch()
    result = es.search(index="lagou", body=multi_query)
    # Pay close attention to the structure of the response.
    for hit in result['hits']['hits']:
        print(hit['_source'])

# Sorting
def sort_option(self):
    """Run a ``match_all`` query sorted by ``comment`` ascending and print each hit's ``_source``.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.
    NOTE(review): ``comment`` is not among the fields declared on ``LagouType``
    in this file — confirm the index mapping.
    """
    # Under "sort", first name the field to sort by, then give its order.
    sort_spec = {
        "comment": {
            "order": "asc"
        }
    }
    sorted_query = {
        "query": {
            "match_all": {}
        },
        "sort": sort_spec
    }
    es = Elasticsearch()
    result = es.search(index="lagou", body=sorted_query)
    # Pay close attention to the structure of the response.
    for hit in result['hits']['hits']:
        print(hit['_source'])

# Range query. gte: greater than or equal; gt: greater than;
# lte: less than or equal; lt: less than; boost: weight.
def range_option(self):
    """Run a ``range`` query (15 < ``comment`` < 20) and print each hit's ``_source``.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.
    """
    # The key under "range" is the field whose range is being constrained.
    bounds = {
        "gt": 15,
        "lt": 20
    }
    range_query = {
        "query": {
            "range": {
                "comment": bounds
            }
        }
    }
    es = Elasticsearch()
    result = es.search(index="lagou", body=range_query)
    # Pay close attention to the structure of the response.
    for hit in result['hits']['hits']:
        print(hit['_source'])

# wildcard: fuzzy (pattern) query
def wildcard_option(self):
    """Run a ``wildcard`` query for ``pyth*n`` on ``title`` and print each hit's ``_source``.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.
    """
    # The key under "wildcard" is the field to match; "*" marks the wildcard.
    pattern = {
        "value": "pyth*n"
    }
    wildcard_query = {
        "query": {
            "wildcard": {
                "title": pattern
            }
        }
    }
    es = Elasticsearch()
    result = es.search(index="lagou", body=wildcard_query)
    # Pay close attention to the structure of the response.
    for hit in result['hits']['hits']:
        print(hit['_source'])


# bool query
# filter:   filters on a field without taking part in scoring; drops whatever
#           does not satisfy the clause
# must:     every condition in the array has to be satisfied ("AND")
# should:   one or more conditions in the array are satisfied ("OR")
# must_not: none of the conditions in the array may be satisfied ("NOT")

def bool_option(self):
    """Run a ``bool`` query combining must / filter / must_not / should and print each hit's ``_source``.

    NOTE(review): takes ``self`` but is defined at module level; presumably
    meant to be a method of ``Elasticsearch_Option`` — confirm.
    """
    must_clauses = [{
        "match_all": {}
    }]
    filter_clause = {
        "term": {
            "title": '工程師'
        }
    }
    must_not_clauses = [{
        "match": {
            "comment": 16
        }
    }]
    should_clauses = [{
        "match": {
            "title": 'c'
        }
    }]
    bool_query = {
        "query": {
            "bool": {
                "must": must_clauses,
                "filter": filter_clause,
                "must_not": must_not_clauses,
                "should": should_clauses
            }
        }
    }
    es = Elasticsearch()
    result = es.search(index="lagou", body=bool_query)
    # Pay close attention to the structure of the response.
    for hit in result['hits']['hits']:
        print(hit['_source'])
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章