爬蟲中建立module文件夾用於存放elasticsearch基本數據操作命令(建表)
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections
# Create the default elasticsearch-dsl connection to the local server.
# This must run before any DocType operation (original note garbled; it
# stressed that this step is essential).
connections.create_connection(hosts=["localhost"])
#定義數據類,繼承DocType,定義各個字段數據類型,在from elasticsearch_dsl import中導入需要的數據類型,包括字符串,整型,布爾等等
class LagouType(DocType):
    """Elasticsearch mapping for Lagou job postings.

    Each class attribute declares one field's data type (imported from
    elasticsearch_dsl): Text fields are analyzed with the ik_max_word
    Chinese analyzer, Keyword fields are stored unanalyzed, Date for
    timestamps.
    """
    job_name = Text(analyzer="ik_max_word")
    company = Text(analyzer="ik_max_word")
    url = Keyword()
    job_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    city = Keyword()
    experience = Text(analyzer="ik_max_word")
    education = Text(analyzer="ik_max_word")
    job_type = Keyword()
    label = Text(analyzer="ik_max_word")
    job_benefit = Text(analyzer="ik_max_word")
    job_description = Text(analyzer="ik_max_word")
    addr = Text(analyzer="ik_max_word")
    publish_time = Text(analyzer="ik_max_word")
    crawl_time = Date()

    # The nested Meta class (must be named Meta) binds this mapping to an
    # index and a doc_type ("table").
    class Meta:
        index = "lagou"
        doc_type = "job"
if __name__ == "__main__":
    # init() pushes the mappings defined above to Elasticsearch
    # (creates the index if needed).
    LagouType.init()
在pipeline中定製與Elasticsearch連接
1.直接寫在pipeline中,但是爬取的item不一定存入elasticsearch中或某數據庫中,並且值內容不一,容易混亂,配置性低
#pipeline中寫入
class Elasticsearch_pipeline(object):
    """Scrapy pipeline that writes each crawled item into Elasticsearch.

    Variant 1: the persistence logic lives inside the pipeline itself,
    which couples it to one document type (low configurability).
    """

    # LagouType fields copied verbatim from the item, in declaration order.
    _FIELDS = (
        "job_name", "company", "url", "job_id", "salary", "city",
        "experience", "education", "job_type", "label", "job_benefit",
        "job_description", "addr", "publish_time", "crawl_time",
    )

    def __init__(self):
        pass

    def process_item(self, item, spider):
        # Build the DSL document field by field, then persist it.
        doc = LagouType()
        for field_name in self._FIELDS:
            setattr(doc, field_name, item[field_name])
        doc.save()
        return item
2.在item中定製save_to_elasticsearch接口,並在pipeline中調用item方法,增強item的可配置性
#item方法
def save_to_elasticsearch(self):
    """Convert this item into a LagouType document and save it.

    Variant 2: the item carries its own persistence method, so the
    pipeline can stay generic and just call it.
    """
    field_names = (
        "job_name", "company", "url", "job_id", "salary", "city",
        "experience", "education", "job_type", "label", "job_benefit",
        "job_description", "addr", "publish_time", "crawl_time",
    )
    doc = LagouType()
    for name in field_names:
        setattr(doc, name, self[name])
    doc.save()
# Pipeline side of variant 2: delegate persistence to the item itself.
class Elasticsearch_pipeline(object):
    """Thin pipeline: the item knows how to store itself."""

    def __init__(self):
        pass

    def process_item(self, item, spider):
        # Calling item.save_to_elasticsearch() keeps this pipeline agnostic
        # of any concrete document type.
        item.save_to_elasticsearch()
        return item
# Enable the pipeline in settings.py (lower number = earlier in the order).
# NOTE: the original notes pasted this identical block twice; one copy is enough.
ITEM_PIPELINES = {
    'lagou_spider.pipelines.Elasticsearch_pipeline': 1,
}
記錄Elasticsearch與Python的各種查詢操作(基本與kibana中的elasticsearch操作相同可以照搬)
重點“ from elasticsearch import Elasticsearch”
Elasticsearch的查詢API接口:
client = Elasticsearch()
response = client.search(select sentence……)
class Elasticsearch_Option:
    """Worked examples of Elasticsearch query DSL via the Python client.

    Every method opens a client, runs one query against the "lagou" index,
    and (for some examples) prints each hit's _source.  The query bodies are
    the same JSON you would type into Kibana's console.
    """

    def __init__(self):
        pass

    def _search(self, body):
        # Shared plumbing: create a client and run the given DSL body
        # against the "lagou" index.
        client = Elasticsearch()
        return client.search(index="lagou", body=body)

    # NOTE 1: mind letter case — the analyzer lower-cases every token during
    # analysis, so analyzed fields match in lower case.
    # "match": the query text IS analyzed; any document whose tokens overlap
    # the analyzed query can be retrieved.
    def match_option(self):
        self._search({"query": {"match": {'title': 'C++後端工程師'}}})

    # "term": the supplied value is NOT analyzed — exact value match only.
    def term_option(self):
        self._search({"query": {"term": {'salary_min': '2000000'}}})

    # "terms": takes a list; a document matching any listed value is returned.
    def terms_option(self):
        # Mind the case of the listed values.
        self._search({"query": {"terms": {'title': ['python', 'java', 'c++']}}})

    # "from" / "size": result paging.
    def from_size_option(self):
        self._search({
            "query": {"match": {'title': '工程師'}},
            "from": 0,
            "size": 4,
        })

    # "match_all": return every document.
    def match_all_option(self):
        self._search({"query": {"match_all": {}}})

    # "match_phrase": phrase query — all analyzed tokens must appear together.
    def match_phrase_option(self):
        response = self._search({"query": {"match_phrase": {"title": 'python研發工程師'}}})
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # "multi_match": one query string matched against several fields.
    def multi_match_option(self):
        response = self._search({
            "query": {
                "multi_match": {
                    "query": "深圳",
                    # A hit in ANY of the listed fields returns the document.
                    "fields": ['title', 'city'],
                }
            }
        })
        # Note the structure of the response envelope.
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # Sorting: "sort" names the field to order by, then the direction.
    def sort_option(self):
        response = self._search({
            "query": {"match_all": {}},
            "sort": {"comment": {"order": "asc"}},
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # Range query — gte (>=), gt (>), lte (<=), lt (<); boost would weight it.
    def range_option(self):
        response = self._search({
            "query": {"range": {"comment": {"gt": 15, "lt": 20}}},
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # "wildcard": fuzzy pattern match; "*" is the wildcard character.
    def wildcard_option(self):
        response = self._search({
            "query": {"wildcard": {"title": {"value": "pyth*n"}}},
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # bool query:
    #   filter   — restricts results without contributing to scoring
    #   must     — every clause must match ("AND")
    #   should   — at least one clause should match ("OR")
    #   must_not — no clause may match ("NOT")
    def bool_option(self):
        response = self._search({
            "query": {
                "bool": {
                    "must": [{"match_all": {}}],
                    "filter": {"term": {"title": '工程師'}},
                    "must_not": [{"match": {"comment": 16}}],
                    "should": [{"match": {"title": 'c'}}],
                }
            }
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])