爬蟲中建立module文件夾用於存放elasticsearch基本數據操作命令(建表)
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections
# Create the default elasticsearch-dsl connection to the local server.
# This must run before any DocType operation (original note garbled; it
# stressed that this step is essential).
connections.create_connection(hosts=["localhost"])
#定義數據類,繼承DocType,定義各個字段數據類型,在from elasticsearch_dsl import中導入需要的數據類型,包括字符串,整型,布爾等等
class LagouType(DocType):
    """Elasticsearch mapping for Lagou job postings.

    Each class attribute declares one field's data type (imported from
    elasticsearch_dsl): Text fields are analyzed with the ik_max_word
    Chinese analyzer, Keyword fields are stored unanalyzed, Date for
    timestamps.
    """
    job_name = Text(analyzer="ik_max_word")
    company = Text(analyzer="ik_max_word")
    url = Keyword()
    job_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    city = Keyword()
    experience = Text(analyzer="ik_max_word")
    education = Text(analyzer="ik_max_word")
    job_type = Keyword()
    label = Text(analyzer="ik_max_word")
    job_benefit = Text(analyzer="ik_max_word")
    job_description = Text(analyzer="ik_max_word")
    addr = Text(analyzer="ik_max_word")
    publish_time = Text(analyzer="ik_max_word")
    crawl_time = Date()

    # The nested Meta class (must be named Meta) binds this mapping to an
    # index and a doc_type ("table").
    class Meta:
        index = "lagou"
        doc_type = "job"
if __name__ == "__main__":
    # init() pushes the mappings defined above to Elasticsearch
    # (creates the index if needed).
    LagouType.init()
在pipeline中定製與Elasticsearch連接
1.直接寫在pipeline中,但是爬取的item不一定存入elasticsearch中或某數據庫中,並且值內容不一,容易混亂,配置性低
#pipeline中寫入
class Elasticsearch_pipeline(object):
    """Scrapy pipeline that writes each crawled item into Elasticsearch.

    Variant 1: the persistence logic lives inside the pipeline itself,
    which couples it to one document type (low configurability).
    """

    # LagouType fields copied verbatim from the item, in declaration order.
    _FIELDS = (
        "job_name", "company", "url", "job_id", "salary", "city",
        "experience", "education", "job_type", "label", "job_benefit",
        "job_description", "addr", "publish_time", "crawl_time",
    )

    def __init__(self):
        pass

    def process_item(self, item, spider):
        # Build the DSL document field by field, then persist it.
        doc = LagouType()
        for field_name in self._FIELDS:
            setattr(doc, field_name, item[field_name])
        doc.save()
        return item
2.在item中定製save_to_elasticsearch接口,並在pipeline中調用item方法,增強item的可配置性
#item方法
def save_to_elasticsearch(self):
    """Convert this item into a LagouType document and save it.

    Variant 2: the item carries its own persistence method, so the
    pipeline can stay generic and just call it.
    """
    field_names = (
        "job_name", "company", "url", "job_id", "salary", "city",
        "experience", "education", "job_type", "label", "job_benefit",
        "job_description", "addr", "publish_time", "crawl_time",
    )
    doc = LagouType()
    for name in field_names:
        setattr(doc, name, self[name])
    doc.save()
# Pipeline side of variant 2: delegate persistence to the item itself.
class Elasticsearch_pipeline(object):
    """Thin pipeline: the item knows how to store itself."""

    def __init__(self):
        pass

    def process_item(self, item, spider):
        # Calling item.save_to_elasticsearch() keeps this pipeline agnostic
        # of any concrete document type.
        item.save_to_elasticsearch()
        return item
# Enable the pipeline in settings.py (lower number = earlier in the order).
# NOTE: the original notes pasted this identical block twice; one copy is enough.
ITEM_PIPELINES = {
    'lagou_spider.pipelines.Elasticsearch_pipeline': 1,
}
記錄Elasticsearch與Python的各種查詢操作(基本與kibana中的elasticsearch操作相同可以照搬)
重點“ from elasticsearch import Elasticsearch”
Elasticsearch的查詢API接口:
client = Elasticsearch()
response = client.search(select sentence……)
class Elasticsearch_Option:
    """Worked examples of Elasticsearch query DSL via the Python client.

    Every method opens a client, runs one query against the "lagou" index,
    and (for some examples) prints each hit's _source.  The query bodies are
    the same JSON you would type into Kibana's console.
    """

    def __init__(self):
        pass

    def _search(self, body):
        # Shared plumbing: create a client and run the given DSL body
        # against the "lagou" index.
        client = Elasticsearch()
        return client.search(index="lagou", body=body)

    # NOTE 1: mind letter case — the analyzer lower-cases every token during
    # analysis, so analyzed fields match in lower case.
    # "match": the query text IS analyzed; any document whose tokens overlap
    # the analyzed query can be retrieved.
    def match_option(self):
        self._search({"query": {"match": {'title': 'C++後端工程師'}}})

    # "term": the supplied value is NOT analyzed — exact value match only.
    def term_option(self):
        self._search({"query": {"term": {'salary_min': '2000000'}}})

    # "terms": takes a list; a document matching any listed value is returned.
    def terms_option(self):
        # Mind the case of the listed values.
        self._search({"query": {"terms": {'title': ['python', 'java', 'c++']}}})

    # "from" / "size": result paging.
    def from_size_option(self):
        self._search({
            "query": {"match": {'title': '工程師'}},
            "from": 0,
            "size": 4,
        })

    # "match_all": return every document.
    def match_all_option(self):
        self._search({"query": {"match_all": {}}})

    # "match_phrase": phrase query — all analyzed tokens must appear together.
    def match_phrase_option(self):
        response = self._search({"query": {"match_phrase": {"title": 'python研發工程師'}}})
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # "multi_match": one query string matched against several fields.
    def multi_match_option(self):
        response = self._search({
            "query": {
                "multi_match": {
                    "query": "深圳",
                    # A hit in ANY of the listed fields returns the document.
                    "fields": ['title', 'city'],
                }
            }
        })
        # Note the structure of the response envelope.
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # Sorting: "sort" names the field to order by, then the direction.
    def sort_option(self):
        response = self._search({
            "query": {"match_all": {}},
            "sort": {"comment": {"order": "asc"}},
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # Range query — gte (>=), gt (>), lte (<=), lt (<); boost would weight it.
    def range_option(self):
        response = self._search({
            "query": {"range": {"comment": {"gt": 15, "lt": 20}}},
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # "wildcard": fuzzy pattern match; "*" is the wildcard character.
    def wildcard_option(self):
        response = self._search({
            "query": {"wildcard": {"title": {"value": "pyth*n"}}},
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])

    # bool query:
    #   filter   — restricts results without contributing to scoring
    #   must     — every clause must match ("AND")
    #   should   — at least one clause should match ("OR")
    #   must_not — no clause may match ("NOT")
    def bool_option(self):
        response = self._search({
            "query": {
                "bool": {
                    "must": [{"match_all": {}}],
                    "filter": {"term": {"title": '工程師'}},
                    "must_not": [{"match": {"comment": 16}}],
                    "should": [{"match": {"title": 'c'}}],
                }
            }
        })
        for hit in response['hits']['hits']:
            print(hit['_source'])