# threading module for the crawl/parse worker threads
import threading
# thread-safe FIFO queues shared between the threads
import queue
import requests
from lxml import etree
import time
import random
import json
import pymongo

# number of crawl (fetch) threads
concurrent = 3
# number of parse threads
conparse = 3
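# Both counts are deliberately small: combined with the random sleep in
# Crawl.run() they keep the request rate low, on the assumption that JD
# throttles or blocks aggressive clients. Tune them with care.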


class Crawl(threading.Thread):
    """
    Fetch thread: pulls a URL from the request queue and pushes the
    raw response body onto the data queue.
    """
    def __init__(self, number, req_list, data_list):
        # initialize the Thread parent class
        super(Crawl, self).__init__()
        # thread number, request queue and data queue
        self.number = number
        self.req_list = req_list
        self.data_list = data_list
        self.headers = {
            "authority": "search.jd.com",
            "method": "GET",
            "path": "/s_new.php?keyword=iphone&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0",
            "scheme": "https",
            "cookie": "shshshfpa=817b0820-cebe-1aa5-5eaa-8f8d3b5945b3-1564123066; shshshfpb=nyq8Po%20NsVQIn1ih2zomJlw%3D%3D; TrackID=167xINX9YcKrGWU-JQ0EQWhwYstFi3gNOxFq6Em4_l6J6OECY5-pwyzHxRFr6TTZkLHI1m_3orstgEzPhWGk1pkbfG_ASOMSSscDY_oEz4XQ; pinId=DRR64H7p6D2CxuR9knABB7V9-x-f3wj7; qrsc=3; __jdu=502413746; areaId=2; PCSYCityID=CN_310000_310100_310112; xtest=3925.cf6b6759; ipLoc-djd=2-2825-51931-0; rkv=V0600; user-key=88fa50f5-ec38-48ff-9efe-beecaa5ffc96; cn=0; unpl=V2_ZzNtbRdfQEF8DRMDeR9ZDGIHFAhKUhcRd1tBVnNLXAcwB0FbclRCFX0UR1xnGlgUZwMZWEpcRxVFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8YXwBnARRYRWdzEkU4dlB8G1oEVwIiXHIVF0l1CkJRfxkRAWYAF11AUUYSRQl2Vw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_d92e84df3748457d94e53b639c13f5b7|1569383391707; __jda=122270672.502413746.1564123065.1569380256.1569383392.14; __jdc=122270672; shshshfp=f5c7274df7eb1773b86a72faa494fed4; 3AB9D23F7A4B3C9B=SHZYIPK2KZYMKRXEKXFSLWSYQOVLA745EZ4NXQTNMFLQMVFRJKZT7VYDEIY6L2USE2KDEOHO2IPNACME4W7GJ2LKTM",
            "referer": "https://search.jd.com/Search?keyword=iphone&enc=utf-8&suggest=1.his.0.0&wq=&pvid=ec4030074500424391b371d06a8a62fd",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }
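        # Note: these headers (cookie included) appear to have been captured
        # from a live browser session in DevTools; the cookie expires, so
        # expect to refresh it before each run.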

    # called when the thread starts
    def run(self):
        print("Crawl thread {} started".format(self.number))
        # keep pulling URLs until the request queue is drained;
        # get_nowait() avoids the race between a qsize() check and a
        # blocking get() when several crawl threads share the queue
        while True:
            try:
                url = self.req_list.get_nowait()
            except queue.Empty:
                break
            print("Thread {} fetching: {}".format(self.number, url))
            # sleep a random 1-3 s so requests are not sent too fast
            time.sleep(random.randint(1, 3))
            # issue the HTTP request and, on success, append the response
            # body to the data queue so a parse thread can pick it up
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                self.data_list.put(response.text)
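
    # An alternative shutdown design (a sketch, not what this script uses):
    # have run() call req_list.task_done() after each URL and let main()
    # block on req_list.join(), instead of the crawl-thread liveness polling
    # done in Parse.run() below.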


class Parse(threading.Thread):
    """
    Parse thread: pulls response bodies off the data queue and extracts
    the product fields from them.
    """
    def __init__(self, number, data_list, req_thread, f):
        super(Parse, self).__init__()
        # thread number
        self.number = number
        # data queue
        self.data_list = data_list
        # list of crawl threads, used to check whether fetching is
        # still in progress
        self.req_list = req_thread
        # output file object
        self.f = f
        # whether to keep pulling data from the data queue
        self.is_parse = True

    def run(self):
        print("Parse thread {} started".format(self.number))
        # infinite loop; the exit condition is re-checked on every pass
        while True:
            # a parse thread may only stop once every crawl thread has
            # finished AND the data queue is empty
            for t in self.req_list:
                # check whether this crawl thread is still alive
                if t.is_alive():
                    # a live crawl thread means more data may still arrive,
                    # so skip the shutdown check and restart the while loop
                    break
            else:  # no break: every crawl thread has finished
                if self.data_list.qsize() == 0:
                    # nothing left to parse; allow the loop to exit
                    self.is_parse = False
            # keep parsing while there may still be data
            if self.is_parse:
                try:
                    # pull one response body off the data queue
                    data = self.data_list.get(timeout=3)
                except queue.Empty:  # get() timed out
                    data = None
                # if data was fetched successfully, parse it
                if data is not None:
                    self.parse(data)
            else:
                # exit the infinite while loop
                break
        print("Parse thread {} exiting".format(self.number))

    def parse(self, data):
        html = etree.HTML(data)
        # get every product <li> node
        lis = html.xpath('//li[contains(@class,"gl-item")]')
        for li in lis:
            href = li.xpath('div/div[@class="p-img"]/a/@href')
            title = li.xpath('div/div[@class="p-img"]/a/@title')
            price = li.xpath('div/div[@class="p-price"]/strong/i/text()')
            name = li.xpath('div/div[@class="p-name p-name-type-2"]/a/em/text()')
            commit = li.xpath('div/div[@class="p-commit"]//text()')
            shop = li.xpath('div/div[@class="p-shop"]/span/a/text()')
            icons = li.xpath('div/div[@class="p-icons"]/i/text()')
            stock = li.xpath('div/div[@class="p-stock"]/text()')
            # skip items whose link or title is missing, to avoid an
            # IndexError on href[0] / title[0]
            if not href or not title:
                continue
            item = {
                "href": "https:" + href[0],
                "title": str(title[0]).replace("\r\n\t", ""),
                "price": " ".join(price),
                "name": str(" ".join(name)).replace("\r\n\t", "").strip(),
                "commit": str(" ".join(commit)).replace("\r\n\t", "").strip(),
                "shop": " ".join(shop),
                "icons": " ".join(icons),
                "stock": " ".join(stock)
            }
            # one JSON object per line in the output file
            self.f.write(json.dumps(item, ensure_ascii=False) + "\n")
            self.save_to_mongo(item)
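
    # Note: the XPath selectors above match JD's search-result markup as it
    # was at the time of writing; if JD changes the page layout they will
    # need to be updated.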

    def save_to_mongo(self, result):
        MONGO_URL = "localhost"
        MONGO_DB = "JingDong"
        MONGO_COLLECTION = "multithreading_iphone"
        client = pymongo.MongoClient(MONGO_URL)
        db = client[MONGO_DB]
        try:
            db[MONGO_COLLECTION].insert_one(result)
        except Exception:
            print("Failed to store item in MongoDB")


def main():
    # create the request queue (URLs waiting to be fetched)
    req_list = queue.Queue()
    # create the data queue; response bodies land here after each request
    data_list = queue.Queue()
    # create the output file object
    f = open(r"G:\個人總結\csv\JD_Iphone.json", "w", encoding="utf-8")
    # generate the request URLs in a loop
    ms = time.time()
    for i in range(100):
        base_url = None
        if (i + 1) % 2 != 0:
            # odd page numbers: the first half of a result page
            base_url = "https://search.jd.com/s_new.php?keyword=iphone&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0&page={0}&s={1}&click=0".format(
                i + 1, i * 30 + 1)
        else:
            # even page numbers: the lazily loaded second half, which the
            # browser requests with extra scrolling/log_id parameters
            base_url = "https://search.jd.com/s_new.php?keyword=iphone&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0&page={0}&s={1}&scrolling=y&log_id={2}".format(
                i + 1, i * 30 + 1, '%.5f' % (ms)
            )
        req_list.put(base_url)
    # spawn N crawl threads
    req_thread = []
    for i in range(concurrent):
        t = Crawl(i + 1, req_list, data_list)
        t.start()
        req_thread.append(t)
    # spawn N parse threads
    parse_thread = []
    for i in range(conparse):
        t = Parse(i + 1, data_list, req_thread, f)
        t.start()
        parse_thread.append(t)
    # wait for every thread to finish before closing the output file
    for t in req_thread:
        t.join()
    for tt in parse_thread:
        tt.join()
    f.close()
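
# Running this script assumes MongoDB is reachable on localhost:27017 and
# that the output directory G:\個人總結\csv exists; the hard-coded cookie in
# Crawl.headers will also need refreshing once it expires.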

if __name__ == "__main__":
    main()