Python多線程爬蟲小demo
## class:線程類及實例說明
類 | 實例線程list | 用途 | 繼承自 |
---|---|---|---|
ThreadCrawlCom_URL | threads_crawlC = [ ] | 抓取商品url線程列表 | threading.Thread |
ThreadParseComUrl | threads_parseC = [] | 解析商品url線程列表 | threading.Thread |
ThreadCrawl | threads_crawl = [] | 抓取商品詳情線程列表 | threading.Thread |
ThreadParse | threads_parse = [] | 解析商品詳情線程列表 | threading.Thread |
ThreadStore | threads_Store = [] | 存儲數據線程列表 | threading.Thread |
代碼塊:
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : Spider2.py
# @Author: zjian
# @Date : 18-8-27
# @Contact :[email protected]
# @Software : PyCharm
import threading
import time
from queue import Empty, Queue

import pymysql
import requests
from lxml import etree

from Fst_MT_Spider.models.Settings import init_url
class ThreadCrawlCom_URL(threading.Thread):
    """Worker thread that downloads product-listing pages.

    Pulls a listing-page URL from ``PageurlQueue``, fetches it with
    ``requests`` and pushes the raw HTML onto ``ComUrlQueue`` for the
    URL-parsing threads. Loops until the module-level ``PC_EXIT_FLAG``
    is set by ``main()``.
    """

    def __init__(self, threadName, PageurlQueue, ComUrlQueue):
        super(ThreadCrawlCom_URL, self).__init__()
        self.threadName = threadName      # human-readable name used in log output
        self.PageurlQueue = PageurlQueue  # in: listing-page URLs to fetch
        self.ComUrlQueue = ComUrlQueue    # out: raw HTML of fetched listing pages
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

    def run(self):
        print('[INFO]啓動%s' % self.threadName)
        while not PC_EXIT_FLAG:
            try:
                # Non-blocking get so the exit flag is re-checked promptly.
                url = str(self.PageurlQueue.get(False))
            except Empty:
                continue
            try:
                content = requests.get(url=url, headers=self.headers).text
                time.sleep(1)  # throttle: be polite to the target server
                # Hand the listing-page HTML to the parser threads.
                self.ComUrlQueue.put(content)
            except requests.RequestException as e:
                # Log and drop the URL instead of the original bare except,
                # which also hid KeyboardInterrupt/SystemExit and real bugs.
                print('[WARN]%s 請求失敗 %s: %s' % (self.threadName, url, e))
        print('[INFO]%s線程結束' % self.threadName)
class ThreadCrawl(threading.Thread):
    """Worker thread that downloads product-detail pages.

    Pulls a product URL from ``urlQueue``, fetches the page and pushes the
    raw HTML onto ``dataQueue`` for the detail-parsing threads. Loops until
    the module-level ``C_EXIT_FLAG`` is set by ``main()``.
    """

    def __init__(self, threadName, dataQueue, urlQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName  # human-readable name used in log output
        self.dataQueue = dataQueue    # out: raw product-detail HTML
        self.urlQueue = urlQueue      # in: product-detail URLs to fetch
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

    def run(self):
        print('[INFO]啓動%s' % self.threadName)
        while not C_EXIT_FLAG:
            try:
                # Non-blocking get so the exit flag is re-checked promptly.
                url = str(self.urlQueue.get(False))
            except Empty:
                continue
            try:
                content = requests.get(url=url, headers=self.headers).text
                time.sleep(1)  # throttle requests to avoid hammering the site
                self.dataQueue.put(content)
            except requests.RequestException as e:
                # Log and drop the URL instead of the original bare except,
                # which also hid KeyboardInterrupt/SystemExit and real bugs.
                print('[WARN]%s 請求失敗 %s: %s' % (self.threadName, url, e))
        print('[INFO]%s線程結束' % self.threadName)
# 用以解析商品詳情頁
class ThreadParse(threading.Thread):
    """Worker thread that parses product-detail HTML into item dicts.

    Pulls raw detail-page HTML from ``dataQueue``, extracts the product
    fields with XPath and pushes an item dict onto ``resQueue`` for the
    store threads. Loops until the module-level ``P_EXIT_FLAG`` is set.
    """

    def __init__(self, threadName, dataQueue, resQueue):
        super(ThreadParse, self).__init__()
        self.threadName = threadName  # human-readable name used in log output
        self.dataQueue = dataQueue    # in: raw product-detail HTML
        self.resQueue = resQueue      # out: parsed item dicts

    def run(self):
        print('[INFO] 啓動 %s 線程' % self.threadName)
        while not P_EXIT_FLAG:
            try:
                # Non-blocking get so the exit flag is re-checked promptly.
                response = self.dataQueue.get(False)
            except Empty:
                continue
            try:
                self.parse(response)
            except Exception as e:
                # A malformed page no longer disappears silently; log it.
                print('[WARN] %s 解析失敗: %s' % (self.threadName, e))
        print('[INFO] %s 線程結束' % self.threadName)

    @staticmethod
    def _first(dom, path, default=''):
        """Return the first XPath match on *dom*, or *default* if none."""
        found = dom.xpath(path)
        return found[0] if found else default

    def parse(self, response):
        """Extract product fields from one detail page and enqueue the item.

        Missing optional fields now default to '' instead of raising
        IndexError — the original indexed ``[0]`` unconditionally, so one
        missing node dropped the whole item via the bare except in run().
        """
        dom = etree.HTML(response)
        title = ''.join(dom.xpath('//*[@id="goods_detail_1"]/h1/text()')).strip()
        bd = self._first(dom, '//*[@id="goods_detail_1"]/h1/em/a/span/text()')
        # Product SKU number ("商品貨號" on the page).
        goods_nums = self._first(dom, '//*[@id="goods_detail_1"]/ul/li[1]/span[1]/text()')
        o_price = self._first(dom, '//*[@id="goods_detail_1"]/ul/li[1]/del/text()')
        n_price = self._first(dom, '//*[@id="goods_detail_1"]/ul/li[2]/em/text()')
        sale_count = self._first(dom, '//*[@id="goods_detail_1"]/ul/li[3]/span/text()')
        comment_nums = self._first(dom, '//*[@id="goods_detail_1"]/ul/li[4]/span[2]/a/text()')
        score = self._first(dom, '//*[@id="goods_detail_1"]/ul/li[4]/span[1]/em/@title')
        brand = self._first(dom, '//*[@id="J_attrBrandName"]/text()')
        # Normalise whitespace/&nbsp; runs into comma-separated values.
        c_size = self._first(dom, '//*[@id="J_AttrUL"]/li[3]/text()').replace('\n', '').replace(' ', '').replace('\xa0', ',')
        color = self._first(dom, '//*[@id="J_AttrUL"]/li[4]/text()').replace('\n', '').replace(' ', '').replace('\xa0', ',')
        self.resQueue.put({
            'title': title,
            'bd': bd,
            'goods_nums': goods_nums,
            'o_price': o_price,
            'n_price': n_price,
            'sale_count': sale_count,
            'comment_nums': comment_nums,
            'score': score,
            'brand': brand,
            'c_size': c_size,
            'color': color,
        })
#現在需要一個解析商品url的類,
class ThreadParseComUrl(threading.Thread):
    """Worker thread that extracts product URLs from listing-page HTML.

    Pulls raw listing HTML from ``ComUrlQueue``, extracts up to 40 product
    links and pushes each absolute URL onto ``urlQueue`` for the detail
    crawlers. Loops until the module-level ``PP_EXIT_FLAG`` is set.
    """

    def __init__(self, threadName, urlQueue, ComUrlQueue):
        super(ThreadParseComUrl, self).__init__()
        self.threadName = threadName    # human-readable name used in log output
        self.urlQueue = urlQueue        # out: product-detail URLs
        self.ComUrlQueue = ComUrlQueue  # in: raw listing-page HTML

    def run(self):
        print('[INFO] 啓動 %s 線程' % self.threadName)
        while not PP_EXIT_FLAG:
            try:
                # Non-blocking get so the exit flag is re-checked promptly.
                response = self.ComUrlQueue.get(False)
            except Empty:
                continue
            try:
                self.parse(response)
            except Exception as e:
                # Log parse failures instead of silently swallowing them.
                print('[WARN] %s 解析失敗: %s' % (self.threadName, e))
        print('[INFO] %s 線程結束' % self.threadName)

    def parse(self, response):
        """Extract product links from one listing page (at most 40 slots)."""
        dom = etree.HTML(response)
        for i in range(1, 41):
            # Check each positional product slot; an explicit emptiness test
            # replaces the original try/except-everything around the [0] index.
            hrefs = dom.xpath('//*[@id="cate_right"]/div[3]/div[1]/div[{0}]/div/div[1]/a/@href'.format(i))
            if not hrefs:
                print('[info]當前第%s<div>節點不存在商品url' % i)
                continue
            # Enqueue the absolute product URL for the detail crawlers.
            self.urlQueue.put('http://www.handu.com/' + hrefs[0])
class ThreadStore(threading.Thread):
    """Worker thread that writes parsed items into MySQL.

    Pulls item dicts from ``resQueue`` and inserts them into the ``Hstyle``
    table with parameterised SQL. Loops until the module-level
    ``S_EXIT_FLAG`` is set.

    NOTE(review): every store thread opens its own MySQL connection; with
    a large thread pool a shared connection/pool would be much cheaper —
    confirm the intended thread count before changing.
    """

    def __init__(self, threadName, resQueue, lock):
        super(ThreadStore, self).__init__()
        self.threadName = threadName  # human-readable name used in log output
        self.resQueue = resQueue      # in: parsed item dicts
        self.lock = lock              # serialises INSERTs across store threads
        self.conn = pymysql.connect('localhost', 'root', '123456', 'databases_name', charset='utf8')
        self.cursor = self.conn.cursor()
        self.conn.ping(True)  # reconnect automatically if the link dropped

    def run(self):
        print('[INFO] 啓動%s線程進行存儲' % self.threadName)
        while not S_EXIT_FLAG:
            try:
                # Non-blocking get so the exit flag is re-checked promptly.
                item = self.resQueue.get(False)
            except Empty:
                continue
            self.store(item)
        print('[INFO] 結束%s存儲線程' % self.threadName)

    def store(self, item):
        """Insert one parsed item into the Hstyle table."""
        insert_sql = '''insert into Hstyle(title,bd,goods_nums,o_price,n_price,sale_count,score,comment_nums,brand,c_size,color)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        params = (item['title'], item['bd'], item['goods_nums'], item['o_price'], item['n_price'], item['sale_count'],
                  item['score'], item['comment_nums'], item['brand'], item['c_size'], item['color'])
        with self.lock:
            try:
                self.cursor.execute(insert_sql, params)
                self.conn.commit()
            except Exception as e:
                # Best-effort semantics: log the failed row and keep going.
                print(e)

    def __del__(self):
        # __del__ may run during interpreter shutdown when attributes or the
        # lock are already torn down — never let cleanup raise.
        try:
            with self.lock:
                self.cursor.close()
                self.conn.close()
        except Exception:
            pass
# Shutdown flags polled by the worker threads; main() flips each to True
# once the corresponding upstream queue has drained.
PC_EXIT_FLAG = False  # stops ThreadCrawlCom_URL workers
PP_EXIT_FLAG = False  # stops ThreadParseComUrl workers
C_EXIT_FLAG = False   # stops ThreadCrawl workers
P_EXIT_FLAG = False   # stops ThreadParse workers
S_EXIT_FLAG = False   # stops ThreadStore workers
def main():
    """Build the 5-stage crawl pipeline, run it, and shut it down in order.

    Stages (each a pool of threads, wired together by queues):
      1. ThreadCrawlCom_URL: listing-page URL -> raw listing HTML
      2. ThreadParseComUrl:  raw listing HTML -> product URLs
      3. ThreadCrawl:        product URL      -> raw detail HTML
      4. ThreadParse:        raw detail HTML  -> item dict
      5. ThreadStore:        item dict        -> MySQL

    Each stage is stopped by flipping its module-level *_EXIT_FLAG once its
    input queue has drained, then joined before the next stage is stopped.
    """
    # Declare all shutdown flags up front; they are set below, stage by stage.
    global PC_EXIT_FLAG, PP_EXIT_FLAG, C_EXIT_FLAG, P_EXIT_FLAG, S_EXIT_FLAG

    pageQueue = Queue()    # seed listing-page URLs
    ComUrlQueue = Queue()  # raw listing-page HTML (unparsed)
    urlQueue = Queue()     # product-detail URLs (parsed out of listings)
    dataQueue = Queue()    # raw product-detail HTML (unparsed)
    resQueue = Queue()     # parsed item dicts awaiting storage
    lock = threading.Lock()  # serialises DB writes in the store threads

    pageurl_list = init_url()  # start-page URLs from project settings
    for pageurl in pageurl_list:
        pageQueue.put(pageurl)

    def _spawn(cls, count, prefix, *args):
        # Build and start `count` workers of `cls`, named prefix0..prefix(count-1).
        workers = [cls(prefix + str(i), *args) for i in range(count)]
        for w in workers:
            w.start()
        return workers

    # Pool sizes: one listing crawler per seed URL; 100 for the later stages
    # because queue lengths are unknown up front (they grow as upstream
    # stages produce work); throughput is bounded mainly by network speed.
    threads_crawlC = _spawn(ThreadCrawlCom_URL, len(pageurl_list), 'Crawl_URL-thds-', pageQueue, ComUrlQueue)
    threads_parseC = _spawn(ThreadParseComUrl, 100, 'Parse_CC-thds-', urlQueue, ComUrlQueue)
    threads_crawl = _spawn(ThreadCrawl, 100, 'Crawl-thds-', dataQueue, urlQueue)
    threads_parse = _spawn(ThreadParse, 100, 'Parse-thds-', dataQueue, resQueue)
    threads_Store = _spawn(ThreadStore, 100, 'Store-thds-', resQueue, lock)

    def _wait_empty(q):
        # Poll with a short sleep instead of the original busy-wait
        # (`while not q.empty(): pass`), which pinned a CPU core.
        while not q.empty():
            time.sleep(0.1)

    # Ordered shutdown: drain each queue, flag its consumers, join them.
    _wait_empty(pageQueue)
    PC_EXIT_FLAG = True
    for t in threads_crawlC:
        t.join()

    _wait_empty(ComUrlQueue)
    PP_EXIT_FLAG = True
    for t in threads_parseC:
        t.join()

    _wait_empty(urlQueue)
    C_EXIT_FLAG = True
    for t in threads_crawl:
        t.join()

    _wait_empty(dataQueue)
    P_EXIT_FLAG = True
    for t in threads_parse:
        t.join()

    _wait_empty(resQueue)
    S_EXIT_FLAG = True
    for t in threads_Store:
        t.join()
if __name__ == '__main__':
    # Time the full pipeline run end-to-end.
    begin = time.time()
    main()
    print('[INFO]耗時%s' % (time.time() - begin))
C_EXIT_FLAG = False
P_EXIT_FLAG = False
S_EXIT_FLAG = False
PC_EXIT_FLAG = False
PP_EXIT_FLAG = False
這裏五個全局變量是分別控制各個線程退出循環的判定條件
程序及數據通路流程圖:
速度檢測:
更多關於爬蟲的代碼請訪問個人 GitHub,我有關於學習爬蟲的代碼和資料,有時間都會整理好放上去,覺得對你有幫助的話記得點個 star。(另外建議對測試的網站友好一點,學習的前提是不對對方服務器造成很大的負擔,謹記。)