pycharm+chromedriver+bs4+re+threading+queue模擬登錄小說多線程爬蟲
首先要安裝selenium,BeautifulSoup庫,下載對應的chromedriver版本
一般在cmd裏面pip install ×××,作者是用pycharm解釋器的,裏面可用搜索庫名進行下載,寫python爬蟲一般推薦這個
而chromedriver版本對應關係可以自行百度,下載完安裝到自己選定的路徑,並記得在下面的代碼修改路徑
具體步驟如下:
①首先是根據輸入的小說名模擬登錄網站http://www.biquge.tv/進行模擬檢索,如有多種可能會生成一個選擇表格 (如果只有一本檢索結果,則會跳過這個選擇步驟,直接進行下一步)
pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
contents1 = re.findall(pattern1, driver.page_source)
pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
contents2 = re.findall(pattern2, driver.page_source)
if len(contents2) and len(contents1):
URLlist = []
namelist = []
authorlist = []
for content in contents1:
URLlist.append(content[0])
namelist.append(content[1])
flag = False
for content in contents2:
if flag == True:
authorlist.append(content)
flag = False
else:
flag = True
print('小說網站搜索的結果如下:')
print('\t' + '編號' + '\t\t' + '小說' + '\t\t' + '作者' + '\t')
num = 1
for name, author, in zip(namelist, authorlist):
print('\t' + str(num) + '\t\t' + name + '\t\t' + author + '\t')
num += 1
step = int(input('請選擇所需的小說,輸入對應的編號:'))
want_url = str(URLlist[step - 1])
②爬取所需要下載的小說各個章節的URL,將其依次存入隊列
在第一步中獲得對應網頁的URL進行隊列存儲
driver.get(want_url)
page_source = driver.page_source
pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
All_html = re.findall(pattern2, page_source)
for ones in All_html[9:]:
part_url = ones[0]
title = ones[1]
print(title + '+' + base_url + part_url)
q.put(title + '+' + base_url + part_url)
driver.close()
driver.quit()
③首先會打印出章節名字和對應的URL,這可以檢查是否成功爬到所需的小說,選擇所需要的線程數量,一般跟自己的電腦cpu性能有很大關係,選擇20~40就夠了
下面是線程的生成與最後的結束關閉線程
threadnum = int(input('請輸入所要開啓的爬蟲線程數量:'))
start_time = time.time()
for i in range(1, threadnum + 1, 1):
threadList.append('Spider_Thread-' + str(i))
queueLock = threading.Lock() # 避免多個線程同時操作同一塊數據的時候,產生錯誤,所以加鎖
threads = []
threadID = 1
# 創建新線程
for tName in threadList:
thread = myThread(threadID, tName, q)
thread.start()
threads.append(thread)
threadID += 1
# 等待隊列清空
while not q.empty():
pass
# 通知線程是時候退出
exitFlag = 1
# 等待所有線程完成
for t in threads:
t.join()
print(t.name + '退出成功')
④選擇完線程數 啓動成功後,就開始爬取各章節到指定的文件夾
此外,最重要的多線程重載threading如下,一般把主要的運行代碼放在重載run(self)函數裏面
class myThread(threading.Thread): # 繼承父類threading.Thread
def __init__(self, threadID, name, counter):
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.counter = counter
def run(self):
# 把要執行的代碼寫到run函數裏面 線程在創建後會直接運行run函數
print(self.name+'啓動成功')
while not exitFlag:
queueLock.acquire() #鎖定線程
if not q.empty():
item = q.get()
queueLock.release() #釋放線程
title = item.split('+')[0]
href = item.split('+')[1]
get_content(title, href)
else:
print('數據全部結束')
queueLock.release()# 釋放線程
還需要記住開啓線程鎖,防止衝突,定義後,在run函數內使用 ,如上圖
queueLock = threading.Lock() # 避免多個線程同時操作同一塊數據的時候,產生錯誤,所以加鎖
好了,具體全部代碼如下(只需更改chromedriver安裝的路徑和存儲小說的文件夾路徑就可以運行了):
import queue
import threading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup
import time
# Multithreaded scraper for novels on biquge (www.biquge.tv), with an
# interactive search step to pick the novel to download.
# @author Himit_ZH
# qq:372347736
exitFlag = 0          # set to 1 by the main thread to tell workers to stop
q = queue.Queue()     # shared work queue of 'title+chapter_url' strings
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Path where the chromedriver executable is installed
driver_path = r'E:\py\chromedriver\chromedriver.exe'
base_url = r'http://www.biquge.tv'
# Folder where the downloaded chapter .txt files are stored
txt_path = r'E://py//小說//'
# Total number of chapters (float so the progress division is non-integer)
Sum_Chapters = 0.0
# Name of the novel to search for (filled in by input() in __main__)
novel_name = str()
class scrapy_biquge():
    """Search http://www.biquge.tv/ for the novel named by the global
    ``novel_name`` and push every chapter as a ``'title+URL'`` string onto
    the global queue ``q``.

    ``get_url`` returns True on success and False when the search errors
    out or finds nothing.  NOTE(review): when the site redirects straight
    to a single result (URL without 'search.php?'), execution falls
    through and returns None — confirm this path against the original.
    """

    def get_url(self):
        # Launch a headless Chrome and submit the site's search form.
        driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
        driver.get('http://www.biquge.tv/')
        driver.find_element_by_id('wd').send_keys(novel_name)
        driver.find_element_by_id('sss').click()
        # The search opens a new tab: switch the window handle to it.
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        if '出現錯誤!' in driver.page_source:
            driver.close()
            driver.quit()
            print('輸入錯誤,請重新輸入')
            return False
        current_url = driver.current_url
        if 'search.php?' in current_url :
            # Result-list page: scrape candidate links/titles and authors.
            pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
            contents1 = re.findall(pattern1, driver.page_source)
            pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
            contents2 = re.findall(pattern2, driver.page_source)
            if len(contents2) and len(contents1):
                URLlist = []
                namelist = []
                authorlist = []
                for content in contents1:
                    URLlist.append(content[0])
                    namelist.append(content[1])
                # contents2 alternates title-cell / author-cell; keep every
                # second entry (the author).
                flag = False
                for content in contents2:
                    if flag == True:
                        authorlist.append(content)
                        flag = False
                    else:
                        flag = True
                print('小說網站搜索的結果如下:')
                print('\t'+'編號'+'\t\t'+'小說'+'\t\t'+'作者'+'\t')
                num = 1
                for name, author, in zip(namelist,authorlist):
                    print('\t'+str(num)+'\t\t'+name+'\t\t'+author+'\t')
                    num += 1
                step = int(input('請選擇所需的小說,輸入對應的編號:'))
                want_url = str(URLlist[step-1])
                # Open the chosen novel's index page and queue every chapter.
                driver.get(want_url)
                page_source = driver.page_source
                pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
                All_html = re.findall(pattern2, page_source)
                # The first 9 <dd> links are skipped — presumably the
                # "latest chapters" teaser box; verify against the site.
                for ones in All_html[9:]:
                    part_url = ones[0]
                    title = ones[1]
                    print(title + '+' + base_url+part_url)
                    q.put(title + '+' + base_url+part_url)
                driver.close()
                driver.quit()
                return True
        if '抱歉,搜索沒有結果^_^' in driver.page_source:
            driver.close()
            driver.quit()
            print('抱歉,搜索沒有結果,請重新輸入')
            return False
class myThread(threading.Thread):  # worker thread, subclasses threading.Thread
    """Worker that drains 'title+url' items from the shared queue ``q``
    and downloads each chapter via ``get_content``.

    Relies on module globals: ``q``, ``queueLock``, ``exitFlag``.
    """

    def __init__(self, threadID, name, counter):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # Thread body — executed as soon as start() is called.
        print(self.name + '啓動成功')
        while not exitFlag:
            queueLock.acquire()  # guard the shared queue
            if q.empty():
                print('數據全部結束')
                queueLock.release()  # nothing to do, let others check
            else:
                item = q.get()
                queueLock.release()  # release before the slow download
                parts = item.split('+')
                get_content(parts[0], parts[1])
def get_content(title, href):
    """Download one chapter page and save its text to a .txt file.

    Parameters
    ----------
    title : str
        Chapter title taken from the index page.  NOTE: it is overwritten
        below by the <h1> text of the chapter page itself.
    href : str
        Absolute URL of the chapter page.

    Side effects: writes ``<txt_path><first-token-of-title>.txt`` and
    prints a progress line based on the remaining queue size.
    """
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    try:
        driver.get(href)
        bs4 = BeautifulSoup(driver.page_source, 'lxml')
        title = bs4.h1.get_text()  # chapter name as rendered on the page
        # File name = first whitespace-separated token of the title.
        filename = txt_path + ''.join(title.split()[0]) + '.txt'
        content = bs4.find('div', id='content')
        content = content.get_text()
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("\r" + title + "\r\n")
            f.write(content)
        print('['+title+'] 成功下載,'+'現已下載總章節數的{:.2f}%'.format(((1.0 - q.qsize()/Sum_Chapters))*100))
    finally:
        # Always release the browser — the original leaked a headless
        # Chrome process whenever the page failed to load or parse.
        driver.close()
        driver.quit()
if __name__ == '__main__':
    # Keep prompting until a novel is found and its chapter URLs are queued.
    while True:
        try:
            novel_name = input('請輸入你想要搜索的小說名字:')
            if scrapy_biquge().get_url():
                break
        except KeyError:
            pass
    Sum_Chapters = q.qsize()
    threadList = []
    threadnum = int(input('請輸入所要開啓的爬蟲線程數量:'))
    start_time = time.time()
    for i in range(1, threadnum + 1, 1):
        threadList.append('Spider_Thread-' + str(i))
    queueLock = threading.Lock()  # protects concurrent access to the shared queue
    threads = []
    threadID = 1
    # Create the worker threads.
    for tName in threadList:
        thread = myThread(threadID, tName, q)
        thread.start()
        threads.append(thread)
        threadID += 1
    # Wait for the queue to drain; sleep so the main thread does not
    # busy-spin at 100% CPU (the original used `pass`).
    while not q.empty():
        time.sleep(0.1)
    # Signal the workers that it is time to exit.
    exitFlag = 1
    # Wait for every worker to finish.  (The original line had stray
    # backticks after join(), which was a syntax error.)
    for t in threads:
        t.join()
        print(t.name + '退出成功')
    end_time = time.time()
    print('本次爬取小說耗時爲' + str(round(end_time - start_time, 2)) + '秒')