A multithreaded Python novel scraper with built-in search

PyCharm + chromedriver + bs4 + re + threading + queue: a multithreaded novel scraper that drives the site's search through a simulated browser session

First install the selenium and BeautifulSoup libraries, and download the chromedriver build that matches your Chrome version.

You would normally run pip install <package-name> from cmd. I use the PyCharm interpreter, which lets you search for a library by name and install it from inside the IDE; I generally recommend this setup for writing Python scrapers.
The Chrome-to-chromedriver version mapping is easy to look up online. After downloading, put chromedriver in a directory of your choice, and remember to update the path in the code below.
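
Before diving in, it is worth a quick smoke test of the toolchain. The sketch below assumes the Selenium 3 style API used throughout this article (the executable_path keyword) and an example chromedriver path; swap in your own path.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# example path only; point this at your own chromedriver
driver = webdriver.Chrome(executable_path=r'E:\py\chromedriver\chromedriver.exe',
                          chrome_options=chrome_options)
driver.get('http://www.biquge.tv/')
print(driver.title)  # if this prints the site title, everything is wired up
driver.quit()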

The concrete steps are as follows:

① First, based on the novel name you enter, the script drives a browser to http://www.biquge.tv/ and performs the search. If there are several possible matches, it prints a selection table (if the search returns only one result, this selection step is skipped and the script goes straight to the next step).

pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)  # (url, title) pairs for each search result
contents1 = re.findall(pattern1, driver.page_source)
pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)  # all "odd" cells; the author sits in every second one
contents2 = re.findall(pattern2, driver.page_source)
if len(contents2) and len(contents1):
    URLlist = []
    namelist = []
    authorlist = []
    for content in contents1:
        URLlist.append(content[0])
        namelist.append(content[1])
    flag = False
    # the "odd" cells alternate, so keep every second match, which holds the author
    for content in contents2:
        if flag:
            authorlist.append(content)
            flag = False
        else:
            flag = True
    print('小說網站搜索的結果如下:')
    print('\t' + '編號' + '\t\t' + '小說' + '\t\t' + '作者' + '\t')
    num = 1
    for name, author, in zip(namelist, authorlist):
        print('\t' + str(num) + '\t\t' + name + '\t\t' + author + '\t')
        num += 1
    step = int(input('請選擇所需的小說,輸入對應的編號:'))
    want_url = str(URLlist[step - 1])

② Crawl the URLs of all the chapters of the novel to be downloaded and push them onto the queue one by one.
The page URL obtained in step ① is loaded, and each chapter link is stored in the queue:

driver.get(want_url)
page_source = driver.page_source
pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
All_html = re.findall(pattern2, page_source)

# the first 9 <dd> links are "latest chapters" shortcuts, not the start of the story, so skip them
for ones in All_html[9:]:
    part_url = ones[0]
    title = ones[1]
    print(title + '+' + base_url + part_url)
    q.put(title + '+' + base_url + part_url)
driver.close()
driver.quit()
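
A note on the queue format: each item is the chapter title and its URL joined with a '+', which the worker threads later split on. A title that itself contains '+' would break that parsing, so a slightly more robust variant (a sketch, not the code used here) is to queue (title, url) tuples and skip the string surgery entirely:

# sketch: queue tuples instead of delimiter-joined strings
q.put((title, base_url + part_url))

# ...and in the worker thread:
title, href = q.get()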

③ The script first prints each chapter title with its URL, which lets you check that the right novel was found, and then asks for the number of threads to start. The useful count depends largely on your CPU; 20 to 40 is generally enough.
Below is the thread creation code, followed by the final shutdown and joining of the threads:

threadnum = int(input('請輸入所要開啓的爬蟲線程數量:'))
start_time = time.time()
for i in range(1, threadnum + 1, 1):
    threadList.append('Spider_Thread-' + str(i))
queueLock = threading.Lock()  # lock to avoid errors when multiple threads touch the same data at once
threads = []
threadID = 1
# create the new threads
for tName in threadList:
    thread = myThread(threadID, tName, q)
    thread.start()
    threads.append(thread)
    threadID += 1
# wait for the queue to empty
while not q.empty():
    pass
# tell the threads it's time to exit
exitFlag = 1
# wait for all threads to finish
for t in threads:
    t.join()
    print(t.name + '退出成功')
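
One caveat about the shutdown above: while not q.empty(): pass is a busy-wait that keeps a CPU core spinning until the workers drain the queue. A gentler pattern, sketched below under the assumption that each worker calls q.task_done() after finishing an item, lets the queue itself signal completion:

# in the worker, after get_content(title, href) succeeds:
#     q.task_done()
# the main thread can then block instead of spinning:
q.join()       # returns once every queued item has been marked done
exitFlag = 1   # now tell the workers to exit their loops
for t in threads:
    t.join()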

④ After the thread count is chosen and the threads start successfully, the chapters are crawled into the designated folder.
Beyond that, the most important piece is the threading.Thread subclass below; as a rule, the main work goes in the overridden run(self) method.

class myThread(threading.Thread):  # subclass of threading.Thread

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the work here; run() executes once the thread is started
        print(self.name + '啓動成功')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('數據全部結束')
                queueLock.release()  # release the lock
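
Note that once the queue drains, every thread loops through the else branch and prints that message over and over until the main thread sets exitFlag. A quieter variant of run() (a sketch using the same globals) blocks on q.get with a timeout instead of polling; queue.Queue is already thread-safe, so the explicit lock is not required around get:

    def run(self):
        print(self.name + '啓動成功')
        while not exitFlag:
            try:
                item = q.get(timeout=1)  # wait up to 1s instead of spinning
            except queue.Empty:
                continue  # queue still empty; re-check exitFlag and wait again
            title, href = item.split('+', 1)
            get_content(title, href)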

You also need to remember to create the thread lock to prevent conflicts: define it once, and use it inside run() as shown above.

queueLock = threading.Lock()  # lock to avoid errors when multiple threads touch the same data at once
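
Since a forgotten release() would deadlock every worker, it is often safer to let a with statement manage the lock. A minimal sketch:

with queueLock:  # acquired on entry, released on exit, even if an exception is raised
    if not q.empty():
        item = q.get()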

That's it. The complete code follows (you only need to change the chromedriver path and the folder for saving the novel, and it will run):

import queue
import threading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup
import time

# multithreaded scraper for Biquge (biquge.tv) novels; lets you search manually for the novel you want
#@author Himit_ZH
#qq:372347736

exitFlag = 0
q = queue.Queue()
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# path to chromedriver
driver_path = r'E:\py\chromedriver\chromedriver.exe'

base_url = r'http://www.biquge.tv'

# folder where the novel is stored
txt_path = r'E://py//小說//'

# total number of chapters in the novel
Sum_Chapters = 0.0

# name of the novel to search for
novel_name = str()

class scrapy_biquge():

    def get_url(self):

        driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
        driver.get('http://www.biquge.tv/')
        driver.find_element_by_id('wd').send_keys(novel_name)
        driver.find_element_by_id('sss').click()
        # switch the window handle to the newly opened results tab
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        if '出現錯誤!' in driver.page_source:
            driver.close()
            driver.quit()
            print('輸入錯誤,請重新輸入')
            return False
        current_url = driver.current_url
        if 'search.php?' in current_url :
            pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)  # (url, title) pairs for each search result
            contents1 = re.findall(pattern1, driver.page_source)
            pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)  # all "odd" cells; the author sits in every second one
            contents2 = re.findall(pattern2, driver.page_source)
            if len(contents2) and len(contents1):
                URLlist = []
                namelist = []
                authorlist = []
                for content in contents1:
                    URLlist.append(content[0])
                    namelist.append(content[1])
                flag = False
                # the "odd" cells alternate, so keep every second match, which holds the author
                for content in contents2:
                    if flag:
                        authorlist.append(content)
                        flag = False
                    else:
                        flag = True
                print('小說網站搜索的結果如下:')
                print('\t'+'編號'+'\t\t'+'小說'+'\t\t'+'作者'+'\t')
                num = 1
                for name, author, in zip(namelist,authorlist):
                    print('\t'+str(num)+'\t\t'+name+'\t\t'+author+'\t')
                    num += 1
                step = int(input('請選擇所需的小說,輸入對應的編號:'))
                want_url = str(URLlist[step-1])
                driver.get(want_url)
                page_source = driver.page_source
                pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
                All_html = re.findall(pattern2, page_source)

                # the first 9 <dd> links are "latest chapters" shortcuts, not the start of the story, so skip them
                for ones in All_html[9:]:
                    part_url = ones[0]
                    title = ones[1]
                    print(title + '+' + base_url+part_url)
                    q.put(title + '+' + base_url+part_url)
                driver.close()
                driver.quit()
                return True

        if '抱歉,搜索沒有結果^_^' in driver.page_source:
            driver.close()
            driver.quit()
            print('抱歉,搜索沒有結果,請重新輸入')
            return False

class myThread(threading.Thread):  # subclass of threading.Thread

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the work here; run() executes once the thread is started
        print(self.name + '啓動成功')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('數據全部結束')
                queueLock.release()  # release the lock

def get_content(title, href):
    # fetch one chapter page and save it as its own .txt file
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    driver.get(href)
    bs4 = BeautifulSoup(driver.page_source, 'lxml')
    title = bs4.h1.get_text()  # chapter title
    filename = txt_path + title.split()[0] + '.txt'  # first token of the title becomes the filename
    content = bs4.find('div', id='content')
    content = content.get_text()
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("\r" + title + "\r\n")
        f.write(content)
    print('[' + title + ']  成功下載,' + '現已下載總章節數的{:.2f}%'.format((1.0 - q.qsize() / Sum_Chapters) * 100))
    driver.close()
    driver.quit()


if __name__ == '__main__':
    # once every URL is in the queue, start the threads
    while True:
        try:
            novel_name = input('請輸入你想要搜索的小說名字:')
            if scrapy_biquge().get_url():
                break
        except KeyError:
            pass
    Sum_Chapters = q.qsize()
    threadList = []
    threadnum = int(input('請輸入所要開啓的爬蟲線程數量:'))
    start_time = time.time()
    for i in range(1, threadnum+1, 1):
        threadList.append('Spider_Thread-'+str(i))
    queueLock = threading.Lock()  # lock to avoid errors when multiple threads touch the same data at once
    threads = []
    threadID = 1
    # create the new threads
    for tName in threadList:
        thread = myThread(threadID, tName, q)
        thread.start()
        threads.append(thread)
        threadID += 1
    # wait for the queue to empty
    while not q.empty():
        pass
    # tell the threads it's time to exit
    exitFlag = 1
    # wait for all threads to finish
    for t in threads:
        t.join()
        print(t.name+'退出成功')
    end_time = time.time()
    print('本次爬取小說耗時爲'+str(round(end_time-start_time, 2))+'秒')
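
One final caveat: the code above targets the Selenium 3 API (executable_path, chrome_options, find_element_by_id), all of which were removed in Selenium 4. If you are on Selenium 4, the equivalent calls look roughly like this sketch:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
driver.find_element(By.ID, 'wd').send_keys(novel_name)
driver.find_element(By.ID, 'sss').click()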