Python Crawler Practice 3 - Multithreaded Crawlers

Preface:

In Practice 2 I crawled the Dangdang and Douban sites, but fetching everything one request at a time was slow, so this time the goal is to speed things up with multiple threads.

Exercises:

Basic exercise

Implementation code

import threading
import time

# Subclass threading.Thread to create a custom thread class
class MyThread(threading.Thread):
    def __init__(self, threadID, name, delay):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.delay = delay

    def run(self):
        print("Starting thread: " + self.name)
        moyu_time(self.name, self.delay, 10)
        print("Exiting thread: " + self.name)

def moyu_time(threadName, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off at %s" % (threadName, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1


# Create new threads
# Xiaoming rests one second between prints
# Xiaohong rests two seconds between prints
thread1 = MyThread(1, "Xiaoming", 1)
thread2 = MyThread(2, "Xiaohong", 2)

# Start the new threads
thread1.start()
thread2.start()

# Wait for the threads to terminate
thread1.join()
thread2.join()

print("Exiting main thread")

Result:
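
As an aside, subclassing isn't required for simple cases: threading.Thread accepts a target callable directly. A minimal equivalent sketch, reusing the moyu_time defined above:

import threading

# Same effect without a subclass: pass the callable and its arguments to Thread
thread1 = threading.Thread(target=moyu_time, args=("Xiaoming", 1, 10))
thread2 = threading.Thread(target=moyu_time, args=("Xiaohong", 2, 10))

thread1.start()
thread2.start()
thread1.join()
thread2.join()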

Thread pool exercise 1:

Spinning up threads like crazy is very wasteful, because each thread gets created and then destroyed again. Better to throw them into a pool and let them soak together: a thread pool.

You can use ThreadPoolExecutor to implement a thread pool, so threads are no longer created and destroyed over and over.

Implementation code:

import time
from concurrent.futures import ThreadPoolExecutor


def moyu_time(threadName, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off at %s" % (threadName, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1


if __name__ == '__main__':
    # A pool that reuses up to 29 worker threads
    pool = ThreadPoolExecutor(29)
    for i in range(1, 5):
        # Pass the function and its arguments separately; writing
        # pool.submit(moyu_time(...)) would call the function immediately
        # in the main thread and submit only its None return value
        pool.submit(moyu_time, 'YDLin' + str(i), 1, 3)
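
One thing submit gives you beyond fire-and-forget is a concurrent.futures.Future per task, which matters once workers return values. A minimal sketch (square is a toy task made up for illustration):

from concurrent.futures import ThreadPoolExecutor, as_completed

def square(x):
    return x * x

with ThreadPoolExecutor(max_workers=4) as pool:
    # submit returns a Future; as_completed yields each one as it finishes
    futures = [pool.submit(square, i) for i in range(5)]
    for future in as_completed(futures):
        print(future.result())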

 

Building a thread pool with a queue

import threading
import time
from queue import Queue

class CustomThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.__queue = queue

    def run(self):
        while True:
            # Block until a task appears in the queue, then execute it
            q_method = self.__queue.get()
            q_method()
            self.__queue.task_done()

def moyu():
    print("Starts slacking off at %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

def queue_pool():
    queue = Queue(5)
    for i in range(queue.maxsize):
        t = CustomThread(queue)
        t.daemon = True  # daemon threads are killed when the main thread exits
        t.start()

    for i in range(20):
        queue.put(moyu)
    queue.join()  # block until every queued task has been marked done

if __name__ == '__main__':
    queue_pool()
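
The daemon flag above means the workers are killed abruptly as soon as queue.join() returns and the main thread exits. If you would rather let each worker finish its loop cleanly, a common alternative is a sentinel value; here is a sketch (STOP is a name invented for this example):

import threading
from queue import Queue

STOP = object()  # sentinel object telling a worker to exit

def worker(q):
    while True:
        job = q.get()
        if job is STOP:
            q.task_done()
            break
        job()
        q.task_done()

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()
for _ in range(6):
    q.put(lambda: print("working"))
for _ in range(3):
    q.put(STOP)  # one sentinel per worker
q.join()
for t in threads:
    t.join()  # every worker has broken out of its loop by now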

Hands-on:

An improvement on the earlier Douban crawl:
fanning the pages out to a pool of workers (a process pool here, via multiprocessing) gets the whole job done in a few minutes.

import json
import multiprocessing

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import re
import time
import xlwt

def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
            }

        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_movies(soup):
    # Collect one row per movie and return the rows to the parent process;
    # each worker process gets its own copy of module globals, so writing to
    # a shared xlwt workbook from inside the workers would silently do nothing
    rows = []
    movie_list = soup.find(class_='grid_view').find_all('li')

    for item in movie_list:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find(class_='').string
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # Some movies have no one-line blurb; fall back to an empty string
        # instead of leaving item_intr undefined
        item_intr = item.find(class_='inq').string if item.find(class_='inq') else ''
        print('Crawled movie: ' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)
        rows.append((item_name, item_img, item_index, item_score, item_author, item_intr))
    return rows

def parse_one_page(html):
    pattern = re.compile(
        '<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>.*? <span class="rating_num" property="v:average">(.*?)</span>.*?<span class="inq">(.*?)</span>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {'index': item[0],
               'title': item[1],
               'score': item[2],
               'comment': item[3]
               }


def write_to_file(content):
    with open('douban250.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(url):
    html = get_one_page(url)
    soup = BeautifulSoup(html, 'lxml')
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
    return parse_movies(soup)


if __name__ == '__main__':
    start = time.time()
    urls = []
    # Create a process pool sized to the number of CPU cores
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for i in range(0, 10):
        url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        urls.append(url)
    # map runs main over the urls in parallel and gathers each page's rows,
    # in the same order as the urls
    results = pool.map(main, urls)
    pool.close()
    pool.join()  # let every pool process finish before writing the workbook

    # Write the Excel file in the parent process; note that xlwt only
    # produces the legacy .xls format, not .xlsx
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('Douban Movie Top250', cell_overwrite_ok=True)
    for col, header in enumerate(['Title', 'Image', 'Rank', 'Score', 'Author', 'Intro']):
        sheet.write(0, col, header)
    n = 1
    for rows in results:
        for row in rows:
            for col, value in enumerate(row):
                sheet.write(n, col, value)
            n += 1
    book.save('douban_top250_movies.xls')
    print('Done in %.2f seconds' % (time.time() - start))
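
Since this series is about multithreaded crawling, note that the same fan-out works with a thread pool, and it suits this I/O-bound job well: each thread spends most of its time waiting on the network, so the GIL is not a bottleneck. A sketch reusing the main(url) defined above:

from concurrent.futures import ThreadPoolExecutor

# Drop-in alternative to multiprocessing.Pool for I/O-bound crawling;
# assumes get_one_page / main from the script above are in scope
if __name__ == '__main__':
    urls = ['https://movie.douban.com/top250?start=%d&filter=' % (i * 25)
            for i in range(10)]
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(main, urls))
    # results holds the same per-page rows as before, ready for the Excel loop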

 
