Multithreaded Crawl of Douban Movie Top 250

I previously wrote a post about a multithreaded crawler that walks through the analysis in detail, so I will not repeat that analysis here. If you are new to web crawling, you can refer to my earlier post:
https://blog.csdn.net/weixin_40481076/article/details/101312325
Target site for this crawl: https://movie.douban.com/top250?
Data to scrape: each movie's rank, title, and poster image URL from the Top 250 pages. Each poster is downloaded and saved under a filename made of the rank and the title joined by '---'.
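The list is paginated through the start query parameter, 25 movies per page, which is why the script below steps through page offsets in increments of 25. A minimal sketch of the page URLs the two threads end up requesting (the variable names here are just for illustration):

base = 'https://movie.douban.com/top250?start='
# thread A requests start = 0, 25, 50, 75, 100
# thread B requests start = 125, 150, 175, 200, 225
for start in range(0, 250, 25):
    print(base + str(start))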
Code
Multithreading is implemented by subclassing threading.Thread to define a new worker class; the crawler's main logic lives in the overridden run() method.

import os
import threading
import urllib.request

import requests
from lxml import etree


'''
    Multithreaded crawler for Douban Movie Top 250
'''

class ConsumerThread(threading.Thread):
    def __init__(self, startUrl, headers, startNum, endNum, path, tname):
        threading.Thread.__init__(self)
        self.startUrl = startUrl
        self.headers = headers
        self.startNum = startNum
        self.endNum = endNum
        self.path = path
        self.tname = tname

    def run(self):
        # Each thread walks its own range of page offsets; Douban lists 25 movies per page.
        for page in range(self.startNum, self.endNum + 25, 25):
            res = request_page(self.startUrl + str(page), self.headers)
            if res is None:
                continue
            try:
                res = etree.HTML(res)
                for div in res.xpath("//div[@class='item']"):
                    try:
                        num = div.xpath('./div[1]/em[1]/text()')[0]
                        name = div.xpath('./div[2]/div[1]/a[1]/span[1]/text()')[0]
                        imageUrl = div.xpath('./div[1]/a[1]/img[1]/@src')[0]
                        print(name)
                        print(imageUrl)
                        print('Thread ' + self.tname + ' is downloading the image')
                        print('')
                        download_pic(imageUrl, str(num) + '---' + name, self.path)
                    except Exception as e:
                        print(str(e))
                        continue
            except Exception as e:
                print(str(e))
                continue


def request_page(startUrl, headers):
    try:
        res = requests.get(startUrl, headers=headers)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            return res.text
        return None
    except requests.RequestException:
        return None

def get_headers():
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'Accept-Encoding': 'gzip, deflate, br',  # asking for br here led to garbled page text
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/top250?start=0&filter=',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    return headers


def download_pic(url, name, path):
    # Create the target folder on first use; exist_ok avoids errors if it already exists.
    os.makedirs(path, exist_ok=True)
    try:
        res = urllib.request.urlopen(url, timeout=5).read()
        with open(os.path.join(path, name + '.jpg'), 'wb') as file:
            file.write(res)
    except Exception as e:
        print(str(e))


if __name__ == '__main__':
    url = 'https://movie.douban.com/top250?start='
    header = get_headers()
    # Thread A covers start=0..100 (ranks 1-125), thread B covers start=125..225 (ranks 126-250).
    thread1 = ConsumerThread(url, header, 0, 100, 'd:/download/豆瓣電影top250AA/', 'A')
    thread2 = ConsumerThread(url, header, 125, 225, 'd:/download/豆瓣電影top250BB/', 'B')
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    print('*' * 10 + 'Download finished!' + '*' * 10)

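For comparison, the same split of page ranges could also be driven by concurrent.futures.ThreadPoolExecutor instead of a hand-written Thread subclass. This is only a sketch that reuses ConsumerThread, get_headers() and the download folders from the script above; crawl_range is a hypothetical helper, not part of the original code:

from concurrent.futures import ThreadPoolExecutor

url = 'https://movie.douban.com/top250?start='
header = get_headers()

def crawl_range(startNum, endNum, path, tname):
    # run() is called directly here; the pool supplies the worker thread.
    ConsumerThread(url, header, startNum, endNum, path, tname).run()

with ThreadPoolExecutor(max_workers=2) as pool:
    pool.submit(crawl_range, 0, 100, 'd:/download/豆瓣電影top250AA/', 'A')
    pool.submit(crawl_range, 125, 225, 'd:/download/豆瓣電影top250BB/', 'B')

print('*' * 10 + 'Download finished!' + '*' * 10)

Leaving the with block waits for both submitted tasks to finish, which plays the same role as the two join() calls in the original script.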
