Python multithreaded crawler (JD.com iPhone)
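This post walks through a small producer-consumer crawler for JD.com's iPhone search results. A request queue holds the paginated search URLs; a pool of Crawl threads fetches each page and pushes the raw HTML onto a data queue; a pool of Parse threads drains that queue, extracts the product fields with XPath, and writes each item both to a JSON-lines file and to MongoDB.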

# multithreading support
import threading
# thread-safe FIFO queues
import queue
import requests
from lxml import etree
import time
import random
import json
import pymongo

# number of crawl (fetch) threads
concurrent = 3
# number of parse threads
conparse = 3


class Crawl(threading.Thread):
    """
    主要作用根據特定URL獲取響應信息
    """

    def __init__(self, number, req_list, data_list):
        # call the Thread parent-class constructor
        super(Crawl, self).__init__()
        # initialize instance attributes
        self.number = number
        self.req_list = req_list
        self.data_list = data_list
        self.headers = {
            "authority": "search.jd.com",
            "method": "GET",
            "path": "/s_new.php?keyword=iphone&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0",
            "scheme": "https",
            "cookie": "shshshfpa=817b0820-cebe-1aa5-5eaa-8f8d3b5945b3-1564123066; shshshfpb=nyq8Po%20NsVQIn1ih2zomJlw%3D%3D; TrackID=167xINX9YcKrGWU-JQ0EQWhwYstFi3gNOxFq6Em4_l6J6OECY5-pwyzHxRFr6TTZkLHI1m_3orstgEzPhWGk1pkbfG_ASOMSSscDY_oEz4XQ; pinId=DRR64H7p6D2CxuR9knABB7V9-x-f3wj7; qrsc=3; __jdu=502413746; areaId=2; PCSYCityID=CN_310000_310100_310112; xtest=3925.cf6b6759; ipLoc-djd=2-2825-51931-0; rkv=V0600; user-key=88fa50f5-ec38-48ff-9efe-beecaa5ffc96; cn=0; unpl=V2_ZzNtbRdfQEF8DRMDeR9ZDGIHFAhKUhcRd1tBVnNLXAcwB0FbclRCFX0UR1xnGlgUZwMZWEpcRxVFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8YXwBnARRYRWdzEkU4dlB8G1oEVwIiXHIVF0l1CkJRfxkRAWYAF11AUUYSRQl2Vw%3d%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_d92e84df3748457d94e53b639c13f5b7|1569383391707; __jda=122270672.502413746.1564123065.1569380256.1569383392.14; __jdc=122270672; shshshfp=f5c7274df7eb1773b86a72faa494fed4; 3AB9D23F7A4B3C9B=SHZYIPK2KZYMKRXEKXFSLWSYQOVLA745EZ4NXQTNMFLQMVFRJKZT7VYDEIY6L2USE2KDEOHO2IPNACME4W7GJ2LKTM",
            "referer": "https://search.jd.com/Search?keyword=iphone&enc=utf-8&suggest=1.his.0.0&wq=&pvid=ec4030074500424391b371d06a8a62fd",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }

    # run() is invoked when the thread is started via start()
    def run(self):
        print("Starting crawl thread #{}".format(self.number))
        # keep looping while the request queue still holds URLs
        while self.req_list.qsize() > 0:
            # another thread may drain the queue between the qsize()
            # check and get(), so use a timeout instead of blocking forever
            try:
                url = self.req_list.get(timeout=3)
            except queue.Empty:
                break
            print("Thread #{} fetching: {}".format(self.number, url))
            # sleep a random interval to avoid requesting too fast
            time.sleep(random.randint(1, 3))
            # issue the HTTP request; on success, push the response body
            # onto the data queue for the parse threads
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                self.data_list.put(response.text)


class Parse(threading.Thread):
    """
    將響應信息進行數據解析
    """

    def __init__(self, number, data_list, req_thread, f):
        super(Parse, self).__init__()
        # thread number
        self.number = number
        # data queue
        self.data_list = data_list
        # list of crawl threads, used to decide when fetching is finished
        self.req_list = req_thread
        # output file object
        self.f = f
        # whether to keep pulling items from the data queue
        self.is_parse = True

    def run(self):
        print("Starting parse thread #{}".format(self.number))
        # loop until every crawl thread has finished and the data queue
        # has been drained
        while True:
            # termination check: is any crawl thread still alive?
            for t in self.req_list:
                if t.is_alive():
                    # a crawl thread is still running, so more responses
                    # may arrive; keep parsing
                    break
            else:  # no break: every crawl thread has finished
                if self.data_list.qsize() == 0:
                    # nothing left to parse
                    self.is_parse = False
            # decide whether to keep parsing
            if self.is_parse:
                try:
                    # pull one response from the data queue
                    data = self.data_list.get(timeout=3)
                except queue.Empty:  # queue stayed empty past the timeout
                    data = None
                # if a response was fetched, parse it
                if data is not None:
                    self.parse(data)
            else:
                # exit the while loop
                break
        print("Exiting parse thread #{}".format(self.number))

    def parse(self, data):
        html = etree.HTML(data)
        # get every product li element
        lis = html.xpath('//li[contains(@class,"gl-item")]')
        for li in lis:
            href = li.xpath('div/div[@class="p-img"]/a/@href')
            title = li.xpath('div/div[@class="p-img"]/a/@title')
            price = li.xpath('div/div[@class="p-price"]/strong/i/text()')
            name = li.xpath('div/div[@class="p-name p-name-type-2"]/a/em/text()')
            commit = li.xpath('div/div[@class="p-commit"]//text()')
            shop = li.xpath('div/div[@class="p-shop"]/span/a/text()')
            icons = li.xpath('div/div[@class="p-icons"]/i/text()')
            stock = li.xpath('div/div[@class="p-stock"]/text()')
            # skip entries missing the fields indexed by position below
            if not href or not title:
                continue
            item = {
                "href": "https:" + href[0],
                "title": str(title[0]).replace("\r\n\t", ""),
                "price": " ".join(price),
                "name": str(" ".join(name)).replace("\r\n\t", "").strip(),
                "commit": str(" ".join(commit)).replace("\r\n\t", "").strip(),
                "shop": " ".join(shop),
                "icons": " ".join(icons),
                "stock": " ".join(stock)
            }
            self.f.write(json.dumps(item, ensure_ascii=False) + "\n")
            self.save_to_mongo(item)

    def save_to_mongo(self, result):
        # note: this opens a new MongoClient per item, which is wasteful;
        # see the sketch after the script for a shared-client variant
        MONGO_URL = "localhost"
        MONGO_DB = "JingDong"
        MONGO_COLLECTION = "multithreading_iphone"
        client = pymongo.MongoClient(MONGO_URL)
        db = client[MONGO_DB]
        try:
            db[MONGO_COLLECTION].insert_one(result)
        except Exception:
            print("Failed to save item to MongoDB")


def main():
    # create the request queue
    req_list = queue.Queue()
    # create the data queue; crawl threads push response bodies here
    data_list = queue.Queue()
    # open the output file (JSON lines)
    f = open(r"G:\個人總結\csv\JD_Iphone.json", "w", encoding="utf-8")
    # build the request URLs: JD renders 30 items per results page and
    # lazy-loads 30 more, so odd page numbers use the plain URL while
    # even ones add scrolling=y and a log_id timestamp
    ms = time.time()
    for i in range(100):
        if (i + 1) % 2 != 0:
            base_url = "https://search.jd.com/s_new.php?keyword=iphone&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0&page={0}&s={1}&click=0".format(
                i + 1, i * 30 + 1)
        else:
            base_url = "https://search.jd.com/s_new.php?keyword=iphone&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.his.0.0&page={0}&s={1}&scrolling=y&log_id={2}".format(
                i + 1, i * 30 + 1, '%.5f' % ms
            )
        req_list.put(base_url)
    # start N crawl threads
    req_thread = []
    for i in range(concurrent):
        t = Crawl(i + 1, req_list, data_list)
        t.start()
        req_thread.append(t)
    # start N parse threads
    parse_thread = []
    for i in range(conparse):
        t = Parse(i + 1, data_list, req_thread, f)
        t.start()
        parse_thread.append(t)
    # wait for all crawl threads, then all parse threads, to finish
    for t in req_thread:
        t.join()
    for t in parse_thread:
        t.join()
    f.close()


if __name__ == "__main__":
    main()
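The script depends on requests, lxml, and pymongo, and expects a MongoDB server listening on localhost. The cookie in the request headers and the output path opened in main() are specific to the original author's session and machine, so both need adjusting before a run.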

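As noted in save_to_mongo, the script opens a fresh MongoClient for every inserted item. Since pymongo's MongoClient is thread-safe and maintains its own connection pool, a single shared client is the more idiomatic choice. Below is a minimal sketch of that change; the extra collection parameter on Parse and the helper names are assumptions for illustration, not part of the original script.

import pymongo

# a minimal sketch, assuming one client created in main() and shared by
# all parse threads (the `collection` parameter is hypothetical)
def make_collection():
    client = pymongo.MongoClient("localhost")
    return client["JingDong"]["multithreading_iphone"]

# in main():
#     collection = make_collection()
#     t = Parse(i + 1, data_list, req_thread, f, collection)
# in Parse.__init__:
#     self.collection = collection
# save_to_mongo then shrinks to a thin wrapper around insert_one:
def save_to_mongo(collection, result):
    try:
        collection.insert_one(result)
    except Exception:
        print("Failed to save item to MongoDB")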
 
