Python3: multithreaded crawling of xici proxy IPs with requests, plus validation

Environment: Python 3

Libraries used
requests, lxml, threading, queue, time
Libraries that need to be installed separately
requests, lxml
Install command
pip install requests lxml

Starting the crawler

Workflow:
1. Build the URL list
2. Fetch the response page for each URL
3. Extract the useful data from the page
4. Save the data
5. Clean the data
Here is the code:
import requests
import threading
from queue import Queue
from lxml import etree
import time


class proxySpider():
    def __init__(self):
        self.url_temp = "https://www.xicidaili.com/nn/"
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.url_queue = Queue()
        self.page_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        for i in range(1, 6):
            self.url_queue.put(self.url_temp + str(i))
            # print(self.url_temp + str(i))
            # print("empty" if self.url_queue.empty() else "! enpty")

    def parse_url(self):
        while True:
            if not self.url_queue.empty():
                url = self.url_queue.get()
                resp = requests.get(url, headers=self.headers)
                # print(resp.status_code)
                if resp.status_code == 200:
                    # print(resp.status_code)
                    self.page_queue.put(resp.content)
                self.url_queue.task_done()

    def get_content_list(self):
        while True:
            if not self.page_queue.empty():
                page = self.page_queue.get()
                html = etree.HTML(page.decode("utf-8"))
                proxy_list = []
                for each in html.xpath("//tr[@class='odd']"):
                    ip = each.xpath("./td[2]/text()")
                    port = each.xpath("./td[3]/text()")
                    if ip and port:
                        proxy = ip[0] + ":" + port[0]  # join as "ip:port"
                        proxy_list.append(proxy)
                self.content_queue.put(proxy_list)
                self.page_queue.task_done()

    def save(self):
        while True:
            if not self.content_queue.empty():
                print("Writing...")
                with open("./res2.txt", "a", encoding="utf-8") as f:
                    content = self.content_queue.get()
                    for each in content:
                        f.write(each + "\n")
                self.content_queue.task_done()

    def run(self):
        thread_list = []
        # 1. Build the URL list
        self.get_url_list()
        # 2. Fetch the response page for each URL
        # for i in range(3):
        #     t_paser = threading.Thread(target=self.parse_url)
        #     thread_list.append(t_paser)
        # self.parse_url()
        thread_list.extend([threading.Thread(target=self.parse_url) for i in range(3)])
        # 3. Extract the content
        thread_list.extend([threading.Thread(target=self.get_content_list) for i in range(3)])
        # 4. Save the data
        thread_list.extend([threading.Thread(target=self.save) for i in range(2)])
        for each in thread_list:
            each.daemon = True  # setDaemon() is deprecated; daemon threads exit with the main thread
            each.start()
        for q in [self.url_queue, self.page_queue, self.content_queue]:
            q.join()


if __name__ == "__main__":
    start_time = time.time()
    spider = proxySpider()
    spider.run()
    end_time = time.time()
    cost_time = end_time - start_time
    print("時間開銷: {}".format(cost_time))

The proxy IP validity checker class

class checkIp():
    def __init__(self):
        self.url = 'http://www.baidu.com'
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.ip_queue = Queue()
        self.ip_valid_queue = Queue()

    def get_ip(self):
        # read the crawled proxies, one "ip:port" per line
        with open("./res2.txt", "r", encoding="utf-8") as f:
            for line in f:
                ip_temp = line.strip()
                # print('ip:{}'.format(ip_temp))
                if ip_temp and not self.ip_queue.full():
                    self.ip_queue.put(ip_temp)

    def check_ip(self):
        url = self.url
        # pass
        # print(self.ip_queue.qsize())
        while True:
            ip_temp = self.ip_queue.get()
            # print(ip_temp)
            proxy_ip = {"https": "https://" + ip_temp, "http": "http://" + ip_temp}
            # print(proxy_ip)
            try:
                resp = requests.get(url, headers=self.headers, proxies=proxy_ip, verify=False, timeout=2)
                if resp.status_code == 200:
                    self.ip_valid_queue.put(ip_temp)
                    # print(ip_temp)
            except Exception:
                pass
                # print('discarding {}'.format(ip_temp))
            self.ip_queue.task_done()

    def save(self):
        while True:
            if not self.ip_valid_queue.empty():
                # append validated proxies back to the same file
                with open("./res2.txt", "a", encoding="utf-8") as f:
                    ip = self.ip_valid_queue.get()
                    f.write(ip + '\n')
                self.ip_valid_queue.task_done()

    def run(self):
        thread_list = []
        # 1. Read the proxy IPs from file
        # Do not raise this thread count: concurrent reads of the source file are not handled, so a single thread is used
        thread_list.extend([threading.Thread(target=self.get_ip) for i in range(1)])
        # 2. Validate the proxy IPs
        thread_list.extend([threading.Thread(target=self.check_ip) for i in range(3)])
        # 3. Save the valid proxy IPs
        thread_list.extend([threading.Thread(target=self.save) for i in range(4)])
        for each in thread_list:
            # print(type(each))
            each.daemon = True  # run as a daemon thread
            each.start()
        for i in [self.ip_queue, self.ip_valid_queue]:
            i.join()
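
The post does not show a driver for checkIp; a minimal sketch, assuming it is launched the same way as proxySpider:

if __name__ == "__main__":
    start_time = time.time()
    checker = checkIp()       # the proxy validator defined above
    checker.run()
    print("Time cost: {}".format(time.time() - start_time))

Note that, unlike proxySpider, get_ip here runs inside a worker thread, so in principle ip_queue.join() can return before the file has even been read; calling self.get_ip() synchronously at the top of run() (as proxySpider does with get_url_list) would remove that race.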

Problems encountered and takeaways:

  1. While validating proxies I hit the error: requests.exceptions.ProxyError: HTTPConnectionPool(host='113.65.5.186', port…
    Cause: a system-wide proxy was enabled on my machine.
    Fix: turn the proxy off in the OS network settings, then rerun the proxy IP validity checker (or have requests ignore it, as in the first sketch below).
  2. Open issue: the queues should be given an upper bound, especially in larger projects. After setting a maximum size I could not resolve the bugs that appeared, so I gave up on limiting memory use; that is acceptable only when the input is a small text file. A bounded-queue sketch follows below.
    Finally, suggestions from readers are welcome.
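
For problem 1, instead of changing the OS settings, requests can also be told to ignore the environment/system proxy configuration via Session.trust_env; whether that avoids the exact ProxyError above depends on the local setup. A minimal sketch, not from the original post; the ip:port used here is hypothetical:

import requests

session = requests.Session()
session.trust_env = False  # ignore HTTP(S)_PROXY variables and OS-level proxy settings

proxy = "113.65.5.186:8080"  # hypothetical ip:port, for illustration only
try:
    resp = session.get("http://www.baidu.com",
                       proxies={"http": "http://" + proxy, "https": "https://" + proxy},
                       timeout=2)
    print(resp.status_code)
except requests.exceptions.RequestException as err:
    print("proxy rejected: {}".format(err))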
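
For problem 2, queue.Queue already supports an upper bound: Queue(maxsize=N) makes put() block once N items are waiting, and a plain blocking get() removes the need for the empty()/full() polling used above. A minimal sketch, independent of the crawler:

import threading
from queue import Queue

bounded_q = Queue(maxsize=5)  # put() blocks while 5 items are already waiting

def consumer():
    while True:
        item = bounded_q.get()   # blocks until an item is available
        print("consumed", item)
        bounded_q.task_done()    # mark this item as processed

threading.Thread(target=consumer, daemon=True).start()

for i in range(20):
    bounded_q.put(i)             # the producer is paced by the consumer
bounded_q.join()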