Python3爬取西刺代理前2頁國內高匿代理IP並驗證有效性;若西刺代理獲取失敗,則改用快代理獲取IP。獲取到的IP會存入Excel表格中。

代碼中導入的ExcelUtil類(from process_excel import ExcelUtil)的實現見:https://blog.csdn.net/z564359805/article/details/88874879

#!/usr/bin/env python
# coding=utf-8

# 爬取西刺代理前2頁國內高匿代理IP並驗證有效性
# 西刺代理若獲取失敗,使用快代理獲取IP
# https://www.xicidaili.com/nn/1
import ast
import os
import random
import time
import urllib.request

import requests
from lxml import etree
from openpyxl import Workbook

from process_excel import ExcelUtil


# 獲取
class GetProxy(object):
    """Collect free proxies, store them in an Excel sheet and verify them.

    Pages 1-2 of the xicidaili.com high-anonymity list are scraped; when a
    page cannot be fetched or parsed, the same page number is requested from
    kuaidaili.com instead.  Every proxy becomes one spreadsheet row
    (sequence number, proxy URL, last verification time, the ``proxies``
    mapping as text, validity flag) in ``代理IP.xlsx`` inside ``self.path``.
    """

    # File name of the spreadsheet, created inside ``self.path``.
    EXCEL_NAME = '代理IP.xlsx'
    # Seconds to wait for a listing page before treating the fetch as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.USER_AGENT_LIST = [
            'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
            'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
            'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
            'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
            'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
            'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
            'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
            'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/67.0.3396.62 Safari/537.36'
        ]
        # Rows collected by the most recent getProxy() call; each row is
        # [proxy URL, verification time, proxies-dict as text, validity flag].
        self.list_all_ip = []
        self.wb = Workbook()
        # Header row of the spreadsheet.
        self.data = ['序號','代理IP','最後驗證時間','驗證使用字段','是否有效']
        # Directory holding the spreadsheet: the PARENT of the current working
        # directory.  Set here as well as in switch_ip() so that getProxy()
        # and process_excel() also work when called directly.
        self.path = os.path.dirname(os.getcwd())

    def _fetch_html(self, url, host):
        """Download ``url`` with a random User-Agent and return the lxml tree."""
        headers = {
            "Host": host,
            "User-Agent": random.choice(self.USER_AGENT_LIST),
        }
        request = urllib.request.Request(url, headers=headers)
        # The timeout keeps a dead site from blocking forever, and the
        # context manager closes the connection once the body is read.
        with urllib.request.urlopen(request, timeout=self.REQUEST_TIMEOUT) as response:
            html_ = response.read().decode('utf-8')
        return etree.HTML(html_)

    def _scrape_page(self, page):
        """Return ``(ip_add, port, ip_type, verify_time)`` text lists for one page.

        xicidaili is tried first; on any ordinary error kuaidaili is used.
        An error raised by the kuaidaili fallback itself propagates.
        """
        try:
            print("嘗試從西刺代理獲取...")
            html = self._fetch_html("https://www.xicidaili.com/nn/" + str(page),
                                    "www.xicidaili.com")
            return (
                html.xpath('//*[@id="ip_list"]//tr/td[2]/text()'),   # IP address
                html.xpath('//*[@id="ip_list"]//tr/td[3]/text()'),   # port
                html.xpath('//*[@id="ip_list"]//tr/td[6]/text()'),   # type, e.g. HTTP
                html.xpath('//*[@id="ip_list"]//tr/td[10]/text()'),  # verification time
            )
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit are not mistaken for a scraping failure.
            print("西刺代理獲取失敗,使用快代理獲取IP...")
            html = self._fetch_html("https://www.kuaidaili.com/free/inha/" + str(page),
                                    "www.kuaidaili.com")
            return (
                html.xpath('//*[@id="list"]//tbody/tr/td[1]/text()'),  # IP address
                html.xpath('//*[@id="list"]//tbody/tr/td[2]/text()'),  # port
                html.xpath('//*[@id="list"]//tbody/tr/td[4]/text()'),  # type, e.g. HTTP
                html.xpath('//*[@id="list"]//tbody/tr/td[7]/text()'),  # verification time
            )

    @staticmethod
    def _build_rows(ip_add, port, ip_type, verify_time):
        """Combine the four parallel column lists into spreadsheet rows.

        ``zip`` stops at the shortest list, so a page whose columns yield
        different numbers of text nodes no longer raises IndexError.
        """
        rows = []
        for address, port_no, scheme, checked_at in zip(ip_add, port, ip_type, verify_time):
            scheme = scheme.strip().lower()
            # e.g. https://94.142.27.4:3128
            proxy_url = scheme + '://' + address.strip() + ':' + port_no.strip()
            # The mapping is stored as text, e.g. "{'https': 'https://94.142.27.4:3128'}",
            # and parsed back by isActiveProxy(); "是" is the initial validity flag.
            rows.append([proxy_url, checked_at.strip(), str({scheme: proxy_url}), "是"])
        return rows

    @staticmethod
    def _parse_proxy(field):
        """Parse the stored ``proxies`` text back into a dict.

        ``ast.literal_eval`` only accepts Python literals, so spreadsheet
        content can never execute code the way ``eval`` would.

        Raises:
            ValueError / SyntaxError: ``field`` is not a literal dict.
        """
        proxy = ast.literal_eval(field)
        if not isinstance(proxy, dict):
            raise ValueError("proxy field is not a dict: %r" % (field,))
        return proxy

    # Fetch proxies
    def getProxy(self):
        """Scrape pages 1-2, replace ``self.list_all_ip`` and save the sheet.

        Raises whatever the fallback request raises when both sites fail.
        """
        print("獲取代理IP數據中......")
        # Start from scratch: switch_ip() may call this several times and the
        # earlier rows must not be written again.
        self.list_all_ip = []
        for page in range(1, 3):
            time.sleep(2)  # pause between page requests
            ip_add, port, ip_type, verify_time = self._scrape_page(page)
            self.list_all_ip.extend(self._build_rows(ip_add, port, ip_type, verify_time))
        print(">>>>獲取完畢。")
        # Store the rows in the spreadsheet.
        self.process_excel(self.list_all_ip)

    # Verify the rows read from the spreadsheet
    def isActiveProxy(self,data,filePath):
        """Test every proxy in ``data`` and record the result in column 5.

        Args:
            data: sequence of row dicts, each with a '驗證使用字段' entry holding
                the text form of a ``proxies`` dict.  Presumably in sheet
                order as returned by ``ExcelUtil.dict_data()`` -- confirm.
            filePath: path of the spreadsheet that receives the results.

        Returns:
            Number of proxies that returned HTTP 200 for http://www.baidu.com/.
        """
        print("驗證代理IP數據中......")
        # Number of working proxies.
        count = 0
        for i, row in enumerate(data):
            proxy_ip = {}
            # Row 1 of the sheet is the header, so data[i] is sheet row i + 2.
            sheet_row = i + 2
            try:
                proxy_ip = self._parse_proxy(row['驗證使用字段'])
                response = requests.get(url='http://www.baidu.com/',
                                        headers={'User-Agent': random.choice(self.USER_AGENT_LIST)},
                                        proxies=proxy_ip, timeout=5)
                is_valid = response.status_code == 200
            except Exception:
                # Best effort: any parse or network failure marks the proxy invalid.
                is_valid = False
            if is_valid:
                ExcelUtil(filePath).write_excel(sheet_row, 5, '有效', filePath)
                count += 1
            else:
                # Covers non-200 answers too, which previously kept the initial "是".
                print("失敗IP:",proxy_ip)
                ExcelUtil(filePath).write_excel(sheet_row, 5, '否', filePath)
        print("\n驗證完畢。")
        return count

    # Spreadsheet output
    def process_excel(self,list_all_ip=None):
        """Write the header and ``list_all_ip`` to ``代理IP.xlsx`` in ``self.path``.

        Args:
            list_all_ip: rows to write; ``None`` means ``self.list_all_ip``.
        """
        print("將獲取的IP寫入表格中...")
        if list_all_ip is None:
            list_all_ip = self.list_all_ip
        # Use a fresh workbook so rows from an earlier, longer fetch cannot
        # survive below the new data.
        self.wb = Workbook()
        ws = self.wb.active
        ws.title = "代理IP列表"
        # Freeze the header row.
        ws.freeze_panes = 'A2'
        ws.column_dimensions['A'].width = 7
        ws.column_dimensions['B'].width = 31
        ws.column_dimensions['C'].width = 19
        ws.column_dimensions['D'].width = 42
        # Header row.
        for col, title in enumerate(self.data, start=1):
            ws.cell(row=1, column=col, value=title)
        # Data rows: column 1 is a 1-based sequence number, the rest is the record.
        for row_no, record in enumerate(list_all_ip, start=2):
            ws.cell(row=row_no, column=1, value=row_no - 1)
            for col, value in enumerate(record, start=2):
                ws.cell(row=row_no, column=col, value=value)
        # os.path.join instead of a hard-coded '\\' so the file lands inside
        # self.path on every platform.
        self.wb.save(os.path.join(self.path, self.EXCEL_NAME))

    # Use the local spreadsheet if present, otherwise build it
    def switch_ip(self):
        """Return verified spreadsheet rows, fetching proxies when necessary.

        At most three fetches are made per call.  Note that the table
        produced by the third fetch is not verified before giving up.

        Returns:
            The rows read from the spreadsheet (read BEFORE the verification
            pass updated the file) once at least 6 proxies respond, or
            ``False`` after three fetches.
        """
        self.path = os.path.dirname(os.getcwd())
        excel_path = os.path.join(self.path, self.EXCEL_NAME)
        # Guard against endless re-fetching.
        get_num = 0
        while get_num < 3:
            if os.path.isfile(excel_path):
                print("本地有表格。")
                data = ExcelUtil(excel_path).dict_data()
                count = self.isActiveProxy(data, excel_path)
                if count >= 6:
                    return data
                get_num += 1
                print("有效IP個數:%d小於6,重新獲取:第%d次"%(count,get_num))
                self.getProxy()
            else:
                print("無表格,需要獲取...")
                self.getProxy()
                get_num += 1
        print("ERROR:同一時間獲取IP次數超過三次,請稍後再試!")
        return False


if __name__ == "__main__":
    # Script entry point: reuse the local proxy spreadsheet or build a new one.
    proxy_getter = GetProxy()
    proxy_getter.switch_ip()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章