Python爬蟲——建立IP代理池

原創

2020-04-07 19:39

在使用Python爬蟲時，經常遇見具有反爬機制的網站。我們可以通過僞裝headers來爬取，但是網站還是可以獲取你的ip，從而禁掉你的ip來阻止爬取信息。
在requests的請求方法中，我們可以通過proxies參數來僞裝我們的ip。網上有一些免費的ip代理網站，可以爬取這些ip，經檢測後建立ip代理池。

ip代理網站：
（https://www.xicidaili.com/nt/）
（https://www.kuaidaili.com/free/intr/）

推薦一種常用的僞裝請求頭（User-Agent）的方法

from fake_useragent import UserAgent
# Build the request headers from fake_useragent. ua.random is read once here,
# so the same User-Agent value is reused wherever `headers` is passed.
ua = UserAgent()
headers = {'User-Agent':ua.random}

接下來進入正題

爬取ip（IPPool.py）

import requests
from lxml import etree
from fake_useragent import UserAgent
# Disguise the client with a User-Agent supplied by fake_useragent.
# ua.random is read once at import time, so every request sent with
# `headers` carries the same value.
ua = UserAgent()
headers = {'User-Agent':ua.random}
def get_ip(timeout=10):
    """Scrape the first page of the xicidaili proxy list.

    Args:
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        A list of proxy URLs of the form ``scheme://host:port``, where the
        scheme is the lower-cased protocol column of each table row.

    Raises:
        requests.RequestException: If the listing page cannot be fetched
            (including when ``timeout`` expires).
    """
    # Proxies expire quickly, so only the first page is fetched.
    url = 'https://www.xicidaili.com/nt/'
    # Without a timeout a stalled connection would block the caller forever.
    page = requests.get(url=url, headers=headers, timeout=timeout)
    # Let requests detect the charset from the body before decoding it.
    page.encoding = page.apparent_encoding
    tree = etree.HTML(page.text)

    ip_list = []
    # NOTE(review): only rows carrying class="odd" are matched; if the table
    # alternates row classes, the other rows are skipped -- confirm against
    # the page markup.
    for row in tree.xpath('//tr[@class="odd"]'):
        host = row.xpath('./td[2]/text()')
        port = row.xpath('./td[3]/text()')
        protocol = row.xpath('./td[6]/text()')
        # A missing cell yields an empty list; skip that row instead of
        # raising IndexError and losing every proxy on the page.
        if not (host and port and protocol):
            continue
        # Assemble the full proxy URL, dropping whitespace around cell text.
        ip_list.append(
            protocol[0].strip().lower() + '://' + host[0].strip() + ':' + port[0].strip()
        )
    return ip_list
if __name__ == '__main__':
    # Manual check when run as a script: scrape once and print the proxy URLs.
    ip_list = get_ip()
    print(ip_list)

測試ip

測試方法一（from multiprocessing.dummy import Pool）

import requests
from multiprocessing.dummy import Pool
# Fetch the scraped proxy list.
# NOTE: get_ip() is called at import time, so merely importing this module
# triggers the scrape.
from IPPool import get_ip
test_list = get_ip()
# Module-level list that collects the proxies found to be usable
ip_list = []
# Site requested through each proxy to test it
url = 'http://icanhazip.com'
# Fixed browser User-Agent sent with every test request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}
def ip_test(ip):
    """Test one proxy and append it to the global ``ip_list`` if it works.

    Args:
        ip: Proxy URL in the form ``scheme://host:port``.

    Returns:
        None. One result line is printed per proxy; usable proxies are
        appended to the module-level ``ip_list``.
    """
    # requests chooses a proxy by the scheme of the *target* URL. Keying the
    # dict only by the proxy's own protocol means a mismatching entry is never
    # used: the request goes out directly and the proxy always looks usable.
    # Registering it for both schemes makes sure the proxy is really exercised.
    # NOTE(review): an 'https://' proxy URL makes the client speak TLS to the
    # proxy itself -- confirm that is what the listing's protocol column means.
    proxies = {'http': ip, 'https': ip}
    try:
        response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
    except requests.RequestException:
        # Connection errors, timeouts and invalid proxy URLs all mean "unusable".
        print(ip + "不可用")
        return
    # A reply alone is not enough; require HTTP 200, matching test_ip().
    if response.status_code == 200:
        ip_list.append(ip)
        print(ip + "可用")
    else:
        print(ip + "不可用")
if __name__ == '__main__':
    # Test the proxies on four worker threads. The with-block shuts the pool
    # down once map() has returned instead of leaving its threads behind.
    with Pool(4) as pool:
        pool.map(ip_test, test_list)
    print(ip_list)
    # Summary: total scraped, usable, unusable.
    print("總共爬取%s個ip，可用ip爲：%s，不可用ip爲：%s"%(len(test_list),len(ip_list),len(test_list)-len(ip_list)))

測試結果：

測試方法二（Threading多線程隊列）

import threading
import requests
import queue
from fake_useragent import UserAgent

# Fetch the scraped proxy list.
# NOTE: get_ip() is called at import time, so merely importing this module
# triggers the scrape.
from IPPool import get_ip
test_list = get_ip()
# Module-level list that collects the proxies found to be usable
ip_pool = []
# User-Agent from fake_useragent; ua.random is read once here and reused
ua = UserAgent()
headers = {'User-Agent':ua.random}

# Site requested through each proxy to test it; the alternative below is kept
# for switching the test target.
url = 'https://www.csdn.net/'
# url = 'http://icanhazip.com/'

def test_ip(queue_list):
    while True:
        if queue_list.empty():
            break
        else:
            ip = queue_list.get()
            if ip.split(":")[0] == 'http':
                proxies = {
                    'http' : ip
                }
            else:
                proxies = {
                    'https': ip
                }
            try:
                response = requests.get(url=url, headers=headers, proxies=proxies,timeout=3)
                if response.status_code == 200:
                    print("【%s】測試%s,測試結果【可用】" % (threading.current_thread().name, proxies))
                    ip_pool.append(ip)
            except:
                print("【%s】測試%s,測試結果【不可用】" % (threading.current_thread().name, proxies))

if __name__ == '__main__':
    # Load every scraped proxy into one queue shared by the workers.
    queue_list = queue.Queue()
    for candidate in test_list:
        queue_list.put(candidate)

    # Build five worker threads that all drain the same queue.
    out_thread = []
    for item in range(5):
        worker = threading.Thread(target=test_ip, args=(queue_list,), name="進程%s" % item)
        out_thread.append(worker)

    # Start them all, then wait for every one to finish.
    for worker in out_thread:
        worker.start()
    for worker in out_thread:
        worker.join()

    print('測試完成')
    print(ip_pool)
    # Summary: total scraped, usable, unusable.
    total = len(test_list)
    usable = len(ip_pool)
    print("總共爬取%s個ip，可用ip爲：%s，不可用ip爲：%s"%(total,usable,total-usable))

結果：

測試網址不需要那麼複雜，www.baidu.com一類的都可以，有一位博主推薦了一個測試網站：http://icanhazip.com/

在測試時遇到了一個坑：沒有太注意協議是http還是https，統一用了http，然後發現每一個ip都「可用」，當然這是不可能的。原因是requests會根據目標網址的協議（http或https）到proxies中查找對應的鍵，若找不到匹配的鍵，就不會使用代理而是直接連線，請求自然會成功。經過修改後，測試成功的ip大概在二十五個左右。

https://www.kuaidaili.com/free/intr/ 這個網址的ip爬取也寫了（目前只取出ip，還沒有拼接協議和端口），但是這個網址一頁的ip有點少，所以就沒有測試

IPPool2.py

import requests
from lxml import etree
from fake_useragent import UserAgent
# Disguise the client with a User-Agent supplied by fake_useragent.
# ua.random is read once at import time, so every request sent with
# `headers` carries the same value.
ua = UserAgent()
headers = {'User-Agent':ua.random}

def get_ip(timeout=10):
    """Scrape the first page of the kuaidaili free proxy list.

    Only the first column of each row (the IP address) is collected; the
    port and protocol are not yet joined into a full proxy URL.

    Args:
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        A list of IP address strings, one per table row that has one.

    Raises:
        requests.RequestException: If the listing page cannot be fetched
            (including when ``timeout`` expires).
    """
    url = 'https://www.kuaidaili.com/free/intr/'
    # Without a timeout a stalled connection would block the caller forever.
    page = requests.get(url=url, headers=headers, timeout=timeout)
    # Let requests detect the charset from the body before decoding it.
    page.encoding = page.apparent_encoding
    tree = etree.HTML(page.text)

    # NOTE(review): this path requires a literal <tbody> element; confirm the
    # served markup contains one.
    rows = tree.xpath('//*[@id="list"]/table/tbody/tr')
    ip_list = []
    for row in rows:
        cells = row.xpath('./td[1]/text()')
        # An empty first cell yields an empty list; skip that row instead of
        # raising IndexError and losing every address on the page.
        if cells:
            ip_list.append(cells[0].strip())
    return ip_list
if __name__ == '__main__':
    # When run as a script, scrape once; the result is kept in ip_list and the
    # print below is left disabled.
    ip_list = get_ip()
    # print(ip_list)

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Python爬蟲——建立IP代理池

爬取ip（IPPool.py）

測試ip

測試方法一（from multiprocessing.dummy import Pool）

測試方法二（Threading多線程隊列）

IPPool2.py

這個網絡爬蟲代碼，拿到數據之後如何存到csv文件中去？

.NET開源強大、易於使用的緩存框架 - FusionCache

面試，有時候是個運氣活

Python爬蟲——建立IP代理池

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結