How to Build Your Own IP Proxy Pool

While learning web scraping, you have almost certainly run into the problem of getting your IP banned. That is when we turn to a proxy to finish the crawl, but after crawling for a while the proxy gets banned too; fine, switch to another one, and a little later that one is banned as well. There are two ways to deal with this:
Method 1: throttle the crawler so its request rate is close to that of a human visitor
Method 2: use a proxy pool
Here I will only talk about the proxy pool. We can scrape the free proxies from xici (西刺), store them in a database, and then simply pull an IP from the database each time we crawl.
Below is the code that scrapes the proxies from xici and stores them.

import requests
import pymongo
import threading
from requests.exceptions import HTTPError
from datetime import datetime
from lxml.html import fromstring


class DownLoad(object):
    def __init__(self, proxy=None, headers=None):
        self.proxy = proxy
        self.headers = headers
        self.client = pymongo.MongoClient(
            'mongodb_url'
        )
        self.db = self.client['scrapy_items']

    def __call__(self, url):
        tree = self.downloader(url)
        if tree is None:
            print('HTTP ERROR!')
        else:
            ip_info = self.get_ips(tree)
            for ip in ip_info:
                if ip is None:
                    print('invalid ip and port')
                else:
                    try:
                        self.db['IP'].insert_one(ip)
                    except Exception as e:
                        print(e)

    def close(self):
        self.client.close()

    def downloader(self, url):
        try:
            html = requests.get(url, headers=self.headers)
        except HTTPError as err:
            print(err)
        except Exception as e:
            print(e)
        else:
            try:
                tree = fromstring(html.text)
                return tree
            except Exception as e:
                print(e)
        return None

    def get_ips(self, tree):
        table = tree.xpath('//table[@id="ip_list"]//tr[@class]')
        for tr in table:
            ip_info = {}
            try:
                ip_info['ip'] = tr.xpath('.//td[2]/text()')[0]
                ip_info['port'] = tr.xpath('.//td[3]/text()')[0]
                ip_info['status'] = tr.xpath('.//td[5]/text()')[0]
                ip_info['type'] = tr.xpath('.//td[6]/text()')[0]
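                # the speed/connect-time tooltips on the page read like '0.123秒' ('秒' means seconds)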
                ip_info['speed'] = float(tr.xpath('.//td[7]/div/@title')[0].split('秒')[0])
                ip_info['connect_time'] = float(tr.xpath('.//td[8]/div/@title')[0].split('秒')[0])
            except Exception as e:
                print(e)
                yield None
                continue  # this row could not be parsed, skip verification
            if self.verification_ip(ip_info['ip'], ip_info['port'], ip_info['type']):
                ip_info['verification_time'] = datetime.now()
                yield ip_info
            else:
                print(ip_info['ip'], end='')
                yield None

    def verification_ip(self, ip, port, type):
        if type == 'HTTP':
            proxy_dict = {
                'http': 'http://%s:%s' % (ip, port),
            }
        else:
            proxy_dict = {
                'https': 'https://%s:%s' % (ip, port),
            }
        try:
            html = requests.get('https://hao.360.com/', headers=self.headers, proxies=proxy_dict,
                                timeout=5)
        except HTTPError as err:
            print(err)
            return False
        except Exception as e:
            print(e)
            return False
        else:
            if 200 <= html.status_code < 300:
                return True
            else:
                return False


def runspider(downloader, base_url, start_url, end_url):
    """Drive the downloader over a range of list pages."""
    for i in range(start_url, end_url):
        url = base_url + str(i)
        downloader(url)

The code above covers scraping the IPs, checking whether each one actually works, and storing it in the database (the code structure may be a bit rough, I am still a beginner). The scraped IPs are verified before being stored so that dead proxies never make it into the database; these are free IPs after all, so their stability is not great. If you feel this is still too few IPs, you can scrape other free proxy sites as well.
How exactly you run it is up to you, so I will not paste the remaining glue code; the sketch below shows one minimal way it could be driven (just do not crawl too fast or your IP will get banned; I crawled too fast myself and was banned before I had even finished).
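A minimal driver sketch, assuming the xici list pages follow a pattern like https://www.xicidaili.com/nn/<page>; the URL and page range here are assumptions, adjust them to whatever you actually want to crawl.

# Minimal driver sketch -- the list URL pattern and page range are assumptions
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}
downloader = DownLoad(headers=headers)
runspider(downloader, 'https://www.xicidaili.com/nn/', 1, 4)  # pages 1 to 3
downloader.close()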
Next, we randomly pick an IP from the database.

class GetIP(object):

    def __init__(self):
        self.client = pymongo.MongoClient(
            'mongodb_url'
        )
        self.db = self.client['scrapy_items']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        }

    def judge_ip(self, ip):
        if ip['type'] == 'HTTP':
            proxy_dict = {
                'http': 'http://%s:%s' % (ip['ip'], ip['port']),
            }
        else:
            proxy_dict = {
                'https': 'https://%s:%s' % (ip['ip'], ip['port']),
            }
        try:
            html = requests.get('https://hao.360.com/', headers=self.headers, proxies=proxy_dict,
                                timeout=5)
        except HTTPError as err:
            print(ip['ip'], err)
            return False
        except Exception as e:
            print(ip['ip'], e)
            return False
        else:
            if 200 <= html.status_code < 300:
                return True
            else:
                return False

    def get_random_ip(self):
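        # MongoDB's $sample stage returns one random document from the IP collection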
        ip_info = self.db['IP'].aggregate([
            {'$sample': {'size': 1}},
        ])
        for ip in ip_info:
            if self.judge_ip(ip):
                return '%s://%s:%s' % (ip['type'], ip['ip'], ip['port'])
            else:
                self.delete_ip(ip)
                return self.get_random_ip()

    def delete_ip(self, ip):
        self.db['IP'].delete_many({'ip': ip['ip']})

    def close(self):
        self.client.close()

With that, we have built our own IP proxy pool. We can test it with the snippet below:

get_ip = GetIP()
for i in range(5):
    ip_port = get_ip.get_random_ip()
    print(ip_port)
get_ip.close()

And that is it, the IP proxy pool is done. When scraping you may also want a user-agent pool; there is a very convenient third-party library for that, fake_useragent, which you can find with a quick search.
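A minimal sketch of how fake_useragent is typically used (install it via pip first):

from fake_useragent import UserAgent

ua = UserAgent()
# each access to ua.random returns a different real-world User-Agent string
headers = {'User-Agent': ua.random}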
When scraping, it is best to combine proxies with rate limiting; after all, nobody wants their server to be hammered at someone else's whim.
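As a rough sketch of that combination, where urls is just a placeholder for whatever pages you are crawling:

import time
import random
import requests

get_ip = GetIP()
for url in urls:  # urls is a hypothetical list of pages to crawl
    proxy = get_ip.get_random_ip()          # e.g. 'HTTP://1.2.3.4:8080'
    scheme = proxy.split('://')[0].lower()  # 'http' or 'https'
    resp = requests.get(url, proxies={scheme: proxy}, timeout=10)
    time.sleep(random.uniform(1, 3))        # throttle: pause 1-3 seconds between requests
get_ip.close()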
