從零開始打造代理池

引入包

import pymysql
from random import choice
from warnings import filterwarnings
import traceback  
import requests
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import time
# nest_asyncio patches the running event loop so asyncio code also works
# inside IPython/Jupyter, where a loop is already running
import nest_asyncio
from flask import Flask, g, render_template
import pandas as pd
nest_asyncio.apply()
# Escalate pymysql warnings to exceptions so they are caught by the
# try/except blocks around database operations
filterwarnings("error",category=pymysql.Warning)

存儲模塊

整個代理池的核心部分,擔負起接收代理、存儲代理、發送代理的重任。數據庫表中共有兩個字段,一個是ip,一個是score;score從0到100,指示該ip的可用性。

# Score bounds for each proxy: MAX_SCORE means verified usable,
# MIN_SCORE means dead (the proxy gets deleted at this point)
MAX_SCORE = 100
MIN_SCORE = 0
# Score assigned to a freshly crawled, not-yet-tested proxy
INITIAL_SCORE = 10


class MysqlClient(object):
    """Storage backend of the proxy pool.

    Persists proxies in a MySQL table ``pools`` with two columns:
    ``ip`` (VARCHAR primary key) and ``score`` (INT, 0..100 usability).
    All write operations commit on success and roll back on failure.
    """

    def __init__(self,
                 mhost='localhost',
                 muser='root',
                 mpassword='1234',
                 mport=3306,
                 mdb="exercise"):
        """Connect to MySQL and make sure the `pools` table exists.

        The defaults target a local development database.
        """
        self.db = pymysql.connect(host=mhost,
                                  user=muser,
                                  password=mpassword,
                                  port=mport,
                                  db=mdb)
        # Single cursor reused for every statement issued by this client.
        self.cursor = self.db.cursor()
        self.cursor.execute('SELECT VERSION()')
        print('database version', self.cursor.fetchone())
        self.table = 'pools'
        self.create_pools()

    def create_pools(self):
        """Create the `pools` table if it does not exist yet (best effort)."""
        sql = ('CREATE TABLE IF NOT EXISTS pools '
               '(ip VARCHAR(255) NOT NULL, score INT NOT NULL, PRIMARY KEY (ip))')
        try:
            self.cursor.execute(sql)
        except Exception:
            # Failures here will surface on the first real query instead.
            pass

    def insert_ip(self, ip, score=INITIAL_SCORE):
        """Insert a new proxy with an initial score.

        Duplicate ips violate the primary key and are silently ignored
        (the transaction is rolled back).
        """
        data = {'ip': ip, 'score': score}
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'INSERT INTO {table}({keys}) VALUES ({values}) '.format(
            table=self.table, keys=keys, values=values)
        try:
            # Parameterized execute: values are escaped by the driver.
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception:
            self.db.rollback()

    def delete_ip(self, ip):
        """Remove a proxy from the pool."""
        # Parameterized WHERE clause instead of string concatenation,
        # which was vulnerable to SQL injection.
        sql = 'DELETE FROM {table} WHERE ip = %s'.format(table=self.table)
        try:
            self.cursor.execute(sql, (ip,))
            self.db.commit()
        except Exception:
            self.db.rollback()

    def get_score(self, ip):
        """Return the score of `ip`, or None if unknown or on error."""
        sql = 'SELECT score FROM {table} WHERE ip = %s'.format(
            table=self.table)
        try:
            self.cursor.execute(sql, (ip,))
            # Commit closes the read snapshot so later reads see fresh data.
            self.db.commit()
            row = self.cursor.fetchone()
            return row[0] if row else None
        except Exception:
            traceback.print_exc()
            self.db.rollback()

    def get_ip(self):
        """Return a random usable proxy, or None if the pool is empty.

        Prefers proxies with a full score; if none exist, picks randomly
        among the top-scored proxies (at least 10, or 20% of the pool,
        whichever is larger).
        """
        sql = 'SELECT ip FROM {table} WHERE score = %s'.format(
            table=self.table)
        best_results = []
        try:
            self.cursor.execute(sql, (MAX_SCORE,))
            self.db.commit()
            best_results = [row[0] for row in self.cursor.fetchall()]
        except Exception:
            traceback.print_exc()
            self.db.rollback()
        if not best_results:
            # Fall back to the highest-ranked ips. Builtin max() replaces
            # the original np.max(10, ...), which raised NameError (numpy
            # was never imported) and would misuse the axis argument anyway.
            count = self.get_num() or 0
            limit = max(10, int(0.2 * count))
            sql = ('SELECT ip FROM {table} ORDER BY score DESC '
                   'LIMIT {num}').format(table=self.table, num=limit)
            try:
                self.cursor.execute(sql)
                self.db.commit()
                best_results = [row[0] for row in self.cursor.fetchall()]
            except Exception:
                traceback.print_exc()
                self.db.rollback()
        if best_results:
            return choice(best_results)

    def get_ip_by_score(self, num):
        """Return the top `num` (ip, score) rows ordered by score desc."""
        sql = 'SELECT * from {table} order by score desc limit {num}'.format(
                table=self.table,
                num=str(int(num)))
        results = []
        try:
            self.cursor.execute(sql)
            self.db.commit()
            results = self.cursor.fetchall()
        except Exception:
            traceback.print_exc()
            self.db.rollback()
        return results

    def change_score(self, ip, action):
        """Adjust a proxy's score.

        action == 'decrease': subtract 1, deleting the ip once it would
        reach MIN_SCORE. Any other action resets the score to MAX_SCORE.
        Unknown ips are ignored.
        """
        old_score = self.get_score(ip)
        if old_score is None:
            return
        new_score = MAX_SCORE
        if action == "decrease":
            if old_score <= MIN_SCORE + 1:
                self.delete_ip(ip)
                return
            else:
                new_score = old_score - 1
        data = {'ip': ip, 'score': new_score}
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        update = ', '.join('{} = %s'.format(key) for key in data)
        # Upsert: insert the row, or update both columns if the ip exists.
        sql = ('INSERT INTO {table}({keys}) VALUES ({values}) '
               'ON DUPLICATE KEY UPDATE {update}').format(
                   table=self.table, keys=keys, values=values, update=update)
        try:
            # Values appear twice: once for VALUES, once for the UPDATE part.
            self.cursor.execute(sql, tuple(data.values()) * 2)
            self.db.commit()
        except Exception:
            traceback.print_exc()
            self.db.rollback()

    def show_all(self):
        """Print every (ip, score) row in a simple two-column listing."""
        sql = 'SELECT * FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
            data = self.cursor.fetchall()
            print('ip\t \t\tscore')
            for d in data:
                print(d[0] + '\t' + str(d[1]))
        except Exception:
            self.db.rollback()

    def get_num(self):
        """Return the number of stored proxies, or None on error."""
        # COUNT(*) avoids transferring every row just to count them.
        sql = 'SELECT COUNT(*) FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
            return self.cursor.fetchone()[0]
        except Exception:
            self.db.rollback()

    def get_all(self):
        """Return a list of all stored ips, or None on error."""
        sql = 'SELECT * FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
            data = self.cursor.fetchall()
            return [d[0] for d in data]
        except Exception:
            self.db.rollback()

    def delete_all(self):
        """Remove every proxy from the pool."""
        sql = 'truncate table {table};'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except Exception:
            self.db.rollback()

    def __del__(self):
        """Close the cursor and the connection on garbage collection."""
        # Guard against a partially constructed instance (e.g. when
        # pymysql.connect failed inside __init__).
        try:
            self.cursor.close()
            self.db.close()
        except Exception:
            pass

獲取模塊

通過爬蟲從各大網站抓取代理,並送到存儲模塊。

  • 定義爬蟲超類,用於爲後續的爬蟲類增加整合爬取各網站函數
# 注意要繼承type
class SpiderMetaClass(type):
    # 對相關聯的類進行修改,並返回一個新的類
    def __new__(cls, name, bases, attrs):
        # 爬取網站函數名的合集
        attrs['__ProxyFunc__'] = []
        for k, v in attrs.items():
            if 'Spider' in k:
                attrs['__ProxyFunc__'].append(k)
        return type.__new__(cls, name, bases, attrs)
  • 定義由超類動態改變的爬蟲類
class Spider(object, metaclass=SpiderMetaClass):
    """Crawls several free-proxy listing sites.

    Each Spider_* method is a generator yielding proxy strings for one
    site; the metaclass gathers their names into __ProxyFunc__ so
    get_proxy() can iterate every source.
    """

    def __init__(self, xila_page=10, xici_page=10):
        # Browser-like User-Agent so the sites do not reject the requests.
        self.header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        # Number of listing pages to crawl on each paginated site.
        self.xila_page = xila_page
        self.xici_page = xici_page

    def Spider_proxydb(self):
        """Yield proxies from proxydb.net (https, high anonymity, CN)."""
        url = 'http://proxydb.net/?protocol=https&anonlvl=4&country=CN'
        # timeout prevents a dead site from hanging the crawl forever.
        r = requests.get(url, headers=self.header, timeout=30)
        # Explicit parser keeps parsing deterministic across environments
        # and silences bs4's GuessedAtParserWarning.
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.select('div.table-responsive')[0]
        for proxy in table.select('a'):
            yield proxy.string

    def Spider_xila(self):
        """Yield proxies from the first self.xila_page pages of xiladaili."""
        urls = ['http://www.xiladaili.com/https/']
        if self.xila_page:
            urls += ['http://www.xiladaili.com/https/' + str(i + 1) + '/'
                     for i in range(1, self.xila_page)]
        for url in urls:
            r = requests.get(url, headers=self.header, timeout=30)
            soup = BeautifulSoup(r.text, 'html.parser')
            table = soup.select('div.mt-4')[0]
            # Skip the header row; first <td> holds the proxy address.
            for row in table.select('tr')[1:]:
                yield row.select('td')[0].string

    def Spider_xici(self):
        """Yield proxies from the first self.xici_page pages of xicidaili."""
        urls = ['https://www.xicidaili.com/nn/' + str(i + 1)
                for i in range(self.xici_page)]
        for url in urls:
            r = requests.get(url, headers=self.header, timeout=30)
            soup = BeautifulSoup(r.text, 'html.parser')
            table = soup.select('table#ip_list')[0]
            # Skip the header row; second <td> holds the ip.
            for row in table.select('tr')[1:]:
                yield row.select('td')[1].string

    def get_proxy(self, function_names):
        """Run each named Spider_* generator and collect all proxies.

        getattr() replaces the original eval() dispatch: same behavior,
        without executing arbitrary strings as code.
        """
        proxies = []
        for function_name in function_names:
            for proxy in getattr(self, function_name)():
                proxies.append(proxy)
        return proxies

將獲取代理與存儲代理相結合

class Getter():
    """Glue object: crawls every source and stores the results."""

    def __init__(self):
        # Storage backend and crawler are created eagerly.
        self.client = MysqlClient()
        self.spider = Spider()

    def run(self):
        """Crawl all registered Spider_* sources and insert each proxy."""
        crawled = self.spider.get_proxy(self.spider.__ProxyFunc__)
        for ip in crawled:
            self.client.insert_ip(ip)
getter = Getter()
database version ('8.0.15',)
getter.run()

檢測模塊

檢測模塊對數據庫中存儲的ip代理進行檢測,爲了提高速度,使用異步方式進行檢測。

# HTTP status codes accepted as proof that a proxy works
VALID_STATUS_CODES = [200]
# Probe URL; must be plain http — https probing is not supported here
TEST_URL = 'http://www.baidu.com'
# Number of proxies tested concurrently per asyncio batch
BATCH_SIZE = 100
class Tester(object):
    """Asynchronously probes every stored proxy against TEST_URL and
    updates its score in the database accordingly."""

    def __init__(self):
        self.client = MysqlClient()

    async def test_one_proxy(self, proxy):
        """Probe a single proxy.

        On a VALID_STATUS_CODES response the score is reset to max;
        any other status or any network error decreases it.
        """
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在測試', proxy)
                async with session.get(TEST_URL, proxy=real_proxy,
                                       timeout=100) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.client.change_score(proxy, 'max')
                        print('代理可用', proxy)
                    else:
                        self.client.change_score(proxy, 'decrease')
                        print('狀態不對', proxy)
            except Exception:
                # Timeouts, refused connections etc. all count as failure.
                print('測試失敗', proxy)
                traceback.print_exc()
                self.client.change_score(proxy, 'decrease')

    def test_all_proxy(self):
        """Probe the whole pool in batches of BATCH_SIZE."""
        try:
            # get_all() returns None on a database error; treat as empty.
            proxies = self.client.get_all() or []
            loop = asyncio.get_event_loop()
            for i in range(0, len(proxies), BATCH_SIZE):
                batch = proxies[i:i + BATCH_SIZE]
                # gather() accepts bare coroutines; asyncio.wait() with
                # coroutines is deprecated since 3.8 and removed in 3.11.
                loop.run_until_complete(asyncio.gather(
                    *(self.test_one_proxy(p) for p in batch)))
                # Brief pause between batches to avoid hammering the target.
                time.sleep(3)
        except Exception:
            print('測試器錯誤')
            traceback.print_exc()
tester = Tester()
database version ('8.0.15',)
tester.test_all_proxy()
正在測試 106.14.206.26
正在測試 1.197.204.251:9999
正在測試 106.14.76.134
正在測試 113.117.121.141
正在測試 113.117.27.223
狀態不對 106.14.206.26
代理可用 111.231.239.143
代理可用 101.231.104.82
測試失敗 111.231.202.91
...

顯示模塊

顯示模塊用網頁來輸出當前代理池的信息

class Weber(object):
    """Minimal Flask front-end exposing the proxy pool over HTTP.

    NOTE: instantiating this class blocks, because __init__ ends with
    app.run().
    """

    def __init__(self):
        self.client = MysqlClient()
        app = Flask(__name__)

        # Landing page.
        @app.route('/')
        def index():
            return '<h2>Welcome to Proxy Pool System</h2>'

        # One random usable proxy.
        @app.route('/random')
        def get_proxy():
            # get_ip() returns None when the pool is empty; avoid the
            # TypeError that str + None would raise.
            ip = self.client.get_ip()
            if ip is None:
                return '當前無可用代理'
            return '可用代理:' + ip

        # The <number> highest-scored proxies rendered as an HTML table.
        # The int converter turns non-numeric input into a 404 instead of
        # a 500 from int().
        @app.route('/count/<int:number>')
        def get_proxies(number):
            results = self.client.get_ip_by_score(number)
            df = pd.DataFrame({'ip': [x[0] for x in results],
                               'score': [x[1] for x in results]})
            html = df.to_html(classes='data', index=False,
                              bold_rows=False, header=True)
            # Prepend a title and inline CSS for basic styling.
            title = '<h3>可用代理表</h3>'
            css = "<style>body{text-align:center;}table {margin:auto;border-collapse: collapse;font-family: Futura, Arial, sans-serif;}caption {font-size: 200;margin: 1em auto;}th,td {padding: .65em;text-align:center;}td {/* border: 1px solid #777; */}tbody tr:nth-child(odd) {background: #ccc;}th:first-child {border-radius: 9px 0 0 0;}th:last-child {border-radius: 0 9px 0 0;}tr:last-child td:first-child {border-radius: 0 0 0 9px;}tr:last-child td:last-child {border-radius: 0 0 9px 0;}</style>"
            return css + title + html

        app.run()
weber = Weber()
database version ('8.0.15',)
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [09/Jun/2020 00:02:38] "[37mGET /count/10 HTTP/1.1[0m" 200 -

效果

在這裏插入圖片描述
在這裏插入圖片描述
在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章