引入包
import pymysql
from random import choice
from warnings import filterwarnings
import traceback
import requests
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import time
# 針對ipython不支持的情況
import nest_asyncio
from flask import Flask, g, render_template
import pandas as pd
nest_asyncio.apply()
# 過濾數據庫報錯
filterwarnings("error",category=pymysql.Warning)
存儲模塊
整個代理池的核心部分,擔負起接收代理、存儲代理、發送代理的重任。數據庫表中共有兩個字段,一個是ip,一個是score,score從0到100指示該ip的可用性。
# 每個代理的分數設定
MAX_SCORE = 100
MIN_SCORE = 0
INITIAL_SCORE = 10
class MysqlClient(object):
    """Storage backend of the proxy pool.

    Keeps a single MySQL table (``pools``) with two columns:
    ``ip`` (primary key) and ``score`` (0-100 usability indicator).
    """

    def __init__(self,
                 mhost='localhost',
                 muser='root',
                 mpassword='1234',
                 mport=3306,
                 mdb="exercise"):
        # Connect to the MySQL server that hosts the pool table.
        self.db = pymysql.connect(host=mhost,
                                  user=muser,
                                  password=mpassword,
                                  port=mport,
                                  db=mdb)
        # Cursor used for every statement issued by this client.
        self.cursor = self.db.cursor()
        self.cursor.execute('SELECT VERSION()')
        print('database version', self.cursor.fetchone())
        self.table = 'pools'
        self.create_pools()

    def create_pools(self):
        """Create the pool table if it does not exist yet."""
        sql = ('CREATE TABLE IF NOT EXISTS pools '
               '(ip VARCHAR(255) NOT NULL, score INT NOT NULL, PRIMARY KEY (ip))')
        try:
            self.cursor.execute(sql)
        except pymysql.MySQLError:
            # Best effort: an already-existing table is fine.
            pass

    def insert_ip(self, ip, score=INITIAL_SCORE):
        """Insert a new proxy with an initial score; silently keep the
        existing row when the ip is already stored (duplicate key)."""
        sql = 'INSERT INTO {table}(ip, score) VALUES (%s, %s)'.format(
            table=self.table)
        try:
            # Parameterized query: never interpolate values into SQL text.
            self.cursor.execute(sql, (ip, score))
            self.db.commit()
        except pymysql.MySQLError:
            self.db.rollback()

    def delete_ip(self, ip):
        """Remove a proxy from the pool."""
        # BUG FIX: the original concatenated the ip into the SQL string,
        # which breaks on quotes and is injection-prone; use placeholders.
        sql = 'DELETE FROM {table} WHERE ip = %s'.format(table=self.table)
        try:
            self.cursor.execute(sql, (ip,))
            self.db.commit()
        except pymysql.MySQLError:
            self.db.rollback()

    def get_score(self, ip):
        """Return the score of *ip*, or None when the ip is unknown."""
        sql = 'SELECT score FROM {table} WHERE ip = %s'.format(
            table=self.table)
        try:
            self.cursor.execute(sql, (ip,))
            # fetchone avoids the IndexError the old fetchall()[0][0]
            # raised for an unknown ip.
            row = self.cursor.fetchone()
            return row[0] if row else None
        except pymysql.MySQLError:
            traceback.print_exc()
            self.db.rollback()

    def get_ip(self):
        """Return one random usable proxy, or None if the pool is empty.

        Prefer proxies holding a full score; otherwise fall back to a
        random pick among the top 20% (at least 10) highest-scored rows.
        """
        sql = 'SELECT ip FROM {table} WHERE score = %s'.format(
            table=self.table)
        best_results = []
        try:
            self.cursor.execute(sql, (MAX_SCORE,))
            best_results = [row[0] for row in self.cursor.fetchall()]
        except pymysql.MySQLError:
            traceback.print_exc()
            self.db.rollback()
        if not best_results:
            # BUG FIX: the original called np.max(10, n) -- numpy was never
            # imported, and np.max treats the 2nd argument as an axis.
            # The builtin max() is what was intended.
            limit = max(10, int(0.2 * self.get_num()))
            sql = ('SELECT ip FROM {table} ORDER BY score DESC '
                   'LIMIT {num}').format(table=self.table, num=limit)
            try:
                self.cursor.execute(sql)
                best_results = [row[0] for row in self.cursor.fetchall()]
            except pymysql.MySQLError:
                traceback.print_exc()
                self.db.rollback()
        if best_results:
            return choice(best_results)

    def get_ip_by_score(self, num):
        """Return the *num* highest-scored (ip, score) rows."""
        sql = ('SELECT * FROM {table} ORDER BY score DESC '
               'LIMIT {num}').format(table=self.table, num=int(num))
        results = []
        try:
            self.cursor.execute(sql)
            results = self.cursor.fetchall()
        except pymysql.MySQLError:
            traceback.print_exc()
            self.db.rollback()
        return results

    def change_score(self, ip, action):
        """Adjust a proxy's score.

        action == 'decrease' lowers the score by one and deletes the
        proxy once it reaches the minimum; any other action resets the
        score to MAX_SCORE.
        """
        old_score = self.get_score(ip)
        if old_score is None:
            return
        new_score = MAX_SCORE
        if action == "decrease":
            if old_score <= MIN_SCORE + 1:
                self.delete_ip(ip)
                return
            new_score = old_score - 1
        # Upsert: insert the row, or update it when the ip already exists.
        sql = ('INSERT INTO {table}(ip, score) VALUES (%s, %s) '
               'ON DUPLICATE KEY UPDATE ip = %s, score = %s').format(
                   table=self.table)
        try:
            self.cursor.execute(sql, (ip, new_score, ip, new_score))
            self.db.commit()
        except pymysql.MySQLError:
            traceback.print_exc()
            self.db.rollback()

    def show_all(self):
        """Print every (ip, score) row in the pool."""
        sql = 'SELECT * FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            data = self.cursor.fetchall()
            print('ip\t \t\tscore')
            for d in data:
                print(d[0] + '\t' + str(d[1]))
        except pymysql.MySQLError:
            self.db.rollback()

    def get_num(self):
        """Return the number of proxies in the pool (0 on error)."""
        # COUNT(*) lets the server count instead of shipping every row.
        sql = 'SELECT COUNT(*) FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            return self.cursor.fetchone()[0]
        except pymysql.MySQLError:
            self.db.rollback()
            return 0

    def get_all(self):
        """Return a list of every ip in the pool ([] on error)."""
        sql = 'SELECT ip FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            return [row[0] for row in self.cursor.fetchall()]
        except pymysql.MySQLError:
            self.db.rollback()
            return []

    def delete_all(self):
        """Drop every row from the pool table."""
        sql = 'truncate table {table};'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except pymysql.MySQLError:
            self.db.rollback()

    # Destructor
    def __del__(self):
        """Close the cursor and the connection when the client dies."""
        try:
            self.cursor.close()
            self.db.close()
        except Exception:
            # Interpreter shutdown may already have torn these down.
            pass
獲取模塊
通過爬蟲從各大網站抓取代理,並送到存儲模塊。
- 定義爬蟲超類,用於爲後續的爬蟲類增加整合爬取各網站函數
# 注意要繼承type
# NOTE: must inherit from type to act as a metaclass
class SpiderMetaClass(type):
    """Metaclass that auto-registers crawler methods.

    Every attribute whose name contains 'Spider' is recorded in the
    class-level list ``__ProxyFunc__`` so callers can enumerate all
    site-specific crawlers without hard-coding their names.
    """

    def __new__(cls, name, bases, attrs):
        # Collect crawler method names before adding our own key.
        attrs['__ProxyFunc__'] = [key for key in attrs if 'Spider' in key]
        return type.__new__(cls, name, bases, attrs)
- 定義由超類動態改變的爬蟲類
class Spider(object, metaclass=SpiderMetaClass):
    """Crawlers that scrape free-proxy listing sites.

    Every method whose name contains 'Spider' is auto-registered into
    ``__ProxyFunc__`` by SpiderMetaClass; each yields proxy strings.
    """

    def __init__(self, xila_page=10, xici_page=10):
        # Browser-like UA so the listing sites do not reject us outright.
        self.header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        # Number of listing pages to crawl per site.
        self.xila_page = xila_page
        self.xici_page = xici_page

    def Spider_proxydb(self):
        """Yield proxies from proxydb.net (https, high anonymity, CN)."""
        url = 'http://proxydb.net/?protocol=https&anonlvl=4&country=CN'
        r = requests.get(url, headers=self.header)
        # Explicit parser keeps parsing deterministic across environments
        # (BeautifulSoup otherwise picks whichever parser is installed).
        soup = BeautifulSoup(r.text, 'html.parser')
        Table = soup.select('div.table-responsive')[0]
        for proxy in Table.select('a'):
            yield proxy.string

    def Spider_xila(self):
        """Yield proxies from the first ``xila_page`` pages of xiladaili.com."""
        urls = ['http://www.xiladaili.com/https/']
        if self.xila_page:
            urls += ['http://www.xiladaili.com/https/' + str(i + 1) + '/'
                     for i in range(1, self.xila_page)]
        for url in urls:
            r = requests.get(url, headers=self.header)
            soup = BeautifulSoup(r.text, 'html.parser')
            Table = soup.select('div.mt-4')[0]
            # Skip the header row; first cell of each row is the proxy.
            for proxy in Table.select('tr')[1:]:
                yield proxy.select('td')[0].string

    def Spider_xici(self):
        """Yield proxies from the first ``xici_page`` pages of xicidaili.com."""
        urls = ['https://www.xicidaili.com/nn/' + str(i + 1)
                for i in range(self.xici_page)]
        for url in urls:
            r = requests.get(url, headers=self.header)
            soup = BeautifulSoup(r.text, 'html.parser')
            Table = soup.select('table#ip_list')[0]
            # Skip the header row; second cell of each row is the ip.
            for proxy in Table.select('tr')[1:]:
                yield proxy.select('td')[1].string

    def get_proxy(self, function_names):
        """Run every named crawler and return all yielded proxies.

        BUG FIX: replaced eval("self.{}()".format(...)) with getattr --
        eval on a formatted string is an injection hazard and slower.
        """
        proxies = []
        for function_name in function_names:
            proxies.extend(getattr(self, function_name)())
        return proxies
將獲取代理與存儲代理相結合
class Getter():
    """Glue between crawling and storage: fetch proxies, persist them."""

    def __init__(self):
        # Storage client and crawler collection used by run().
        self.client = MysqlClient()
        self.spider = Spider()

    def run(self):
        """Crawl every registered site and store each proxy found."""
        crawled = self.spider.get_proxy(self.spider.__ProxyFunc__)
        for candidate in crawled:
            self.client.insert_ip(candidate)
# Build the getter and fill the pool (the middle line below is captured
# notebook output, not code).
getter = Getter()
database version ('8.0.15',)
getter.run()
檢測模塊
檢測模塊對數據庫中存儲的ip代理進行檢測,爲了提高速度,使用異步方式進行檢測。
# 可通過校驗的狀態碼集合
VALID_STATUS_CODES = [200]
# 測試用URL,注意不要用https,目前不支持
TEST_URL = 'http://www.baidu.com'
# 每次異步測試的規模
BATCH_SIZE = 100
class Tester(object):
    """Asynchronously validate every proxy stored in the pool."""

    def __init__(self):
        self.client = MysqlClient()

    # Single-proxy check; async so a whole batch can run concurrently.
    async def test_one_proxy(self, proxy):
        """Fetch TEST_URL through *proxy* and adjust its score.

        A valid response resets the score to max; anything else
        (bad status, timeout, connection error) decreases it.
        """
        # Skip certificate checks: free proxies rarely have valid certs.
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('正在測試', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=100) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.client.change_score(proxy, 'max')
                        print('代理可用', proxy)
                    else:
                        self.client.change_score(proxy, 'decrease')
                        print('狀態不對', proxy)
            except Exception:
                # Timeouts / connection errors mean the proxy is flaky.
                print('測試失敗', proxy)
                traceback.print_exc()
                self.client.change_score(proxy, 'decrease')

    def test_all_proxy(self):
        """Test every stored proxy in batches of BATCH_SIZE."""
        try:
            proxies = self.client.get_all()
            loop = asyncio.get_event_loop()
            for i in range(0, len(proxies), BATCH_SIZE):
                test_proxies = proxies[i:i + BATCH_SIZE]
                tasks = [self.test_one_proxy(proxy) for proxy in test_proxies]
                # BUG FIX: asyncio.wait() no longer accepts bare coroutines
                # (deprecated 3.8, removed 3.11); gather schedules them itself.
                loop.run_until_complete(asyncio.gather(*tasks))
                # Brief pause between batches to avoid hammering TEST_URL.
                time.sleep(3)
        except Exception:
            print('測試器錯誤')
            traceback.print_exc()
# Run the tester (the middle line below is captured notebook output,
# not code).
tester = Tester()
database version ('8.0.15',)
tester.test_all_proxy()
正在測試 106.14.206.26
正在測試 1.197.204.251:9999
正在測試 106.14.76.134
正在測試 113.117.121.141
正在測試 113.117.27.223
狀態不對 106.14.206.26
代理可用 111.231.239.143
代理可用 101.231.104.82
測試失敗 111.231.202.91
...
顯示模塊
顯示模塊用網頁來輸出當前代理池的信息
class Weber(object):
    """Minimal Flask front-end exposing the proxy pool over HTTP.

    Routes:
      /                one-line welcome page
      /random          one random usable proxy
      /count/<number>  HTML table of the <number> best proxies
    """

    def __init__(self):
        self.client = MysqlClient()
        app = Flask(__name__)

        # Landing page
        @app.route('/')
        def index():
            return '<h2>Welcome to Proxy Pool System</h2>'

        # One random usable proxy
        @app.route('/random')
        def get_proxy():
            proxy = self.client.get_ip()
            # BUG FIX: get_ip() returns None when the pool is empty;
            # the old code crashed concatenating None to a str.
            if proxy is None:
                return '代理池爲空'
            return '可用代理:' + proxy

        # Top-N proxies by score, rendered as an HTML table
        @app.route('/count/<number>')
        def get_proxies(number):
            results = self.client.get_ip_by_score(int(number))
            df = pd.DataFrame({'ip': [x[0] for x in results],
                               'score': [x[1] for x in results]})
            html = df.to_html(classes='data', index=False,
                              bold_rows=False, header=True)
            # Title above the table
            title = '<h3>可用代理表</h3>'
            html = title + html
            # Inline styling for the rendered table
            css = "<style>body{text-align:center;}table {margin:auto;border-collapse: collapse;font-family: Futura, Arial, sans-serif;}caption {font-size: 200;margin: 1em auto;}th,td {padding: .65em;text-align:center;}td {/* border: 1px solid #777; */}tbody tr:nth-child(odd) {background: #ccc;}th:first-child {border-radius: 9px 0 0 0;}th:last-child {border-radius: 0 9px 0 0;}tr:last-child td:first-child {border-radius: 0 0 0 9px;}tr:last-child td:last-child {border-radius: 0 0 9px 0;}</style>"
            html = css + html
            return html

        # NOTE: blocks until the server is stopped.
        app.run()
# Start the web UI; everything after this line is captured server output.
weber = Weber()
database version ('8.0.15',)
* Serving Flask app "__main__" (lazy loading)
* Environment: production
WARNING: This is a development server. Do not use it in a production deployment.
Use a production WSGI server instead.
* Debug mode: off
* Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [09/Jun/2020 00:02:38] "GET /count/10 HTTP/1.1" 200 -
效果