目錄
一、背景
在爬蟲的時候,可能IP會被封掉!!!怎樣子才能解決此問題呢?
代理IP池
由於題目的原因,我想先給大家看看,UA代理池是如何實現的!!!
二、UA代理池
2.1資源準備
首先準備一個列表存放多個UA:
# User-Agent pool: each request draws one of these at random so that
# consecutive requests do not all present the same browser signature.
agent = [
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
# NOTE(review): the UA below is missing its closing ')' — looks like a
# copy/paste typo; harmless (servers treat the UA as an opaque string),
# but worth confirming against the original list.
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
]
2.2頭部生成
開始構造頭部:
備註:
由於我這代碼是我代理IP池構建的過程中寫的(生成頭部我用了一個Utils類),所以我直接貼出我的源代碼;
getHeaders函數用於構造頭部
class Utils(object):
    """Shared helpers for building HTTP requests."""

    def getHeaders(self):
        """Return a headers dict whose User-Agent is drawn at random
        from the module-level ``agent`` pool.
        """
        # random.choice is the idiomatic (and cheaper) way to pick one
        # element; random.sample(agent, 1)[0] built a throwaway list.
        return {'User-Agent': random.choice(agent)}
...
構造原理:
隨機生成一個數字,獲取列表之中的值!!!
2.3請求
# Issue the GET with a randomly chosen User-Agent from the pool.
# (url and prms are assumed to be defined by the surrounding script.)
utils = Utils()
request = requests.get(url, headers=utils.getHeaders(), params = prms)
三、代理IP池
現在大家已經瞭解完了UA代理池的基本原理!其實代理IP池的請求也和上面差不多!UA可以直接從瀏覽器或者其他地方獲取,但是代理IP該如何獲取呢?其實大家可以百度搜索“免費代理IP”,找到其網站直接爬回來即可!!!
備註:
這裏我使用的是:快代理
3.1抓取代理IP
備註:
我們需要抓取的是IP、PORT、類型、響應時間!!!
由於本文章是專屬代理構建的,在這裏我不教學該如何去抓取數據(代理IP),我直接貼上我的代碼:
class Spider(object):
    """Fetches and parses the free-proxy listing pages."""

    def __init__(self, url):
        # Base listing URL; send_request appends the page number to it.
        self.url = url

    def send_request(self, prms=None, url='', page=1):
        """Request one listing page and return its HTML text.

        :param prms: optional query-string parameters for the request
        :param url: base URL; falls back to ``self.url`` when falsy
        :param page: page number appended to the base URL
        """
        # A mutable default ({}) would be shared across all calls in
        # Python; use None as the sentinel and create a fresh dict.
        if prms is None:
            prms = {}
        if not url:
            url = self.url
        url = url + str(page)
        print(url)
        utils = Utils()
        request = requests.get(url, headers=utils.getHeaders(), params=prms)
        return request.text

    def parse_request(self, text):
        """Parse a listing page into a list of proxy records.

        :param text: HTML of one listing page
        :returns: list of dicts with keys ip/port/request_type/time,
                  scraped from the <td data-title=...> table cells
        """
        soup = BeautifulSoup(text, 'lxml')
        rows = soup.find('tbody').find_all('tr')
        return [
            {
                'ip': row.find('td', attrs={"data-title": "IP"}).get_text(),
                'port': row.find('td', attrs={"data-title": "PORT"}).get_text(),
                'request_type': row.find('td', attrs={"data-title": "類型"}).get_text(),
                # Keep only the numeric part of the response-time cell.
                'time': re.search(r"(\d+\.?\d*)", row.find('td', attrs={"data-title": "響應速度"}).get_text()).group(1),
            }
            for row in rows
        ]
3.2測試代理IP可用性
通過抓回來的IP,直接用其請求一個網站,假如有響應 => 代理IP可用!!!
class Test(object):
    """Validates that a scraped proxy can actually carry traffic."""

    def check_ip(self, req_type, ip, port):
        """Return True when the proxy answers a probe request with 200.

        :param req_type: protocol type, e.g. 'HTTP' or 'HTTPS'
        :param ip: proxy IP address
        :param port: proxy port, as a string
        """
        utils = Utils()
        scheme = req_type.lower()
        proxies = {scheme: scheme + '://' + ip + ':' + port}
        try:
            request = requests.get("https://www.baidu.com",
                                   headers=utils.getHeaders(),
                                   proxies=proxies,
                                   timeout=1)
        except requests.exceptions.RequestException:
            # Dead or slow proxies raise (Timeout, ConnectionError, ...);
            # previously that crashed the caller instead of reporting
            # the proxy as unusable.
            return False
        return request.status_code == 200
3.3存儲數據
將抓取回來的IP進行存儲,這裏我使用的是CSV!!!
class Save(object):
    """Persists proxies that pass the liveness check to a CSV file."""

    def __init__(self, path):
        # NOTE(review): path is stored but csv_save writes the hard-coded
        # relative 'ip.csv'; kept as-is so existing callers still find
        # the file in the working directory — confirm intent.
        self.path = path

    def csv_save(self, ip_list):
        """Append one proxy record to ip.csv if the proxy is usable.

        :param ip_list: a single record dict with keys
                        ip/port/request_type/time (despite the name,
                        this is one record, not a list)
        """
        # Removed an unused `utils = Utils()` local from the original.
        test = Test()
        if test.check_ip(ip_list['request_type'], ip_list['ip'], ip_list['port']):
            with open('ip.csv', mode='a', newline='') as f:
                writer = csv.DictWriter(f, csv_headers)
                writer.writerow(ip_list)
        else:
            print("此ip不可用!!!")
在這裏,代理IP池就已經構造完畢了!!!如何使用請看下一節!!!
下面是我全部代碼:
import requests
import random
import re
import os
import csv
import time
from bs4 import BeautifulSoup

# Free proxy listing (kuaidaili); the page number is appended to this base.
url = 'https://www.kuaidaili.com/free/inha/'

# User-Agent pool: each request presents a randomly chosen browser signature.
agent = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    # NOTE(review): the UA below is missing its closing ')' — likely a typo.
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
]

# Default query-string parameters for the listing request.
params = {}

# Column order for ip.csv.
csv_headers = ['ip', 'port', 'request_type', 'time']


class Utils(object):
    """Shared helpers for building HTTP requests."""

    def getHeaders(self):
        """Return headers with a random User-Agent from the pool."""
        # random.choice is the idiomatic single-element pick.
        return {'User-Agent': random.choice(agent)}

    def get_path(self):
        """Return the current working directory."""
        return os.getcwd()


class Spider(object):
    """Fetches and parses the free-proxy listing pages."""

    def __init__(self, url):
        # Base listing URL; send_request appends the page number to it.
        self.url = url

    def send_request(self, prms=None, url='', page=1):
        """Request one listing page and return its HTML text.

        :param prms: optional query-string parameters
        :param url: base URL; falls back to ``self.url`` when falsy
        :param page: page number appended to the base URL
        """
        # A mutable default ({}) is shared across calls; use None.
        if prms is None:
            prms = {}
        if not url:
            url = self.url
        url = url + str(page)
        print(url)
        utils = Utils()
        request = requests.get(url, headers=utils.getHeaders(), params=prms)
        return request.text

    def parse_request(self, text):
        """Parse a listing page into a list of proxy record dicts."""
        soup = BeautifulSoup(text, 'lxml')
        rows = soup.find('tbody').find_all('tr')
        return [
            {
                'ip': row.find('td', attrs={"data-title": "IP"}).get_text(),
                'port': row.find('td', attrs={"data-title": "PORT"}).get_text(),
                'request_type': row.find('td', attrs={"data-title": "類型"}).get_text(),
                # Keep only the numeric part of the response-time cell.
                'time': re.search(r"(\d+\.?\d*)", row.find('td', attrs={"data-title": "響應速度"}).get_text()).group(1),
            }
            for row in rows
        ]


class Test(object):
    """Validates that a scraped proxy can actually carry traffic."""

    def check_ip(self, req_type, ip, port):
        """Return True when the proxy answers a probe request with 200."""
        utils = Utils()
        scheme = req_type.lower()
        proxies = {scheme: scheme + '://' + ip + ':' + port}
        try:
            request = requests.get("https://www.baidu.com",
                                   headers=utils.getHeaders(),
                                   proxies=proxies,
                                   timeout=1)
        except requests.exceptions.RequestException:
            # Dead/slow proxies raise; report them as unusable
            # instead of crashing the caller.
            return False
        return request.status_code == 200


class Save(object):
    """Persists proxies that pass the liveness check to a CSV file."""

    def __init__(self, path):
        # NOTE(review): stored but unused — csv_save writes the
        # hard-coded relative 'ip.csv'.
        self.path = path

    def csv_save(self, ip_list):
        """Append one proxy record (a dict) to ip.csv if it is usable."""
        test = Test()
        if test.check_ip(ip_list['request_type'], ip_list['ip'], ip_list['port']):
            with open('ip.csv', mode='a', newline='') as f:
                writer = csv.DictWriter(f, csv_headers)
                writer.writerow(ip_list)
        else:
            print("此ip不可用!!!")


# Entry point: scrape one listing page and store every usable proxy.
if __name__ == '__main__':
    spider = Spider(url)
    utils = Utils()
    save = Save(utils.get_path())
    # Write the CSV header row.
    # NOTE(review): mode 'a' appends a new header on every run; switch
    # to 'w' if a fresh file per run is wanted.
    with open('ip.csv', mode='a', newline='') as f:
        writer = csv.DictWriter(f, csv_headers)
        writer.writeheader()
    # The original passed 0 positionally into the `url` parameter and
    # relied on 0 being falsy; say what is meant instead.
    page_list = spider.parse_request(spider.send_request(params, page=1))
    for ip in page_list:
        save.csv_save(ip)
        time.sleep(1)
四、代理IP池使用
這裏我主要是通過get_proxies函數從CSV文件獲取代理IP,並且通過get_proxies_agent函數生成proxies,兩個函數具體信息如下:
def get_proxies(self):
    """Load proxy records from ip.csv into the global proxies_agent list.

    Each row is appended as-is: [ip, port, request_type, time].
    """
    # newline='' is the csv module's documented way to open files; it
    # prevents row corruption on platforms that translate line endings.
    with open('ip.csv', 'r', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            proxies_agent.append(row)  # proxies_agent is a module-level list
    print("獲取代理ip成功!!!")
def get_proxies_agent(self):
    """Pick one stored proxy at random and return a requests-style
    proxies dict, e.g. {'http': 'http://ip:port'}.
    """
    # Row layout comes from ip.csv: [ip, port, request_type, time].
    # random.choice replaces the roundabout random.sample(..., 1)[0].
    tmp = random.choice(proxies_agent)
    scheme = tmp[2].lower()
    proxies = {
        scheme: scheme + '://' + str(tmp[0]) + ':' + str(tmp[1])
    }
    print('本次請求的代理ip:' + proxies[scheme])
    return proxies
使用代理IP:
# Issue a request through a random proxy with a random User-Agent.
utils = Utils()
# Fixed: the Utils class defined above exposes getHeaders (camelCase);
# the original snippet called get_headers, which would raise AttributeError.
request = requests.get(url, headers=utils.getHeaders(), proxies=utils.get_proxies_agent(), params=prms)