寫爬蟲時,爲了避免因訪問過於頻繁而被目標網站封禁,常用代理的方法。首先是代理的基本寫法:
# Demo: route a urllib request through an HTTP proxy.
from urllib.request import Request, ProxyHandler, build_opener
from fake_useragent import UserAgent

target = "http://httpbin.org/get"
req = Request(target, headers={"User-Agent": UserAgent().chrome})
# A paid proxy would use the form "username:password@ip:port";
# a free one is just "ip:port", as below.
proxy_handler = ProxyHandler({"http": "119.101.113.58:9999"})
resp = build_opener(proxy_handler).open(req)
print(resp.read().decode())
免費的代理一般要多試幾個才能成功。帶註釋的那一行是付費賬號的寫法,可靠性較高;一般情況下,用幾個免費的就夠了。
初學爬蟲,沒什麼太大用處,但免費的代理有的不能用,本想寫個類來篩選可用的代理,代碼貼在下面。verify() 方法單獨測試時沒問題,但在這裏調用時把爬到的代理全都判斷爲可用並寫入 ip_ok.txt 了。原因在於:測試 URL 固定是 http://,而當代理類型是 https 時,ProxyHandler({"https": ...}) 對 http 請求不生效,urllib 會直接連接目標網站,於是請求總能成功,代理看起來全都"可用";另外 opener.open 沒有設置 timeout,失效代理只會掛起而不是報錯。
from fake_useragent import UserAgent
import requests
from lxml import etree
import time
from urllib.request import Request, ProxyHandler, build_opener
from urllib.error import URLError
# IP代理池
class ProxyPool(object):
    """Scrape free proxies from a listing site and verify which ones work.

    Workflow: get_url() builds the listing-page URLs, get_content() fetches a
    page, get_info() appends "type,ip:port" lines to ip_port.txt, and
    verify() checks a single proxy, appending usable ones to ip_ok.txt.
    """

    @staticmethod
    def get_url(initial_url):
        """Return the listing-page URLs: *initial_url* + page number.

        Pages 1-3 are scraped (the site paginates via a trailing number).
        """
        return [initial_url + str(page) for page in range(1, 4)]

    @staticmethod
    def get_content(page_url):
        """Fetch one listing page with a random User-Agent.

        Returns the HTML text on HTTP 200, otherwise prints the status code
        and returns the sentinel string "error".
        """
        response = requests.get(page_url, headers={"User-Agent": UserAgent().random})
        if response.status_code == 200:
            time.sleep(2)  # throttle between page fetches to stay polite
            return response.text
        print(response.status_code, page_url)
        return "error"

    @staticmethod
    def get_info(html):
        """Parse type/ip/port table rows and append them to ip_port.txt.

        Each appended line has the form "type,ip:port" with the type
        lower-cased. Returns "error" when *html* is the error sentinel.
        """
        if html == "error":
            print("not html page")
            return "error"
        e = etree.HTML(html)
        ips = e.xpath('//tbody/tr/td[1]/text()')
        ports = e.xpath('//tbody/tr/td[2]/text()')
        types = e.xpath('//tbody/tr/td[4]/text()')
        # Proxies scraped from the listing site; usability still unknown.
        with open("ip_port.txt", 'a') as f:
            for type_, ip, port in zip(types, ips, ports):
                f.write("%s,%s:%s\n" % (str(type_).lower(), str(ip), str(port)))

    @staticmethod
    def verify(type_, ip_port):
        """Test whether the proxy *type_* (http/https) at *ip_port* works.

        Usable proxies are appended to ip_ok.txt as a dict-repr line.

        BUG FIX: the test URL's scheme must match the proxy type. With the
        old fixed "http://" URL, a {"https": ...} ProxyHandler is simply
        ignored by urllib, the request goes out *directly*, succeeds, and
        every https proxy was recorded as usable. A timeout is also added so
        dead proxies fail fast instead of hanging (the old sleeps served no
        purpose and were removed).
        """
        test_url = "%s://httpbin.org/get" % type_
        request = Request(test_url, headers={"User-Agent": UserAgent().random})
        proxy = {type_: ip_port}
        print("testing proxy ", proxy)
        opener = build_opener(ProxyHandler(proxy))
        try:
            response = opener.open(request, timeout=10)
            if response.read():
                print("proxy-testing is ok ==>", proxy)
                # Persist the usable proxy.
                with open("ip_ok.txt", "a") as f2:
                    f2.write(str(proxy) + "\n")
            else:
                print("proxy-test failure!")
        except OSError as e:
            # URLError, socket.timeout, ConnectionResetError are all OSError
            # subclasses; a bad proxy can raise any of them.
            print(getattr(e, "reason", e))
if __name__ == '__main__':
    base_url = "https://www.kuaidaili.com/free/inha/"
    pool = ProxyPool()
    # Scrape each listing page and dump raw "type,ip:port" lines.
    for page_url in pool.get_url(base_url):
        pool.get_info(pool.get_content(page_url))
    # Re-read the dump and test every proxy, recording the usable ones.
    with open("ip_port.txt", "r") as proxy_file:
        for line in proxy_file.readlines():
            proxy_type, _, address = line.partition(",")
            pool.verify(proxy_type, address.strip())