使用環境 Python3
使用到的庫
requests,lxml,threading,queue,time
需要額外安裝的庫
requests ,lxml
安裝命令
pip install requests lxml
開始爬蟲
工作流程:
1. 構造 URL 列表
2. 獲取 URL 響應頁面
3. 提取頁面有用數據
4. 保存數據
5. 清洗數據
上代碼
import requests
import threading
from queue import Queue
from lxml import etree
import time
class proxySpider():
    """Multi-threaded scraper that collects proxy "ip:port" pairs from
    xicidaili.com and appends them to ./res2.txt.

    Pipeline, with each stage connected by a Queue:
        url_queue -> parse_url -> page_queue -> get_content_list
                  -> content_queue -> save
    """

    def __init__(self):
        # Listing-page URL prefix; the page number is appended in get_url_list().
        self.url_temp = "https://www.xicidaili.com/nn/"
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.page_queue = Queue()     # raw response bodies (bytes)
        self.content_queue = Queue()  # lists of "ip:port" strings

    def get_url_list(self):
        """Enqueue the first five listing pages."""
        for i in range(1, 6):
            self.url_queue.put(self.url_temp + str(i))

    def parse_url(self):
        """Worker: fetch each URL and push the response body downstream.

        Uses a blocking get() instead of polling empty() so idle workers
        do not busy-wait and burn CPU.
        """
        while True:
            url = self.url_queue.get()
            try:
                resp = requests.get(url, headers=self.headers)
                if resp.status_code == 200:
                    self.page_queue.put(resp.content)
            except requests.RequestException:
                # A network failure for one URL must not kill the worker.
                pass
            finally:
                # Always mark the task done, even on error, so
                # url_queue.join() in run() can return.
                self.url_queue.task_done()

    def get_content_list(self):
        """Worker: extract "ip:port" strings from one fetched page."""
        while True:
            page = self.page_queue.get()
            try:
                html = etree.HTML(page.decode("utf-8"))
                proxy_list = []
                # NOTE(review): only rows with class 'odd' are matched, as in
                # the original; rows without that class are skipped.
                for row in html.xpath("//tr[@class='odd']"):
                    ips = row.xpath("./td[2]/text()")
                    ports = row.xpath("./td[3]/text()")
                    # Bug fix: the original indexed [0] twice ("ip[0]"),
                    # which produced only the FIRST CHARACTER of the ip and
                    # port (e.g. "1:8") instead of the full "ip:port" pair.
                    if ips and ports:
                        proxy_list.append(ips[0] + ":" + ports[0])
                self.content_queue.put(proxy_list)
            finally:
                self.page_queue.task_done()

    def save(self):
        """Worker: append one batch of proxies to ./res2.txt."""
        while True:
            content = self.content_queue.get()
            try:
                print("寫入中...")
                with open("./res2.txt", "a", encoding="utf-8") as f:
                    for each in content:
                        f.write(each + "\n")
            finally:
                self.content_queue.task_done()

    def run(self):
        """Start all worker threads, then wait for every queue to drain."""
        thread_list = []
        self.get_url_list()
        thread_list.extend([threading.Thread(target=self.parse_url) for i in range(3)])
        thread_list.extend([threading.Thread(target=self.get_content_list) for i in range(3)])
        thread_list.extend([threading.Thread(target=self.save) for i in range(2)])
        for each in thread_list:
            # daemon attribute replaces the deprecated setDaemon(); daemon
            # threads let the process exit once the queues have joined.
            each.daemon = True
            each.start()
        # Each join() returns once every put() has a matching task_done().
        for q in [self.url_queue, self.page_queue, self.content_queue]:
            q.join()
if __name__ == "__main__":
    # perf_counter() is monotonic and higher-resolution than time.time(),
    # so it is the correct clock for measuring elapsed time.
    start_time = time.perf_counter()
    spider = proxySpider()
    spider.run()
    end_time = time.perf_counter()
    cost_time = end_time - start_time
    print("時間開銷: {}".format(cost_time))
代理ip有效性檢查類
class checkIp():
    """Validate scraped proxies by issuing a probe request through each one.

    Reads candidate "ip:port" lines from ./res2.txt and appends the
    working ones to ./valid_ips.txt.
    """

    def __init__(self):
        self.url = 'http://www.baidu.com'  # probe target
        self.headers = {"User-Agent": "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"}
        self.ip_queue = Queue()        # candidates waiting to be tested
        self.ip_valid_queue = Queue()  # proxies that answered with 200

    def get_ip(self):
        """Load candidate proxies from disk into ip_queue.

        Iterates the whole file and skips blank lines. The original
        stopped at the first blank line (missing later entries) and only
        closed the file on that path; the with-block guarantees closure.
        """
        with open("./res2.txt", "r", encoding="utf-8") as f:
            for line in f:
                ip_temp = line.strip()
                if ip_temp:
                    self.ip_queue.put(ip_temp)

    def check_ip(self):
        """Worker: GET through each proxy; keep those returning HTTP 200."""
        url = self.url
        while True:
            ip_temp = self.ip_queue.get()
            proxy_ip = {"https": "https://" + ip_temp, "http": "http://" + ip_temp}
            try:
                # verify=False: scraped proxies rarely present valid TLS
                # certificates; timeout=2 keeps dead proxies from stalling.
                resp = requests.get(url, headers=self.headers, proxies=proxy_ip, verify=False, timeout=2)
                if resp.status_code == 200:
                    self.ip_valid_queue.put(ip_temp)
            except requests.RequestException:
                # Dead/refusing proxies are expected; just drop them.
                pass
            finally:
                self.ip_queue.task_done()

    def save(self):
        """Worker: append each validated proxy to a separate output file.

        Bug fix: the original appended back into ./res2.txt — the same
        file get_ip() reads — so validated IPs were mixed with (and could
        be re-read as) unchecked candidates. Blocking get() also replaces
        the original busy-wait that reopened the file on every poll.
        """
        while True:
            ip = self.ip_valid_queue.get()
            try:
                with open("./valid_ips.txt", "a", encoding="utf-8") as f:
                    f.write(ip + '\n')
            finally:
                self.ip_valid_queue.task_done()

    def run(self):
        """Start reader/checker/writer threads and wait for queues to drain."""
        thread_list = []
        thread_list.extend([threading.Thread(target=self.get_ip) for i in range(1)])
        thread_list.extend([threading.Thread(target=self.check_ip) for i in range(3)])
        thread_list.extend([threading.Thread(target=self.save) for i in range(4)])
        for each in thread_list:
            # daemon attribute replaces the deprecated setDaemon().
            each.daemon = True
            each.start()
        for i in [self.ip_queue, self.ip_valid_queue]:
            i.join()
遇到的問題與總結:
- 驗證代理時出現 報錯: requests.exceptions.ProxyError: HTTPConnectionPool(host='113.65.5.186', port…
原因: 是因爲我的機器啓用代理
解決方法: 進入電腦設置 關閉代理, 再啓動 代理 ip有效性測試程序
- 有待解決問題: 隊列應當設置上限 (特別是在大項目中). 本人在設置上限後,不能有效地解決出現的bug,所以放棄了機器資源的節省管理.(但這只適用於小文本資源的程序)
最後,歡迎各位看官提出建議.