python爬蟲利用requests製作代理池

爬取代理然後驗證代理,將可用代理放入txt文件。

辣雞編碼,大佬們輕噴,有問題留言。。。。。。。謝謝。
結果如圖
（此處原文為運行結果截圖，未能隨文字一併提取）

import requests
from scrapy import Selector

# First listing page of the 89ip.cn free-proxy site (crawl entry point).
start_url = 'http://www.89ip.cn/index_1.html'
# Template for subsequent listing pages; `.format(page_number)` fills it in.
url = 'http://www.89ip.cn/index_{}.html'

# Browser-like User-Agent so the site serves the normal HTML page to the scraper.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

class MyProxy(object):
	"""Scrape free proxies from 89ip.cn, validate each one, and append
	working proxies (``ip:port``, one per line) to ``proxy.txt``."""

	def GetPage(self, url):
		"""Fetch *url* and return the page source as text."""
		# Timeout so a slow/dead site cannot hang the crawl indefinitely.
		response = requests.get(url=url, headers=headers, timeout=10)
		return response.text

	def GetInfo(self, text):
		"""Extract (ip, port) pairs from a listing page and test each one."""
		selector = Selector(text=text)
		FindTable = selector.xpath('//div[@class="layui-form"]/table/tbody/tr')
		for proxy in FindTable:
			# .get() returns None for malformed rows; skip instead of crashing
			# (the original ``"".join(None)`` would raise TypeError).
			raw_ip = proxy.xpath('.//td[1]/text()').get()
			raw_port = proxy.xpath('.//td[2]/text()').get()
			if raw_ip is None or raw_port is None:
				continue
			ip = raw_ip.replace('\t', '').replace('\n', '').strip()
			port = raw_port.replace('\t', '').replace('\n', '').strip()
			print(ip, port)
			self.TestIP(ip, port)

	def TabPage(self, text):
		"""Read the next page number from the pager and store the next
		listing URL in ``self.new_url``."""
		selector = Selector(text=text)
		page = selector.xpath('//*[@id="layui-laypage-1"]/a[8]/@data-page').get()
		if page is None:
			# No next page — raise so the caller's try/except ends the crawl
			# instead of silently fetching 'index_None.html'.
			raise ValueError('no next page found in pager')
		self.new_url = url.format(page)

	def TestIP(self, ip, port):
		"""Validate one proxy by fetching baidu.com through it; append
		working proxies to proxy.txt.

		Bug fix: the original passed ``proxies={"http": "ip:port"}`` while
		requesting an ``https://`` URL, so the proxy was never actually used
		and every address appeared to work.  Supply both schemes with an
		explicit ``http://`` prefix so the request really goes through the
		proxy.
		"""
		proxy_url = 'http://{}:{}'.format(ip, port)
		try:
			response = requests.get(
				url='https://www.baidu.com/',
				headers=headers,
				proxies={'http': proxy_url, 'https': proxy_url},
				timeout=5,  # a usable proxy must answer quickly
			)
			print(response.status_code)
			# `< 200 or > 200` in the original is just a convoluted `!= 200`.
			if response.status_code != 200:
				print("訪問失敗")
			else:
				# Context manager guarantees the handle is closed on any path.
				with open('proxy.txt', 'a+') as f:
					f.write('{}:{}\n'.format(ip, port))
		except Exception as e:
			print("訪問失敗")

	def close(self):
		"""Kept for backward compatibility: writes now use a context manager,
		so there is usually nothing left open.  Guard against the original's
		AttributeError when no proxy ever validated."""
		file = getattr(self, 'file', None)
		if file is not None:
			file.close()
# Crawl loop: fetch a listing page, harvest/test its proxies, advance to the
# next page until navigation fails.
mypoxy = MyProxy()
text = mypoxy.GetPage(start_url)
while True:
	try:
		mypoxy.GetInfo(text)
		# Bug fix: the original called GetPage(text) here, passing page HTML
		# as if it were a URL; TabPage(text) is what computes mypoxy.new_url.
		# Without it, new_url was never set and the loop spun forever in the
		# except branch.
		mypoxy.TabPage(text)
		text = mypoxy.GetPage(mypoxy.new_url)
	except Exception as e:
		print('**' * 10)
		# Stop instead of looping forever once pages run out or a request fails.
		break

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章