如何使用 IP 代理爬蟲

import urllib
import socket
import urllib2
import time
from bs4 import BeautifulSoup


# Proxy-list crawler (Python 2 / urllib2).
#
# Walks the numbered listing pages under `url`, collects an ip -> port mapping
# in `dirt`, and writes every pair to proxy.txt as one "ip:port" line.

url = 'http://www.xicidaili.com/nn/'
target = "https://msdn.microsoft.com"  # not used in this part of the script
dirt = {}

# Install a global opener so that every later urllib2 request for an http://
# URL is sent through this fixed proxy.
proxy = {'http': '223.15.151.149:8888'}
proxy_support = urllib2.ProxyHandler(proxy)
# For wire-level debugging, additionally pass
# urllib2.HTTPHandler(debuglevel=1) to build_opener().
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)

# Send browser-like headers so the site does not answer with 403 Forbidden.
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}


# The context manager guarantees proxy.txt is flushed and closed even when a
# request or a parse step raises part-way through the crawl.
with open("proxy.txt", "w") as f:
    for i in range(1, 1504):  # listing pages 1..1503
        new_url = url + str(i)
        print(new_url)
        time.sleep(3)  # pause before each page request
        req = urllib2.Request(new_url, headers=i_headers)
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html, "html.parser")
        ips = soup.find_all('tr')

        # The first <tr> on each page is skipped (presumably the table header).
        for ip in ips[1:]:
            tds = ip.find_all("td")
            if len(tds) < 3:
                # Rows without both an ip cell (index 1) and a port cell
                # (index 2) would raise IndexError below; skip them.
                continue
            dirt[tds[1].text] = tds[2].text
            f.write(tds[1].text + ":" + tds[2].text + "\n")
print(len(dirt))
# NOTE(review): this default timeout only applies to sockets created after this
# line, so it has no effect on the crawl above; presumably it is meant for code
# that follows this section -- confirm.
socket.setdefaulttimeout(3)




發佈了78 篇原創文章 · 獲贊 28 · 訪問量 9萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章