代碼如下:
import re
import urllib.error
import urllib.request
import threading
import time
import random
# User-Agent string that mimics a desktop Chrome browser.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
)

# Captures everything between <div class="limit_width"> and the next </div>;
# re.S lets the capture span multiple lines.
_LIMIT_WIDTH_RE = re.compile(r'<div class="limit_width">(.*?)</div>', re.S)


def use_proxy(url, proxy_addr, timeout=10):
    """Download *url* through a proxy and extract the ``limit_width`` blocks.

    Args:
        url: Address of the page to download.
        proxy_addr: Proxy address, e.g. ``"host:port"``. It is registered for
            both ``http`` and ``https`` URLs.
        timeout: Seconds to wait on blocking network operations before
            giving up.

    Returns:
        A list of strings, one per ``<div class="limit_width">...</div>``
        occurrence in the page, in document order. The list is empty when
        nothing matches.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for an
            HTTP error status).
        OSError: If reading the response body fails or times out.
    """
    # The mapping keys must be bare scheme names ("http", not "http:");
    # otherwise the handler never matches and the proxy is silently skipped.
    proxy = urllib.request.ProxyHandler({"http": proxy_addr, "https": proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Kept for backward compatibility: plain urlopen() calls made after this
    # function has run also go through the same opener.
    urllib.request.install_opener(opener)

    # Send the prepared request (not the bare URL) so the browser-like
    # User-Agent header actually reaches the server.
    request = urllib.request.Request(url=url, headers={"User-Agent": _USER_AGENT})
    with opener.open(request, timeout=timeout) as response:
        # Undecodable bytes are dropped rather than aborting the whole page.
        html = response.read().decode("utf-8", "ignore")
    return _LIMIT_WIDTH_RE.findall(html)
class Onr(threading.Thread):
    """Worker thread that crawls the odd-numbered search result pages."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Fetch pages 1, 3, ..., 19 in order and print what each one yields.

        A page that fails with an ``Exception`` is reported on stdout and
        skipped after a short pause; the loop then moves on to the next page.
        """
        for page in range(1, 20, 2):
            try:
                # Build the search URL for this page number.
                page_url = f"https://so.csdn.net/so/search/s.do?p={page}&q=ios&t=blog&domain=&o=&s=&u=&l=&rbg=0"
                matches = use_proxy(page_url, "1.198.72.44:9999")
                print(f"第{page}頁段子是:{matches}")
                # Pause a random 0-6 seconds (one decimal) between pages.
                time.sleep(round(random.random() * 6, 1))
            except urllib.error.URLError as err:
                # Print whichever of the status code / failure reason the
                # error object carries.
                for attr in ("code", "reason"):
                    if hasattr(err, attr):
                        print(getattr(err, attr))
                time.sleep(2)
            except Exception as err:
                print(f"exception{err}")
                time.sleep(1)
class Two(threading.Thread):
    """Worker thread that crawls the even-numbered search result pages."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Fetch pages 2, 4, ..., 18 in order and print what each one yields.

        A page that fails with an ``Exception`` is reported on stdout and
        skipped after a short pause; the loop then moves on to the next page.
        """
        for page in range(2, 20, 2):
            try:
                # Build the search URL for this page number.
                page_url = f"https://so.csdn.net/so/search/s.do?p={page}&q=ios&t=blog&domain=&o=&s=&u=&l=&rbg=0"
                matches = use_proxy(page_url, "1.198.72.44:9999")
                print(f"第{page}頁段子是:{matches}")
                # Pause a random 0-6 seconds (one decimal) between pages.
                time.sleep(round(random.random() * 6, 1))
            except urllib.error.URLError as err:
                # Print whichever of the status code / failure reason the
                # error object carries.
                for attr in ("code", "reason"):
                    if hasattr(err, attr):
                        print(getattr(err, attr))
                time.sleep(2)
            except Exception as err:
                print(f"exception{err}")
                time.sleep(1)
# Create one worker per page parity: odd pages and even pages.
one, two = Onr(), Two()
# Start both workers so the two page sets are fetched concurrently.
one.start()
two.start()