import requests
from lxml import etree

# Scrape the chinaz.com Alexa China ranking (20 pages x 25 entries = 500
# domains) and append each domain name to a local text file.
link_head = 'https://alexa.chinaz.com/Country/index_CN'
link_end = '.html'
hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

c = 1
# Open the output file ONCE; the original re-opened and re-closed it for
# every single entry (500 open/close cycles) and never used `with`.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "a+") as f:
    for i in range(1, 21):
        # Page 1 has no "_<n>" suffix in its URL; later pages do.
        if i == 1:
            link = link_head + link_end
        else:
            link = link_head + '_' + str(i) + link_end
        r = requests.get(link, headers=hd)
        html = etree.HTML(r.text)
        path1 = '/html/body/div[2]/div[4]/ul/li['
        path2 = ']/div[2]/h3/a[1]/text()'
        for j in range(1, 26):
            web_list = html.xpath(path1 + str(j) + path2)
            # Guard: the original indexed web_list[0] unconditionally and
            # crashed with IndexError whenever the XPath matched nothing
            # (layout change or a short final page).
            if not web_list:
                continue
            print(c, web_list[0])
            c += 1
            f.write(web_list[0] + '\n')
import requests
import time

# Serial baseline: fetch every saved domain one-by-one and time the total,
# for comparison against the threaded versions below.
link_list = []
# `with` guarantees the handle is closed; the original leaked it.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "r") as f:
    web_list = f.readlines()
for each in web_list:
    link = each.replace('\n', "")
    link_list.append(link)

start = time.time()
link_head = "http://"
for each in link_list:
    try:
        # timeout added: without it a single hung server stalls the whole
        # run forever (the threaded crawler below already uses timeout=20).
        r = requests.get(link_head + each, timeout=20)
        print(r.status_code, each)
    except Exception as e:
        print('error:', e)
end = time.time()
print('串行的總時間爲:', end - start)
# 並行 (parallel version)
import threading
import requests
import time

# Threaded crawler: the 500 links are split into 5 fixed index ranges,
# with one worker thread per range.
link_list = []
# `with` guarantees the handle is closed; the original leaked it.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "r") as f:
    web_list = f.readlines()
link_head = "http://"
for each in web_list:
    link = each.replace('\n', '')
    link_list.append(link)

start = time.time()
class myThread (threading.Thread):
    """Worker thread that crawls one fixed slice of link_list via crawler()."""

    def __init__(self, name, link_range):
        super().__init__()
        self.name = name              # label printed with progress output
        self.link_range = link_range  # inclusive (start, end) index pair

    def run(self):
        print('starting ', self.name)
        crawler(self.name, self.link_range)
        print('exiting ', self.name)
def crawler(threadName, link_range):
    """Fetch link_list entries from link_range[0] to link_range[1] inclusive,
    printing each HTTP status (or the error) tagged with the thread's name.

    Reads the module-level globals link_head and link_list.
    """
    lo, hi = link_range
    for idx in range(lo, hi + 1):
        try:
            resp = requests.get(link_head + link_list[idx], timeout=20)
            print(threadName, resp.status_code, link_list[idx])
        except Exception as e:
            print(threadName, 'Error', e)
# Launch five workers over five equal, inclusive index ranges (0..499).
# BUG FIXED: the original ranges (0,100),(101,200),...,(401,500) covered
# 501 indices for a 500-entry list, so index 500 raised IndexError inside
# crawler, and the slices were unequal (101 links vs 100).
thread_list = []
link_range_list = [(0, 99), (100, 199), (200, 299), (300, 399), (400, 499)]
for i in range(1, 6):
    thread = myThread("thread-" + str(i), link_range_list[i - 1])
    thread.start()
    thread_list.append(thread)
# Block until every worker finishes before reporting the elapsed time.
for thread in thread_list:
    thread.join()
end = time.time()
print('簡單多線程爬蟲時間爲:', end - start)
print('Exiting main thread')
# 時間有點快 (noticeably faster than the serial run)
# The code above can still be improved: because the link list was split into
# five equal slices, a thread that finishes its ~200 pages early simply
# exits, leaving only four threads running.  Throughput keeps dropping until,
# near the end, a single thread is doing all the remaining work.
# Is there a way to keep all five threads running at full speed until every
# one of the 1000 pages has been fetched?  Use a Queue.  Python's queue
# module provides synchronized, thread-safe queue classes: the FIFO Queue,
# the LIFO LifoQueue, and the PriorityQueue.
import threading
import requests
import time
import queue as Queue

# Queue-based crawler: all five workers pull URLs from one shared FIFO
# queue, so every thread stays busy until the queue is fully drained.
link_list = []
link_head = "http://"
# `with` guarantees the handle is closed; the original leaked it.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "r") as f:
    web_list = f.readlines()
for each in web_list:
    link = each.replace('\n', '')
    link_list.append(link)

start = time.time()
class myThread(threading.Thread):
    """Worker that repeatedly pulls URLs from the shared queue until empty.

    Parameters:
        name: label printed alongside each fetched URL.
        q: the shared Queue.Queue of URLs to crawl.
    """

    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print("start ", self.name)
        while True:
            try:
                crawler(self.name, self.q)
            # BUG FIXED: the original bare `except:` also swallowed
            # KeyboardInterrupt, SystemExit and genuine programming errors.
            # Only queue.Empty -- raised by q.get(timeout=2) once the queue
            # is drained -- is the intended stop signal.
            except Queue.Empty:
                break
        print("exiting ", self.name)
def crawler(threadName, q):
    """Pull one URL off the queue and fetch it, printing the remaining queue
    size, the thread name, and the HTTP status (or the error).

    Raises queue.Empty (from q.get) when no URL arrives within 2 seconds;
    the worker threads use that exception as their stop signal.  Reads the
    module-level global link_head.
    """
    url = q.get(timeout=2)
    try:
        resp = requests.get(link_head + url, timeout=20)
        print(q.qsize(), threadName, resp.status_code, url)
    except Exception as err:
        print(q.qsize(), threadName, url, "Error ", err)
# Five named workers share one bounded FIFO queue pre-loaded with every URL.
threadList = ['Thread-1', 'Thread-2', 'Thread-3', 'Thread-4', 'Thread-5']
workQueue = Queue.Queue(1000)
for url in link_list:
    workQueue.put(url)

# Start every worker.
threads = []
for tName in threadList:
    worker = myThread(tName, workQueue)
    worker.start()
    threads.append(worker)

# Wait until all workers have drained the queue and exited.
for t in threads:
    t.join()

end = time.time()
print("總時間爲 ", end - start)
print('Exiting main thread')