import requests
from lxml import etree

# Scrape the chinaz.com Alexa China ranking (20 pages x 25 entries = 500
# domains) and append each domain name to a local text file.
link_head = 'https://alexa.chinaz.com/Country/index_CN'
link_end = '.html'
hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

c = 1
# Open the output file ONCE; the original re-opened and re-closed it for
# every single entry (500 open/close cycles) and never used `with`.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "a+") as f:
    for i in range(1, 21):
        # Page 1 has no "_<n>" suffix in its URL; later pages do.
        if i == 1:
            link = link_head + link_end
        else:
            link = link_head + '_' + str(i) + link_end
        r = requests.get(link, headers=hd)
        html = etree.HTML(r.text)
        path1 = '/html/body/div[2]/div[4]/ul/li['
        path2 = ']/div[2]/h3/a[1]/text()'
        for j in range(1, 26):
            web_list = html.xpath(path1 + str(j) + path2)
            # Guard: the original indexed web_list[0] unconditionally and
            # crashed with IndexError whenever the XPath matched nothing
            # (layout change or a short final page).
            if not web_list:
                continue
            print(c, web_list[0])
            c += 1
            f.write(web_list[0] + '\n')
import requests
import time

# Serial baseline: fetch every saved domain one-by-one and time the total,
# for comparison against the threaded versions below.
link_list = []
# `with` guarantees the handle is closed; the original leaked it.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "r") as f:
    web_list = f.readlines()
for each in web_list:
    link = each.replace('\n', "")
    link_list.append(link)

start = time.time()
link_head = "http://"
for each in link_list:
    try:
        # timeout added: without it a single hung server stalls the whole
        # run forever (the threaded crawler below already uses timeout=20).
        r = requests.get(link_head + each, timeout=20)
        print(r.status_code, each)
    except Exception as e:
        print('error:', e)
end = time.time()
print('串行的總時間爲:', end - start)
# 並行 (parallel version)
import threading
import requests
import time

# Threaded crawler: the 500 links are split into 5 fixed index ranges,
# with one worker thread per range.
link_list = []
# `with` guarantees the handle is closed; the original leaked it.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "r") as f:
    web_list = f.readlines()
link_head = "http://"
for each in web_list:
    link = each.replace('\n', '')
    link_list.append(link)

start = time.time()
class myThread (threading.Thread):
    """Worker thread that crawls one fixed slice of link_list via crawler()."""

    def __init__(self, name, link_range):
        super().__init__()
        self.name = name              # label printed with progress output
        self.link_range = link_range  # inclusive (start, end) index pair

    def run(self):
        print('starting ', self.name)
        crawler(self.name, self.link_range)
        print('exiting ', self.name)
def crawler(threadName, link_range):
    """Fetch link_list entries from link_range[0] to link_range[1] inclusive,
    printing each HTTP status (or the error) tagged with the thread's name.

    Reads the module-level globals link_head and link_list.
    """
    lo, hi = link_range
    for idx in range(lo, hi + 1):
        try:
            resp = requests.get(link_head + link_list[idx], timeout=20)
            print(threadName, resp.status_code, link_list[idx])
        except Exception as e:
            print(threadName, 'Error', e)
# Launch five workers over five equal, inclusive index ranges (0..499).
# BUG FIXED: the original ranges (0,100),(101,200),...,(401,500) covered
# 501 indices for a 500-entry list, so index 500 raised IndexError inside
# crawler, and the slices were unequal (101 links vs 100).
thread_list = []
link_range_list = [(0, 99), (100, 199), (200, 299), (300, 399), (400, 499)]
for i in range(1, 6):
    thread = myThread("thread-" + str(i), link_range_list[i - 1])
    thread.start()
    thread_list.append(thread)
# Block until every worker finishes before reporting the elapsed time.
for thread in thread_list:
    thread.join()
end = time.time()
print('簡單多線程爬蟲時間爲:', end - start)
print('Exiting main thread')
# 時間有點快 (noticeably faster than the serial run)
# The code above can still be improved: because the link list was split into
# five equal slices, a thread that finishes its ~200 pages early simply
# exits, leaving only four threads running.  Throughput keeps dropping until,
# near the end, a single thread is doing all the remaining work.
# Is there a way to keep all five threads running at full speed until every
# one of the 1000 pages has been fetched?  Use a Queue.  Python's queue
# module provides synchronized, thread-safe queue classes: the FIFO Queue,
# the LIFO LifoQueue, and the PriorityQueue.
import threading
import requests
import time
import queue as Queue

# Queue-based crawler: all five workers pull URLs from one shared FIFO
# queue, so every thread stays busy until the queue is fully drained.
link_list = []
link_head = "http://"
# `with` guarantees the handle is closed; the original leaked it.
with open(r"C:\Users\Heisenberg\Desktop\newfile.txt", "r") as f:
    web_list = f.readlines()
for each in web_list:
    link = each.replace('\n', '')
    link_list.append(link)

start = time.time()
class myThread(threading.Thread):
    """Worker that repeatedly pulls URLs from the shared queue until empty.

    Parameters:
        name: label printed alongside each fetched URL.
        q: the shared Queue.Queue of URLs to crawl.
    """

    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print("start ", self.name)
        while True:
            try:
                crawler(self.name, self.q)
            # BUG FIXED: the original bare `except:` also swallowed
            # KeyboardInterrupt, SystemExit and genuine programming errors.
            # Only queue.Empty -- raised by q.get(timeout=2) once the queue
            # is drained -- is the intended stop signal.
            except Queue.Empty:
                break
        print("exiting ", self.name)
def crawler(threadName, q):
    """Pull one URL off the queue and fetch it, printing the remaining queue
    size, the thread name, and the HTTP status (or the error).

    Raises queue.Empty (from q.get) when no URL arrives within 2 seconds;
    the worker threads use that exception as their stop signal.  Reads the
    module-level global link_head.
    """
    url = q.get(timeout=2)
    try:
        resp = requests.get(link_head + url, timeout=20)
        print(q.qsize(), threadName, resp.status_code, url)
    except Exception as err:
        print(q.qsize(), threadName, url, "Error ", err)
# Five named workers share one bounded FIFO queue pre-loaded with every URL.
threadList = ['Thread-1', 'Thread-2', 'Thread-3', 'Thread-4', 'Thread-5']
workQueue = Queue.Queue(1000)
for url in link_list:
    workQueue.put(url)

# Start every worker.
threads = []
for tName in threadList:
    worker = myThread(tName, workQueue)
    worker.start()
    threads.append(worker)

# Wait until all workers have drained the queue and exited.
for t in threads:
    t.join()

end = time.time()
print("總時間爲 ", end - start)
print('Exiting main thread')