人民日報語料庫抓取python實現（二）--多線程

原創

2018-08-25 22:40

由於爬取過程有大量的IO等待，多線程可以提高爬取的效率。出於讓不同隊列存儲不同url、並對爬蟲進行分工的初衷，這裏實現了兩個隊列shareMonthQueue和shareReportQueue。其中shareMonthQueue存儲所有月份的初始url及該月份包含的其他分頁url（一個月份有很多page，例：1946年5月包含30個page）；shareReportQueue存儲所有新聞的url。兩個隊列各有其專用的爬蟲：monthSplider（代碼中的類名即如此拼寫）和reportSpider。師兄說：從操作系統的角度來看，兩個隊列是多此一舉，增加代碼複雜度，並不提高效率。我想了想，師兄說的對。

上代碼：

#coding:utf-8
#author:zhangyang
#date:2015-5-21
#此程序用於爬取人民日報下的數據資源。主頁面需要提取包括1946年到2003年之間所有月份
#次級頁面是各個月份的所有報道
#末級頁面是報道內容
#使用多線程提高爬取效率

import urllib2,bs4,os,re
from time import clock
import threading,Queue

#關於bs4解析url的方法可以參看：http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html


# Module-level configuration and state shared by all crawler threads.
starturl="http://rmrbw.info/"
shareMonthQueue=Queue.Queue()  # shared queue holding month listing-page URLs
shareReportQueue=Queue.Queue() # shared queue holding news-article URLs
_WORK_MONTH_THREAD_NUM=3       # number of crawler threads handling month URLs
_WORK_REPORT_THREAD_NUM_=10    # number of crawler threads handling article URLs
totalNum=0  # global counter, incremented by reportSpider under `mutex`
mutex=threading.Lock() # mutex guarding totalNum
tlist=[]  # list of crawler threads; filled, started and joined by main()
# Timing marks: main() sets t1, t2 and t3; reportSpider updates t4 when it
# prints progress.
t1=clock()
t2=clock()
t3=clock()
t4=clock()


class monthSplider(threading.Thread):
	"""Worker thread that crawls month listing pages.

	Each URL taken from ``shareMonthQueue`` is one page of a month's article
	listing. For every page the worker creates the month's output directory,
	pushes every article URL found on the page onto ``shareReportQueue`` and,
	if more listing pages remain, pushes the URL of the next page back onto
	``shareMonthQueue``.

	The thread exits once ``shareMonthQueue`` has stayed empty for
	``TIMEOUT`` seconds.
	"""

	def __init__(self,name,dicPath = os.getcwd()+os.path.sep+"data"+os.path.sep):
		"""Create the worker.

		name    -- thread name, also appended to error messages
		dicPath -- output root; month directory names are appended to it
		           directly, so it should end with a path separator
		"""
		threading.Thread.__init__(self)
		self.name=name
		self.dicPath=dicPath
		self.TIMEOUT=10   # seconds to wait on an empty queue before exiting

	def run(self):
		"""Consume month URLs until the queue stays empty for TIMEOUT seconds."""
		while True:
			# A blocking get() with a timeout replaces the empty()/get() pair:
			# with several workers, another thread could take the last item
			# between the two calls and leave this one blocked forever. It
			# also avoids spinning on the CPU while the queue is empty.
			try:
				monthurl=shareMonthQueue.get(timeout=self.TIMEOUT)
			except Queue.Empty:
				break
			try:
				page=urllib2.urlopen(monthurl).read()
				soup=bs4.BeautifulSoup(page,'lxml')
			except Exception as e:
				print("loading url error at line 43")
				print(e)
				continue
			try:
				self._processPage(monthurl,soup)
			except Exception as e:
				# A page with an unexpected layout must not kill the worker;
				# report it and move on to the next URL.
				print(str(e)+self.name)

	def _processPage(self,monthurl,soup):
		"""Handle one parsed listing page: make the directory, queue the URLs."""
		title=soup.find('a','fl')   # tag holding the year/month label
		month=title.contents[0]
		datapath=self.dicPath+month.encode('gbk')
		if not os.path.isdir(datapath):
			try:
				# makedirs also creates a missing output root directory
				os.makedirs(datapath)
			except OSError:
				# another worker may have created the directory meanwhile
				if not os.path.isdir(datapath):
					raise

		# Queue the article URLs first, so that a failure while parsing the
		# pagination below does not lose the articles of this page.
		for item in soup.find_all(id=re.compile("a_ajax_")):
			shareReportQueue.put(starturl+item['href'])

		# Assumes the last child of the 'pages' div is text whose fourth
		# space-separated token looks like "<current>/<total>".
		pages=soup.find('div','pages').contents[-1]
		# Convert to int: comparing an int with a string is always true in
		# Python 2, which made the crawler request pages past the last one.
		totalpage=int(pages.split(' ')[3].split('/')[1])
		templist=monthurl.split('=')
		curpage=int(templist[-1].strip())   # current page number from the URL

		# If this is not the last page, queue the next listing page.
		if curpage<totalpage:
			templist[-1]=str(curpage+1)
			shareMonthQueue.put('='.join(templist))
					
class reportSpider(threading.Thread):
	"""Worker thread that downloads individual news articles.

	Each URL taken from ``shareReportQueue`` is fetched and parsed, and the
	article text is written to ``<dicPath><month>/<title>.txt`` (GBK encoded).
	Every successfully saved article increments the global ``totalNum``;
	a progress line is printed each time the counter reaches a multiple
	of 100.

	The thread exits once ``shareReportQueue`` has stayed empty for
	``TIMEOUT`` seconds.
	"""

	def __init__(self,name,dicPath = os.getcwd()+os.path.sep+"data"+os.path.sep):
		"""Create the worker.

		name    -- thread name, also appended to error messages
		dicPath -- output root; month directory names are appended to it
		           directly, so it should end with a path separator
		"""
		threading.Thread.__init__(self)
		self.name=name
		self.dicPath=dicPath
		self.TIMEOUT=10   # seconds to wait on an empty queue before exiting

	def run(self):
		"""Consume article URLs until the queue stays empty for TIMEOUT seconds."""
		global totalNum,t4
		while True:
			# A blocking get() with a timeout replaces the empty()/get() pair:
			# with several workers, another thread could take the last item
			# between the two calls and leave this one blocked forever. It
			# also avoids spinning on the CPU while the queue is empty.
			try:
				url=shareReportQueue.get(timeout=self.TIMEOUT)
			except Queue.Empty:
				break
			try:
				page=urllib2.urlopen(url).read()
				soup=bs4.BeautifulSoup(page,'lxml')
			except Exception as e:
				print("loading url error at line 93")
				print(e)
				continue
			try:
				self._saveReport(soup)
			except Exception as e:
				# Covers unexpected page layout, characters GBK cannot encode
				# and file errors; report and go on with the next URL.
				print(str(e)+self.name)
				continue
			# Take a snapshot of the counter while holding the lock so that
			# every multiple of 100 is reported exactly once.
			with mutex:
				totalNum+=1
				count=totalNum
			if count%100==0:
				t4=clock()
				print("已處理了"+str(count)+"條數據,用時"+str(t4-t3)+'s')

	def _saveReport(self,soup):
		"""Extract month, title and body from a parsed article and save it."""
		# year/month label of the article, used as the directory name
		month=soup.find('a',href=re.compile('thread.php')).get_text().strip()
		month=month.encode('gbk')
		# headline; only the part before the first space is kept
		title=soup.find('h1','fl').get_text()
		title=title.strip().split(' ')[0].encode('gbk')
		# article body
		cont=soup.find('div','tpc_content').get_text().strip().encode('gbk')
		filename=self.dicPath+month+os.path.sep+title+'.txt'
		# 'with' guarantees the file is flushed and closed
		with open(filename,'w') as f:
			f.write(cont)


def main():
	"""Seed the month queue from the index page and run all crawler threads.

	Fetches ``starturl``, queues the first listing page of each month link
	found there (at most 10, see ``limit``), creates the month and report
	spiders, starts them and waits until all of them have finished.
	"""
	global t1,t2,t3,t4
	t1=clock()
	response=urllib2.urlopen(starturl)
	try:
		html=response.read()
	finally:
		# close the connection explicitly instead of leaving it to the GC
		response.close()
	mainsoup=bs4.BeautifulSoup(html,'lxml')
	alist=mainsoup.find_all('a',class_='fnamecolor',limit=10)

	for item in alist:
		monthurl=item['href']+'&page=1'
		shareMonthQueue.put(starturl+monthurl)
	t2=clock()
	print("主頁面爬取完成，用時"+str(t2-t1)+'s')

	# Two independent loops: the number of month spiders no longer depends
	# on the number of report spiders being at least as large.
	for i in xrange(_WORK_MONTH_THREAD_NUM):
		tlist.append(monthSplider('ms'+str(i)))
	for i in xrange(_WORK_REPORT_THREAD_NUM_):
		tlist.append(reportSpider('rs'+str(i)))
	t3=clock()
	print("爬蟲準備就緒,用時"+str(t3-t2)+'s')
	for t in tlist:
		t.start()
	for t in tlist:
		t.join()




# Run the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

人民日報語料庫抓取python實現（二）--多線程

結巴分詞1.8.2版本源代碼解析(一)

人民日報語料庫抓取python實現（二）--多線程

結巴分詞源代碼解析（二）

人民日報語料庫抓取python實現

HMM模型之viterbi算法

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結