前言:
在練習二的時候,練習了爬當當網、爬豆瓣網,但是爬取信息的速度都比較慢,所以這次練習用多線程來加快速度。
練習:
基礎練習
實現代碼
import threading
import time
#創建一個線程子類:
class MyThread(threading.Thread):
    """Thread that runs the ``moyu_time`` demo loop under its own name.

    The constructor stores ``threadID``, ``name`` (which also becomes the
    thread's name) and ``couter``; ``couter`` is handed to ``moyu_time`` as
    the delay argument.
    """

    def __init__(self, threadID, name, couter):
        super().__init__()
        self.threadID, self.name, self.couter = threadID, name, couter

    def run(self):
        """Print a start line, call ``moyu_time`` for 10 rounds, print an exit line."""
        print(f"開始線程: {self.name}")
        moyu_time(self.name, self.couter, 10)
        print(f"退出線程:{self.name}")
def moyu_time(threadName, delay, counter):
    """Print a timestamped line ``counter`` times, pausing before each one.

    Args:
        threadName: label printed at the start of every line.
        delay: seconds to sleep before each line.
        counter: number of lines to print; the loop ends when it reaches 0.
    """
    remaining = counter
    while remaining:
        time.sleep(delay)
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print("%s 開始摸魚 %s" % (threadName, stamp))
        remaining -= 1
# Create the two demo threads:
#   "小明" pauses one second between lines,
#   "小紅" pauses two seconds between lines.
thread1 = MyThread(1, "小明", 1)
thread2 = MyThread(2, "小紅", 2)
demo_threads = (thread1, thread2)

# Start both threads.
for worker in demo_threads:
    worker.start()

# Wait for both threads to finish before the main thread reports its exit.
for worker in demo_threads:
    worker.join()
print("退出主線程")
結果
線程池練習一:
瘋狂地開啓線程,就要頻繁地創建、銷燬線程,非常浪費資源,所以應該把它們放到池子裏面去一起洗澡——也就是線程池
可以使用 ThreadPoolExecutor 來實現線程池,這樣就不會去重複的創建銷燬線程了
實現代碼:
import threading
import time
#創建一個線程子類:
from concurrent.futures.thread import ThreadPoolExecutor
class MyThread(threading.Thread):
    """Named thread whose ``run`` drives the ``moyu_time`` demo loop.

    ``threadID``, ``name`` and ``couter`` are stored as given; ``name`` is
    also the thread's name and ``couter`` is the delay passed to
    ``moyu_time``.
    """

    def __init__(self, threadID, name, couter):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.couter = couter

    def run(self):
        """Report the start, run ``moyu_time`` for 10 rounds, report the exit."""
        label = self.name
        print(f"開始線程: {label}")
        moyu_time(label, self.couter, 10)
        print(f"退出線程:{self.name}")
def moyu_time(threadName, delay, counter):
    """Sleep ``delay`` seconds, then print a timestamped line, ``counter`` times.

    Args:
        threadName: label printed at the start of every line.
        delay: seconds to sleep before each line.
        counter: number of rounds; the loop stops when it reaches 0.
    """
    rounds_left = counter
    while rounds_left:
        time.sleep(delay)
        now_text = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print("%s 開始摸魚 %s" % (threadName, now_text))
        rounds_left -= 1
if __name__ == '__main__':
    # Hand the callable and its arguments to submit() separately. Writing
    # pool.submit(moyu_time(...)) runs moyu_time in the main thread first and
    # then submits its None return value, so nothing is executed by the pool.
    # The with-block waits for the submitted tasks and shuts the pool down.
    with ThreadPoolExecutor(29) as pool:
        for i in range(1, 5):
            pool.submit(moyu_time, 'YDLin' + str(i), 1, 3)
用隊列做線程池
import threading
import time
from queue import Queue
class CustomThread(threading.Thread):
    """Worker thread that runs callables taken from a shared queue, forever.

    Args:
        queue: queue whose items are called with no arguments.
    """

    def __init__(self, queue):
        super().__init__()
        self.__queue = queue

    def run(self):
        """Take callables off the queue and run them one at a time.

        A task that raises has its traceback printed and the worker carries
        on with the next task. ``task_done()`` is called for every task, even
        a failed one, so that ``join()`` on the queue cannot block forever
        because of a task that raised.
        """
        import traceback  # local import: only used on the failure path

        while True:
            # Blocks until a task is available.
            q_method = self.__queue.get()
            try:
                q_method()
            except Exception:
                traceback.print_exc()
            finally:
                self.__queue.task_done()
def moyu():
    """Print one line stamped with the current local time."""
    now_text = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(" 開始摸魚 %s" % now_text)
def queue_pool():
    """Run ``moyu`` 20 times on a small pool of ``CustomThread`` workers.

    Creates a queue bounded to 5 items, starts one daemon worker per queue
    slot, enqueues the 20 tasks and returns once the queue reports every task
    as done.
    """
    queue = Queue(5)
    for _ in range(queue.maxsize):
        t = CustomThread(queue)
        # Workers are daemons so they do not keep the interpreter alive once
        # the main thread is finished. The ``daemon`` attribute replaces
        # setDaemon(), which is deprecated since Python 3.10.
        t.daemon = True
        t.start()
    for _ in range(20):
        # put() blocks while the bounded queue is full.
        queue.put(moyu)
    queue.join()
# Script entry point: run the queue-backed pool demo.
if __name__ == "__main__":
    queue_pool()
實戰:
之前爬取豆瓣網的改進:
使用進程池(multiprocessing.Pool)並行爬取,幾分鐘就搞定了
import json
import multiprocessing
import multiprocess as multiprocess
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import re
import time
import xlwt
def get_one_page(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
        ``None`` is also returned when the request raises a
        ``RequestException``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    }
    try:
        # Without a timeout requests waits indefinitely, so one stalled
        # connection would hang the worker fetching this page.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Module-level workbook with a single sheet; row 0 holds the column headers.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('豆瓣電影Top250', cell_overwrite_ok=True)
for _column, _title in enumerate(('名稱', '圖片', '排名', '評分', '作者', '簡介')):
    sheet.write(0, _column, _title)
# Index of the next sheet row to fill (row 0 is the header row).
n = 1
def save_to_excel(soup):
    """Append one sheet row per movie found in a parsed listing page.

    Args:
        soup: parsed page; every ``<li>`` under its ``grid_view`` element is
            treated as one movie.

    For each movie the row holds, in column order: title, image URL, rank,
    score, the text of the first ``<p>`` and the one-line quote. A movie
    without a quote gets an empty string. The module-level row counter ``n``
    is advanced once per movie.
    """
    global n
    for item in soup.find(class_='grid_view').find_all('li'):
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find(class_='').string
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # Not every movie has a quote. Set the value for every item so that a
        # missing quote is neither a NameError (first item) nor the previous
        # movie's quote carried over.
        quote = item.find(class_='inq')
        item_intr = quote.string if quote is not None else ''
        print(f'爬取電影:{item_index} | {item_name} | {item_score} | {item_intr}')
        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)
        n += 1
def parse_one_page(html):
    """Yield one dict per movie entry matched in the page's raw HTML.

    Each dict has the keys ``index``, ``title``, ``score`` and ``comment``,
    filled from the pattern's four capture groups in that order.
    """
    entry_re = re.compile(
        '<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>.*? <span class="rating_num" property="v:average">(.*?)</span>.*?<span class="inq">(.*?)</span>',
        re.S)
    fields = ('index', 'title', 'score', 'comment')
    for match in entry_re.finditer(html):
        yield dict(zip(fields, match.groups()))
def write_to_file(content):
    """Append ``content`` to ``douban250.txt`` as one JSON line.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    record = json.dumps(content, ensure_ascii=False) + '\n'
    with open('douban250.txt', 'a', encoding='utf-8') as out:
        out.write(record)
def main(url):
    """Download one listing page, then store and print its movies.

    The parsed page is passed to ``save_to_excel``; every entry yielded by
    ``parse_one_page`` is printed and appended to the text file through
    ``write_to_file``. A page that could not be downloaded is skipped.

    Args:
        url: address of the listing page.
    """
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a failed request or a non-200 status;
        # BeautifulSoup and the regex parser would both raise on None.
        print('Failed to fetch page, skipping: ' + url)
        return
    soup = BeautifulSoup(html, 'lxml')
    save_to_excel(soup)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    start = time.time()
    urls = [
        'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        for i in range(0, 10)
    ]

    # Size the process pool to the number of CPU cores and use it only for
    # the downloads. Worker processes have their own copies of `book`,
    # `sheet` and `n`, so rows written inside a worker never reach the
    # workbook that is saved below; parsing and writing therefore happen
    # here in the parent process, in page order.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        pages = pool.map(get_one_page, urls)
    finally:
        # Let the workers finish and exit even if map() raised.
        pool.close()
        pool.join()

    for url, html in zip(urls, pages):
        if html is None:
            # get_one_page returns None when the download failed.
            print('Failed to fetch page, skipping: ' + url)
            continue
        save_to_excel(BeautifulSoup(html, 'lxml'))
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)

    # xlwt writes the legacy .xls format, so the file name uses that extension.
    book.save('豆瓣最受歡迎的250部電影.xls')