Preface:
In exercise two I practiced crawling Dangdang and Douban, but fetching the data was rather slow, so this time the goal is to speed things up with multithreading.
Practice:
Basic exercise
Implementation code
import threading
import time

# Define a Thread subclass:
class MyThread(threading.Thread):
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        print("Starting thread: " + self.name)
        moyu_time(self.name, self.counter, 10)  # here counter doubles as the per-loop delay
        print("Exiting thread: " + self.name)

def moyu_time(threadName, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off %s" % (threadName, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1

# Create new threads:
# Xiaoming rests one second, Xiaohong rests two seconds
thread1 = MyThread(1, "Xiaoming", 1)
thread2 = MyThread(2, "Xiaohong", 2)

# Start the threads
thread1.start()
thread2.start()

# Wait for the threads to finish
thread1.join()
thread2.join()
print("Exiting main thread")
Result
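Incidentally, subclassing Thread is not required; the same example can be written by handing a target function straight to threading.Thread. A minimal sketch of that variant:

import threading
import time

def moyu_time(thread_name, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off %s" % (thread_name, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1

# target/args replace the Thread subclass above
t1 = threading.Thread(target=moyu_time, args=("Xiaoming", 1, 10))
t2 = threading.Thread(target=moyu_time, args=("Xiaohong", 2, 10))
t1.start()
t2.start()
t1.join()
t2.join()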
Thread pool practice 1:
Spawning threads wildly wastes resources, because threads keep getting created and destroyed. Better to put them in a pool where they can soak together -- a thread pool.
We can use ThreadPoolExecutor to implement a thread pool, so threads are no longer created and destroyed over and over.
Implementation code:
import time
from concurrent.futures import ThreadPoolExecutor

def moyu_time(threadName, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s starts slacking off %s" % (threadName, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        counter -= 1

if __name__ == '__main__':
    pool = ThreadPoolExecutor(29)
    for i in range(1, 5):
        # Pass the function plus its arguments; calling moyu_time() here would
        # run it in the main thread and submit its return value (None) instead
        pool.submit(moyu_time, 'YDLin' + str(i), 1, 3)
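A note on usage: submit returns a Future, and ThreadPoolExecutor also works as a context manager that waits for outstanding tasks on exit. A minimal sketch of collecting results that way (the square task is just a placeholder of mine):

from concurrent.futures import ThreadPoolExecutor, as_completed

def square(x):
    return x * x  # placeholder task

with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(square, i) for i in range(5)]
    for future in as_completed(futures):
        print(future.result())  # results arrive in completion order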
Using a queue as the thread pool
import threading
import time
from queue import Queue

class CustomThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.__queue = queue

    def run(self):
        while True:
            # Pull the next task off the queue and run it
            q_method = self.__queue.get()
            q_method()
            self.__queue.task_done()

def moyu():
    print("Starting to slack off %s" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

def queue_pool():
    queue = Queue(5)
    for i in range(queue.maxsize):
        t = CustomThread(queue)
        t.daemon = True  # daemonize each worker so it exits with the main thread
        t.start()
    for i in range(20):
        queue.put(moyu)
    queue.join()  # block until every queued task is marked done

if __name__ == '__main__':
    queue_pool()
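The daemon flag is what lets the program exit even though every worker sits in an infinite loop. If you prefer workers that shut down explicitly, a common alternative is a sentinel value; a minimal sketch of that pattern (the None sentinel is my own convention, not something from the code above):

import threading
from queue import Queue

def worker(queue):
    while True:
        task = queue.get()
        if task is None:   # sentinel: this worker should stop
            queue.task_done()
            break
        task()
        queue.task_done()

queue = Queue()
threads = [threading.Thread(target=worker, args=(queue,)) for _ in range(5)]
for t in threads:
    t.start()
for _ in range(20):
    queue.put(lambda: print("working"))
for _ in range(5):
    queue.put(None)        # one sentinel per worker
queue.join()
for t in threads:
    t.join()               # now every worker has exited cleanly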
Hands-on:
Improving the earlier Douban crawler:
with a pool of worker processes, the whole job is done in a few minutes.
import json
import multiprocessing
import re
import time

import requests
import xlwt
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_for_excel(soup):
    # Collect one page's movies as rows. The rows are returned rather than
    # written here, because each worker process gets its own copy of any
    # global workbook -- writes made in a child would be lost on save.
    rows = []
    movie_list = soup.find(class_='grid_view').find_all('li')
    for item in movie_list:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find(class_='').string  # the rank sits in <em class="">
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        item_intr = 'NULL'  # not every movie has a one-line quote
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string
        print('Crawling movie: ' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)
        rows.append([item_name, item_img, item_index, item_score, item_author, item_intr])
    return rows

def parse_one_page(html):
    pattern = re.compile(
        '<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>.*? <span class="rating_num" property="v:average">(.*?)</span>.*?<span class="inq">(.*?)</span>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {'index': item[0],
               'title': item[1],
               'score': item[2],
               'comment': item[3]
               }

def write_to_file(content):
    with open('douban250.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(url):
    html = get_one_page(url)
    soup = BeautifulSoup(html, 'lxml')
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
    return parse_for_excel(soup)

if __name__ == '__main__':
    start = time.time()
    urls = []
    for i in range(0, 10):
        url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        urls.append(url)
    # Create a process pool sized to the number of CPU cores
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # map() runs main() on each url and gathers the returned rows in url order
    results = pool.map(main, urls)
    pool.close()
    pool.join()  # let every pool process finish before we save

    # Write the Excel file in the parent process, where all the rows are
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('Douban Movie Top250', cell_overwrite_ok=True)
    for col, header in enumerate(['Title', 'Image', 'Rank', 'Rating', 'Author', 'Summary']):
        sheet.write(0, col, header)
    n = 1
    for rows in results:
        for row in rows:
            for col, value in enumerate(row):
                sheet.write(n, col, value)
            n += 1
    book.save('douban_top250.xls')  # xlwt can only write the legacy .xls format
    print('Done in %.1f s' % (time.time() - start))
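Since the goal here was multithreading, note that the same fan-out also works with ThreadPoolExecutor; the requests are I/O-bound, so threads are a reasonable fit despite the GIL. A minimal sketch under that assumption, reusing main() from above:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    urls = ['https://movie.douban.com/top250?start=%d&filter=' % (i * 25) for i in range(10)]
    with ThreadPoolExecutor(max_workers=10) as pool:
        # map() here mirrors multiprocessing.Pool.map, but the workers share
        # one interpreter, so returned rows need no pickling between processes
        results = list(pool.map(main, urls))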