代碼:
# -*- coding: utf-8 -*-
# @Time : 2020/4/19 21:54
# @Author : Oneqq
# @File : 24.threads的使用.py
# @Software: PyCharm
from queue import Empty, Queue
from threading import Thread

import requests
from fake_useragent import UserAgent
from lxml import etree
class CrawlInof(Thread):
    """Downloader thread: fetches URLs from url_queue and puts the body
    of every HTTP-200 response onto html_queue."""

    def __init__(self, url_queue, html_queue):
        """
        :param url_queue: Queue of page URLs to fetch.
        :param html_queue: Queue that receives response bodies (str).
        """
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # One random User-Agent per worker to look less like a bot.
        headers = {
            "User-Agent": UserAgent().random
        }
        while True:
            # BUGFIX: the original read the module-level globals
            # `url_queue`/`html_queue` instead of the instance attributes,
            # so the stored constructor arguments were never used.
            # get_nowait()/Empty also closes the empty()->get() race:
            # another worker could take the last item between the two
            # calls and leave this thread blocked forever in get().
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)
class ParseInfo(Thread):
    """Parser thread: takes HTML documents from html_queue, extracts the
    joke text and appends it to duanzi.txt."""

    def __init__(self, html_queue):
        """
        :param html_queue: Queue of HTML documents (str) to parse.
        """
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while True:
            # get_nowait()/Empty closes the empty()->get() race: with
            # three parser threads, another one could take the last item
            # between empty() and get(), leaving this thread blocked
            # forever in get().
            try:
                html = self.html_queue.get_nowait()
            except Empty:
                break
            root = etree.HTML(html)
            # The first <span> inside each content div holds the text.
            span_contents = root.xpath('//div[@class="content"]/span[1]')
            with open('duanzi.txt', 'a', encoding='utf-8') as f:
                for span in span_contents:
                    # string(.) flattens the span's text including
                    # nested tags into one plain string.
                    f.write(span.xpath('string(.)'))
if __name__ == '__main__':
    base_url = "https://www.qiushibaike.com/text/page/{}/"

    url_queue = Queue()
    html_queue = Queue()

    # Seed the work queue with pages 1..13.
    for page in range(1, 14):
        url_queue.put(base_url.format(page))

    # Stage 1: three downloader threads drain url_queue into html_queue.
    crawlers = [CrawlInof(url_queue, html_queue) for _ in range(3)]
    for worker in crawlers:
        worker.start()
    for worker in crawlers:
        worker.join()

    # Stage 2: once downloading is done, three parser threads drain
    # html_queue and append the extracted text to duanzi.txt.
    parsers = [ParseInfo(html_queue) for _ in range(3)]
    for worker in parsers:
        worker.start()
    for worker in parsers:
        worker.join()
結果: