This exercise walks through the basic steps of a web scraper and adds interaction with a MongoDB database.
Imported modules: requests, threading, lxml, queue, pymongo
Approach:
- Build the list of URLs.
- Request each URL and get the response.
- Parse the response with XPath to get the li elements under the target ul.
- Iterate over the list of li elements, run XPath on each one again to extract its text, and put the fields we need into a dict.
- Collect all the dicts into a list, instantiate a MongoClient, create the "auto_info" collection under the "test_db" database, and insert the list of car records into that collection (a minimal sketch of this step follows the list).
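A minimal, self-contained sketch of just the MongoDB step, assuming a local mongod on the default host/port; the sample records below are placeholders shaped like the dicts the spider builds:

```python
from pymongo import MongoClient

# Connect to the local MongoDB instance (default host/port assumed)
client = MongoClient()
# Databases and collections are created lazily on the first insert
collection = client["test_db"]["auto_info"]

# Hypothetical records shaped like the dicts built by the spider
sample = [
    {"name": "car A", "price": "9.98万", "oil": "6.0L/100km",
     "brand": "brand A", "type": "compact", "emission": "1.5L"},
]
result = collection.insert_many(sample)
print(len(result.inserted_ids), "documents inserted")
```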
# coding=utf-8
import requests
import threading
from lxml import etree
from queue import Queue
from pymongo import MongoClient


class AutoSpider(object):
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/65.0.3325.181 Safari/537.36"}
        self.url_list = ["https://price.pcauto.com.cn/top/oil/s2-t" + str(i) + "-p" for i in range(1, 11)]
        # Connect to the local MongoDB instance
        client = MongoClient()
        # Use the auto_info collection in the test_db database (created lazily on first insert)
        self.collection = client["test_db"]["auto_info"]
        # Three queues that pass work between the fetch, parse, and write threads
        self.url_queue = Queue()
        self.li_queue = Queue()
        self.info_queue = Queue()

    # Request a URL; the site's pages are GBK-encoded, so decode accordingly
    def parse_url(self, url):
        response = requests.get(url, headers=self.headers)
        return response.content.decode('gbk')

    # Take a URL template from url_queue, request each of its pages in turn,
    # and extract the target li elements plus the "next page" link text
    def get_li_list(self):
        while True:
            url_temp = self.url_queue.get()
            page_info = "下一页"  # "next page" marker used by the site's pager
            page_num = 1
            while page_info == "下一页":
                url = url_temp + str(page_num) + ".html"
                res = self.parse_url(url)
                html = etree.HTML(res)
                li_list = html.xpath('''//ul[@class='listA']//li''')
                page_info = html.xpath('''//div[@class="pcauto_page"]/a[last()]/text()''')
                page_info = page_info[0] if len(page_info) else None
                self.li_queue.put(li_list)
                page_num += 1
            self.url_queue.task_done()

    # Take a list of li elements from li_queue, parse each one again,
    # and collect the text fields we need into dicts
    def get_auto_info(self):
        while True:
            li_list = self.li_queue.get()
            auto_infos = []
            for temp in li_list:
                auto_info = {}
                info = temp.xpath('''./div[@class='info']//p//text()''')
                auto_info['name'] = info[0]
                auto_info['price'] = info[2]
                auto_info['oil'] = info[4]
                auto_info['brand'] = info[5].split(':')[1]
                auto_info['type'] = info[7].split(':')[1]
                auto_info['emission'] = info[9]
                auto_infos.append(auto_info)
            self.info_queue.put(auto_infos)
            self.li_queue.task_done()

    # Take the car records from info_queue and write them to the MongoDB collection
    def write_to_mongodb(self):
        while True:
            info = self.info_queue.get()
            if info:
                self.collection.insert_many(info)
                print("written to MongoDB")
            self.info_queue.task_done()

    def run(self):
        # Put the URL templates into the url queue
        for i in self.url_list:
            self.url_queue.put(i)
        t_list = []
        # Three worker threads each for fetching, parsing, and writing
        for i in range(3):
            li_t = threading.Thread(target=self.get_li_list)
            t_list.append(li_t)
        for i in range(3):
            info_t = threading.Thread(target=self.get_auto_info)
            t_list.append(info_t)
        for i in range(3):
            write_t = threading.Thread(target=self.write_to_mongodb)
            t_list.append(write_t)
        for t in t_list:
            t.daemon = True  # daemon threads exit once the queues are drained
            t.start()
        # Block until every queue has been fully processed
        for q in [self.url_queue, self.li_queue, self.info_queue]:
            q.join()


if __name__ == "__main__":
    a = AutoSpider()
    a.run()
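As a quick check after a run, one way to confirm the documents landed is to query the same collection; a short sketch assuming the local MongoDB and the test_db / auto_info names used above:

```python
from pymongo import MongoClient

client = MongoClient()
collection = client["test_db"]["auto_info"]

# How many car records were stored, and what one of them looks like
print(collection.count_documents({}))
print(collection.find_one())
```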