本次代碼用於練習爬蟲的基本步驟,並且添加了與mongoDB數據庫的交互
導入的模塊:requests、threading、lxml、queue、pymongo
代碼思路:
- 生成url列表
- 請求url,獲取響應。
- 用xpath解析響應中的li元素(指定ul下的)
- 遍歷包含li元素的列表,再次用xpath解析得到所有文本,將所需的文本放進字典中。
- 把所有字典放進一個列表,實例化MongoClient對象,在"test_db"數據庫下創建"auto_info"集合,將包含汽車信息的列表添加到集合。
# coding=utf-8
import requests
import threading
from lxml import etree
from queue import Queue
from pymongo import MongoClient
class AutoSpider(object):
    """Multi-threaded crawler for car fuel-consumption listings on pcauto.com.cn.

    Three-stage pipeline, each stage linked by a Queue and run by 3 daemon
    threads:
        url_queue  -> get_li_list      : fetch listing pages, extract <li> nodes
        li_queue   -> get_auto_info    : parse each <li> into an info dict
        info_queue -> write_to_mongodb : bulk-insert dicts into MongoDB
    """

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/65.0.3325.181 Safari/537.36"}
        # One URL template per category (s2-t1 .. s2-t10); the page number and
        # the ".html" suffix are appended later in get_li_list.
        self.url_list = ["https://price.pcauto.com.cn/top/oil/s2-t" + str(i) + "-p" for i in range(1, 11)]
        # Connect to the local MongoDB instance (default host/port).
        client = MongoClient()
        # Collection "auto_info" inside database "test_db".
        self.collection = client["test_db"]['auto_info']
        # Queues linking the three pipeline stages.
        self.url_queue = Queue()
        self.li_queue = Queue()
        self.info_queue = Queue()

    def parse_url(self, url):
        """GET *url* with the spoofed browser headers and return the response
        body decoded as GBK (the site's page encoding).
        """
        # A timeout prevents a worker thread from blocking forever on a
        # stalled connection (requests has no default timeout).
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode('gbk')

    def get_li_list(self):
        """Worker: take one URL template from url_queue, walk every page of
        that category, and push each page's list of <li> nodes onto li_queue.
        """
        while True:
            url_temp = self.url_queue.get()
            page_info = "下一頁"  # "next page" pager text; loop while it exists
            page_num = 1
            while page_info == "下一頁":
                url = url_temp + str(page_num) + ".html"
                res = self.parse_url(url)
                html = etree.HTML(res)
                li_list = html.xpath('''//ul[@class='listA']//li''')
                # Text of the last pager link: it reads "下一頁" while more
                # pages remain, something else (or nothing) on the last page.
                page_info = html.xpath('''//div[@class="pcauto_page"]/a[last()]/text()''')
                page_info = page_info[0] if page_info else None
                self.li_queue.put(li_list)
                page_num += 1
            self.url_queue.task_done()

    def get_auto_info(self):
        """Worker: take a list of <li> nodes from li_queue, extract the text
        fields of each car entry into a dict, and push the list of dicts onto
        info_queue.
        """
        while True:
            li_list = self.li_queue.get()
            auto_infos = []
            for temp in li_list:
                auto_info = {}
                # All text fragments under the entry's info <div>; the fixed
                # indices below assume the site's current markup order --
                # name, price, fuel use, brand, type, emission. Verify if the
                # page layout changes.
                info = temp.xpath('''./div[@class='info']//p//text()''')
                auto_info['name'] = info[0]
                auto_info['price'] = info[2]
                auto_info['oil'] = info[4]
                auto_info['brand'] = info[5].split(':')[1]
                auto_info['type'] = info[7].split(':')[1]
                auto_info['emission'] = info[9]
                auto_infos.append(auto_info)
            self.info_queue.put(auto_infos)
            self.li_queue.task_done()

    def write_to_mongodb(self):
        """Worker: take a batch of car-info dicts from info_queue and insert
        them into the MongoDB collection.
        """
        while True:
            info = self.info_queue.get()
            # insert_many raises on an empty list, so skip empty batches but
            # still mark the queue item done.
            if info:
                self.collection.insert_many(info)
                print("寫入mongodb")
            self.info_queue.task_done()

    def run(self):
        """Seed the URL queue, start all worker threads, and block until every
        queue has been fully processed.
        """
        for url in self.url_list:
            self.url_queue.put(url)
        t_list = []
        # Three workers per pipeline stage.
        for target in (self.get_li_list, self.get_auto_info, self.write_to_mongodb):
            for _ in range(3):
                # daemon=True replaces the deprecated setDaemon(); daemon
                # threads are discarded when the main thread exits.
                t_list.append(threading.Thread(target=target, daemon=True))
        for t in t_list:
            t.start()
        # Join the queues (not the threads): returns once every put() item
        # has had a matching task_done().
        for q in (self.url_queue, self.li_queue, self.info_queue):
            q.join()
if __name__ == "__main__":
    # Build the spider and run the full crawl pipeline to completion.
    AutoSpider().run()