PYTHON爬取汽車之家數據
使用知識
- 使用BeautifulSoup模塊
- 使用正則表達式
- 使用到多進程(multiprocessing.Pool)並行爬取
使用說明
- 使用前請安裝BeautifulSoup
- 起始頁面: https://www.autohome.com.cn/grade/carhtml/A.html
- 運行程序後會在當前目錄下生成txt文件,內容爲json格式.如下所示:
{"branch_first_letter": "S", "branch_name": "薩博", "branch_id": "64", "producer": "薩博", "producer_id": "", "car_series": "Saab 900", "car_series_id": "s2630", "car_price": "暫無報價"}
源代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/1/16 15:34
# @Author : wsx
# @Site :
# @File : cars.py
# @Software: PyCharm
import json
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException
import re
from bs4 import BeautifulSoup
def get_one_page(url):
    """
    Fetch one page and return its HTML text.

    :param url: URL of the page to fetch
    :return: response body as text on HTTP 200, otherwise None
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    try:
        # A timeout keeps the crawler from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Any network-level failure is treated as "page unavailable".
        return None
def parse_one_page(html, first_letter):
    """
    Parse one brand-index page and yield one record per car series.

    Generator: records keep the brand -> producer -> series hierarchy
    because the outer loops fill in the shared fields before each yield.

    :param html: HTML text of the index page
    :param first_letter: the brands' initial letter, tagged onto each record
    :return: iterator of dicts (branch/producer/series/price fields)
    """
    page = BeautifulSoup(html, 'lxml')
    # Each <dl> element is one brand (branch).
    for branch in page.find_all('dl'):
        info = {'branch_first_letter': first_letter,
                'branch_name': branch.dt.div.a.string.strip(),
                'branch_id': branch['id'],
                'producer': '', 'producer_id': '',
                'car_series': '', 'car_series_id': '', 'car_price': ''}
        print('正在抓取...品牌:', info['branch_name'])
        # Re-parse only this brand's <dd> blocks so producer lookups
        # cannot escape into the rest of the page.
        branch_soup = BeautifulSoup(str(branch.find_all('dd')), 'lxml')
        # All producers (manufacturers) under this brand.
        for producer in branch_soup.find_all('div', attrs={'class': 'h3-tit'}):
            info['producer'] = producer.a.get_text().strip()
            # The page exposes no producer id; keep the field for schema stability.
            info['producer_id'] = ''
            print('正在抓取...生產商:', info['producer'])
            series_list = producer.find_next('ul')
            for car in series_list.find_all('li', attrs={'id': True}):
                info['car_series_id'] = car['id']
                info['car_series'] = car.h4.a.get_text().strip()
                # Candidate price links: have a class but no data-value attribute.
                price = car.find_all('a', attrs={'class': True, 'data-value': False})
                # Reset first so a car without a price never inherits the
                # previous car's price (bug in the original version).
                info['car_price'] = '暫無報價'
                if price:
                    text = price[0].get_text()
                    # Keep only text that looks like a price.
                    # NOTE(review): pattern matches the traditional form '萬';
                    # confirm the live page does not use the simplified '万'.
                    if re.match('.*?萬.*?', text, re.S):
                        info['car_price'] = text.strip()
                # Yield a copy: the dict is mutated across iterations, so
                # yielding the shared object would alias every record.
                yield dict(info)
def write_file(content):
    """
    Append one record to cars.txt as a single line of JSON (JSON Lines).

    :param content: JSON-serializable record (one car-series dict)
    :return: None
    """
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    # The with-block closes the file; no explicit close() is needed.
    with open('cars.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(first_letter):
    """
    Crawl one index page (one initial letter) and persist every record.

    :param first_letter: single uppercase letter A-Z selecting the index page
    :return: None
    """
    html = get_one_page('https://www.autohome.com.cn/grade/carhtml/' + first_letter + '.html')
    if html is None:
        # Fetch failed: skip this letter instead of crashing on
        # BeautifulSoup(None, 'lxml') downstream.
        return
    # prettify() normalizes the markup before the detailed parse.
    html = BeautifulSoup(html, 'lxml').prettify()
    for item in parse_one_page(html, first_letter):
        write_file(item)
if __name__ == '__main__':
    # To crawl in parallel instead (letter order no longer guaranteed),
    # replace the loop below with:
    #     pool = Pool()
    #     pool.map(main, [chr(i + ord('A')) for i in range(26)])
    for code in range(ord('A'), ord('Z') + 1):
        main(chr(code))
大家可能會問:爲什麼爬取個簡單的數據還要三層循環?我主要考慮到數據之間的關聯性、層級性才使用了三層循環,這樣才能保證數據之間的層級關係保持不亂。
編寫代碼過程中發現:在BeautifulSoup的find_all()方法中,如果只需要確定某個屬性是否存在,而不指定具體屬性值,可以寫成下面這樣:
car.find_all('a', attrs={'class': True, 'data-value': False})
本人小白,大神輕噴!