這是一篇不完整的文章,嗯,因爲後期我需要的參數不能完全爬取出來,所以我要先去借鑑一下其他大神的步驟。
以先代碼都是自己想的,所以想記錄一下。
首先,打開汽車之家的車型庫https://car.autohome.com.cn/
然後按F12進入開發者模式,找到Ashx的連接,因爲還是新手,所有的連接都一個個的點擊了,找到了車型品牌的連接。
好了,現在可以進行代碼爬取了。爬取車型品牌及連接。之前操作的時候,並沒有發現少了一個或幾個品牌。
import requests
import re
import random
import time
#獲取車型的品牌及ID
url="https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId=94%20&fctId=0%20&seriesId=0"
Html = requests.get(url, timeout=2).text
pattern= "<h3><a href='(.*?)'><i class='icon10 icon10-sjr'></i>(.*?)<em>(.*?)</em></a></h3>"
uids = re.findall(pattern, Html)
with open('cars_qiche.txt', 'w', encoding='utf-8') as f:
for i in uids:
f.write(str(i) + '\n')
f.close()
結果如下:
將獲得的品牌的id,構建成完整的連接。菜鳥階段,只能一步一步的操作。
#將品牌ID添加到鏈接當中
with open('cars_qiche.txt', 'r', encoding='utf-8') as f:
pattern = '/price/brand-(.*?).html'
with open('brandid_qiche.txt','w') as b:
for i in f:
sid = re.findall(pattern, i)
urls = 'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId='+sid[0]+'%20&fctId=0%20&seriesId=0'
b.write(urls+'\n')
b.close()
f.close()
遍歷車型品牌的連接,找到車系的id。
#獲得車型系列的鏈接
with open('brandid_qiche.txt', 'r') as f:
with open('Series_qiche.txt', 'a+', encoding='utf-8') as b:
for i in f:
url = i
Html2 = requests.get(url, timeout=3).text
pattern = '<a id=\'series_.*?href=\'(.*?)\'>(.*?)<em>(.*?)</em></a>'
Series = re.findall(pattern, Html2)
for j in Series:
b.write(str(j)+'\n')
b.close()
f.close()
後面的代碼我就一次性貼上了,半成品代碼orz,因爲我是一步一步做的,所以每次操作完了一步都需要將前面的註釋掉。
#獲得車型品牌的細分品牌的在售鏈接和停售鏈接
#在售鏈接url = 'https://car.autohome.com.cn//price/series-3825.html'
# 停售鏈接url = 'https://car.autohome.com.cn//price/series-3825-0-3-1.html'
with open('Series_qiche.txt','r',encoding='utf-8') as f:
with open('OnSales_qiche.txt','a+') as b:
with open('OffSales_qiche.txt','a+') as a:
for i in f:
cut1 = i.split(',')[0]
cut2 = cut1.split('(')[1]
cut3 = cut2.split('.')[0] + '\''
Onsales = 'https://car.autohome.com.cn/' + eval(cut2)
Offsales = 'https://car.autohome.com.cn/' + eval(cut3) + '-0-3-1.html'
b.write(Onsales + '\n')
a.write(Offsales + '\n')
b.close()
f.close()
# #獲得停售車型鏈接對應的分頁
with open('OffSales_qiche.txt', 'r') as f:
with open('OffSale_page.txt', 'a+') as b:
for i in f:
#url='https://car.autohome.com.cn//price/series-19-0-3.html'
url = i.split('\n')
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
timeSleep = random.randrange(1,5)
Html0 = requests.get(url[0], headers=headers ).text
pattern = '<a href=.*?-0-3-0-0-0-0-.*?>(\d)</a>'
page = re.findall(pattern, Html0)
b.write(str(page)+'\n')
time.sleep(timeSleep)
print(page)
b.close()
f.close()
#獲得所有車型在售及停售的鏈接
with open('OffSale_page.txt', 'r') as f:
with open('Offsales_qiche.txt','r') as b:
with open('allSales_qiche.txt', 'a+') as a:
list1 = []#分頁
list2 = []#鏈接
list3 = []
count = 0
for i in f:
list1.append(i.split('\n')[0])
for i in b:
list2.append(i.split('\n')[0])
for i in list1:
if i == '[]':
list3.append(list2[count])
else:
str1 = list2[count]
for j in list1[count]:
pattern = '(.*?)-1.html'
str2 = re.findall(pattern,str1)
if j.isdigit():
bouns = (str2)[0] + '-0-0-0-0-'+ j + '.html'
list3.append(bouns)
count = count + 1
for i in list3:
a.write(i+'\n')
a.close()
b.close()
f.close()
#獲得所有停售車型的ID
url='https://car.autohome.com.cn//price/series-692.html'
with open('allSales_qiche.txt', 'r') as a:
with open('qichezhijiaCar.txt', 'a+', encoding='utf-8') as f:
for i in a:
url = i.split('\n')[0]
timeSleep = random.randrange(1,5)
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
r = requests.get(url, headers = headers)
Html = r.text
time.sleep(timeSleep)
pattern = 'id="p(\d{5})".*?<a.*?target="_blank">(.*?)</a>.*?<span>(.*?)</span>.*?<span>(.*?)</span>.*?</p>'
cut = re.findall(pattern, Html)
for i in cut:
f.write(str(i)+ '\n')
f.close()
a.close()
#獲得所有在售的車型ID
#url='https://car.autohome.com.cn//price/series-692.html'
with open('OffSales_qiche.txt', 'r') as a:
with open('qichezhijiaCar.txt', 'a+', encoding='utf-8') as f:
for i in a:
url = i.split('\n')[0]
timeSleep = random.randrange(1,5)
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
r = requests.get(url, headers = headers)
Html = r.text
time.sleep(timeSleep)
pattern = 'id="p(\d{7})".*?<a.*?target="_blank">(.*?)</a>.*?<span>(.*?)</span>.*?<span>(.*?)</span>.*?</p>'
cut = re.findall(pattern, Html)
for i in cut:
f.write(str(i)+ '\n')
f.close()
a.close()
#獲取參數
with open('Cars_parmers.txt','a+') as a:
with open('qichezhijiaCar.txt', 'r', encoding='utf-8') as f:
list = []
list2 = []
for i in f:
list.append(i.split(',')[0])
for i in list:
s = i.split('\'')
list2.append(s)
for i in list2:
url = 'https://car.autohome.com.cn/config/spec/' + i[1] + '.html'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
r = requests.get(url, headers = headers)
time.sleep(random.randrange(1,5))
Html = r.text
pattern = re.compile(r'{"specid":' + i[1] + ',"value":(.*?)}')
strings = re.findall(pattern, Html)
# print(Html.encode(encoding='utf-8'))
# print(r.status_code)
for i in strings:
a.write(i + '\n')
a.write(url+'\n')
a.close()
with open('Cars_parmers2.txt','a+') as f:
with open('Cars_parmers.txt','r') as a:
count = 0
counts = 0
cuts1 = []
list0 = []
list1 = []
list2 = []
for i in a:
if re.match('https://.*?',i):
cuts = count - counts
counts = count
cuts1.append(cuts)
list2.append(count)
count += 1
list0.append(i)
# print(cuts1)
for i in cuts1:
list1.append(list0[:i])
del list0[0:i]
for i in list1:
f.write(str(i)+'\n')
# print(list0)
# print(list1)
a.close()
f.close()