跟搜狐車庫的爬取思路是一樣的:首先找到每個車型的鏈接,然後遍歷每個車型的鏈接去爬取所需的數據。不過網易車型庫的爬取時間遠遠少於搜狐汽車,畢竟網易汽車的數據不用渲染就可以爬取下來,而搜狐汽車的數據需要渲染之後才可以爬取下來。
步驟1:獲得品牌的鏈接
import requests
import re
# Landing page of the 163.com auto product library; gotoFile() downloads it
# as the starting point of the crawl.
url = 'http://product.auto.163.com/'
def getHtml(url, timeout=10):
    """Download *url* and return the response body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response text, decoded with the GBK codec.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Query string carried over unchanged; it is appended to every request
    # as ``?test=data``.
    data = {'test': 'data'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        # Header field names must not contain spaces, otherwise the server
        # does not recognise them as the standard negotiation headers.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    html = requests.get(url, headers=headers, params=data, timeout=timeout)
    # The encoding is forced instead of relying on detection.
    html.encoding = 'GBK'
    return html.text
def cutstr(html):
    """Return the ``id`` attribute values of the matching anchors in *html*.

    An anchor matches when, between ``<a`` and ``</a>``, an ``id="..."``
    attribute is followed somewhere by the text ``_seriseId=``.

    Args:
        html: Page source to scan.

    Returns:
        A list with the captured ``id`` values, in document order.
    """
    series_anchor = re.compile('<a.*?id="(.*?)".*?_seriseId=.*?</a>')
    return series_anchor.findall(html)
def gotoFile():
    """Download the landing page and save one series-page URL per line.

    Fetches the module-level ``url`` with ``getHtml``, extracts the ids with
    ``cutstr`` and writes a line of the form
    ``http://product.auto.163.com/series/<id>.html#008B00`` for each of them
    to ``wangyicar2.txt`` (UTF-8, overwritten on every run).
    """
    html = getHtml(url)
    links = [
        'http://product.auto.163.com/series/' + series_id + '.html#008B00\n'
        for series_id in cutstr(html)
    ]
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open('wangyicar2.txt', 'w', encoding='utf-8') as f:
        f.writelines(links)
# Run step 1 as soon as the script is executed.
gotoFile()
步驟2:獲得每個車型的鏈接
import requests
import re
# url = 'http://product.auto.163.com/series/16979.html#008B00'
def getHtml(url, timeout=10):
    """Download *url* and return the response body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response text, decoded with the GBK codec.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Query string carried over unchanged; it is appended to every request
    # as ``?test=data``.
    data = {'test': 'data'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        # Header field names must not contain spaces, otherwise the server
        # does not recognise them as the standard negotiation headers.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    html = requests.get(url, headers=headers, params=data, timeout=timeout)
    # The encoding is forced instead of relying on detection.
    html.encoding = 'GBK'
    return html.text
def cutstr(html):
    """Extract ``(product_id, product_name)`` pairs embedded in *html*.

    Looks for ``{product_id:<id>,...product_name:<name>}`` fragments and
    captures both values exactly as they appear in the page, including any
    surrounding quotes.

    Args:
        html: Page source to scan.

    Returns:
        A list of ``(product_id, product_name)`` string tuples, in document
        order.
    """
    return re.findall('{product_id:(.*?),.*?product_name:(.*?)}', html)
def gotoFile():
    """Visit every saved series page and record each model's config link.

    Reads page URLs from ``wangyicar2.txt`` (one per line). For every
    ``(product_id, product_name)`` pair that ``cutstr`` finds on a page, one
    line is appended to ``wangyicar6.txt``: the ``str()`` of the list
    ``[config_compare_url, product_name, page_url]``.
    """
    import ast  # local import: only needed to unquote the scraped id safely

    with open('wangyicar2.txt', 'r', encoding='utf-8') as src, \
            open('wangyicar6.txt', 'a+', encoding='utf-8') as dst:
        for line in src:
            # Lines read from a file keep their trailing newline; drop it so
            # it ends up neither in the request URL nor in the record.
            url = line.strip()
            if not url:
                continue
            html = getHtml(url)
            for raw_id, name in cutstr(html):
                raw_id = raw_id.strip()
                # The id is captured verbatim from the page (presumably a
                # quoted literal). literal_eval unquotes it without executing
                # arbitrary code, which eval() on remote content would allow.
                try:
                    product_id = str(ast.literal_eval(raw_id))
                except (ValueError, SyntaxError):
                    # Not a Python literal: use the captured text as is.
                    product_id = raw_id
                config_url = ('http://product.auto.163.com/config_compare/'
                              + product_id + '.html#ncx00023')
                dst.write(str([config_url, name, url]) + '\n')
# Run step 2 as soon as the script is executed.
gotoFile()
步驟3:遍歷每個車型的鏈接獲得想要的數據
import requests
import re
from bs4 import BeautifulSoup
def getHtml(url, timeout=10):
    """Download *url* and return the response body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response text, decoded with the GBK codec.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    html = requests.get(url, headers=headers, timeout=timeout)
    # The encoding is forced instead of relying on detection.
    html.encoding = 'GBK'
    return html.text
def Soup(html):
    """Return the ``car_config_param_list`` elements of *html* as one string.

    Args:
        html: Page source to parse (with the ``lxml`` parser).

    Returns:
        ``str()`` of the ``find_all`` result for the CSS class
        ``car_config_param_list``.
    """
    parsed = BeautifulSoup(html, 'lxml')
    return str(parsed.find_all(class_="car_config_param_list"))
def gotoFile():
    """Scrape the selected configuration cells for every saved model URL.

    Reads page URLs from ``wangyicar5.txt`` (one per line), downloads each
    page and pulls every ``cell_text`` value out of its
    ``car_config_param_list`` markup. Twelve of those values plus the URL are
    appended to ``wangyi2.txt``, one value per line, followed by a blank
    line that separates records.

    Pages that do not provide enough cells are reported and skipped instead
    of aborting the whole crawl with an ``IndexError``.
    """
    # Positions, within the list of cell texts, of the values to keep.
    wanted = (0, 3, 4, 9, 30, 38, 39, 41, 42, 73, 74, 81)
    # Compiled once; it does not change between pages.
    pattern = re.compile('<div class="cell"><span class="cell_text">(.*?)</span></div>')
    # Example input line:
    # http://product.auto.163.com/config_compare/000BedBL.html#ncx00023
    with open('wangyicar5.txt', 'r', encoding='utf-8') as src, \
            open('wangyi2.txt', 'a+', encoding='utf-8') as dst:
        for line in src:
            # Drop the newline the file iterator leaves on each line so the
            # request gets a clean URL; blank lines are ignored.
            url = line.strip()
            if not url:
                continue
            datas = pattern.findall(Soup(getHtml(url)))
            if len(datas) <= max(wanted):
                print('skipped (only %d cells): %s' % (len(datas), url))
                continue
            record = [datas[i] for i in wanted] + [url]
            # The trailing blank line reproduces the record layout written
            # when the URL still carried its own newline.
            dst.write('\n'.join(record) + '\n\n')
# Run step 3 as soon as the script is executed.
gotoFile()
因爲爬取搜狐車型庫已經讓我對這類車型數據的爬取比較熟悉了,所以我這邊現在只爬取了我想要的一些參數數據。如果想要全部的參數數據,可以直接遍歷datas,然後再寫入TXT裏面。