Main tools: Python + Selenium + Excel
I needed car model parameter data for work, and none of the tutorials or articles I found online did what I wanted, so I slowly felt my way through writing this code myself. Some of it may be convoluted or unnecessary, but I'm only a beginner who knows the basics; if anyone has suggestions for optimizing it, please share~
If you need car model parameter data like I did, you can follow my approach. One drawback of Selenium is that it is very slow, and sometimes it fails to capture the data, but if you are patient and have a decent connection like me, the model data can definitely be scraped, possibly even faster than it took me. Scraping all of this data took me a whole week; sigh, my hopeless internet speed.
The main data collected is shown in the figure below:
The Python part
Because crawling the Sohu Auto model database with requests alone does not return the spec data, Selenium is needed to grab the page after it has been rendered.
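You can verify this for yourself before reaching for Selenium: fetch a trim page with requests and check whether the spec cells are in the raw HTML. This is a minimal sketch, assuming the class name th2 used in step 3; the URL is a made-up placeholder, so substitute a real trim link collected in step 2:

import requests

# Substitute any trim-page URL collected in step 2; this one is an invented placeholder.
url = 'http://db.auto.sohu.com/somebrand/1234/123456/trim.html'
headers = {'User-Agent': 'Mozilla/5.0'}
raw = requests.get(url, headers=headers).text
# The spec cells scraped in step 3 carry class "th2"; if the table were in the raw
# HTML this would print True. For these pages it prints False: the data is JS-rendered.
print('th2' in raw)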
Step 1:
First, open the Sohu model database page, press F12 to enter developer mode, and click Doc under Network to inspect the source page data; there you can see the link for each brand.
import requests
import re
from bs4 import BeautifulSoup

def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# def getCCfirm(html):  # get the manufacturers' names and links
#     soup = BeautifulSoup(html, 'lxml')
#     string = soup.find_all(class_='con_tit')  # get the brand categories
#     pattern = re.compile('<a href="(.*?)".*?>(.*?)</a>', re.S)  # get each brand's URL and name
#     items = re.findall(pattern, str(string))
#     for item in items:
#         yield {
#             'href': item[0],
#             'name': item[1].split()
#         }

def getCarModel(html):
    soup = BeautifulSoup(html, 'lxml')
    string = soup.find_all(class_='model-a')  # get the model entries by brand
    pattern = re.compile('<a.*?href="(.*?)".*?</em>(.*?)<span>', re.S)  # get each model's URL and name
    items = re.findall(pattern, str(string))
    for item in items:
        yield {
            'href': item[0],
            'name': item[1]
        }

url = 'http://db.auto.sohu.com/home/'
html = getHtml(url)
with open('Model_ID.txt', 'w', encoding='utf-8') as f:
    for i in getCarModel(html):
        f.write(str(i) + '\n')
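For reference, each line in Model_ID.txt is just the str() of a Python dict, which is what the regexes in step 2 parse back out. A more robust alternative is to rebuild the dict with ast.literal_eval; this is only a sketch, and the sample line below is invented for illustration:

import ast
import re

# A sample line in the format step 1 writes; the path and name are invented.
line = "{'href': '//db.auto.sohu.com/somebrand/1234', 'name': 'SomeModel'}"

record = ast.literal_eval(line)  # safely rebuild the dict from its str() form
mid = re.search(r'/(\d{4})$', record['href']).group(1)  # trailing 4-digit model ID
print(record['name'], mid)  # -> SomeModel 1234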
Step 2:
Iterate over the links in the Model_ID.txt produced in step 1 to get the corresponding link for each car series.
import re
import requests

# get the brand (model) IDs from Model_ID.txt
def Uniq():
    with open('Model_ID.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"/(\d{4})'")
            uniq = re.findall(pattern, i)
            yield {
                'uniq': uniq[0]
            }

def getallyrl(mids, tids):
    with open('Model_ID.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"/(\d{4})'")
            uniq = re.findall(pattern, i)
            pattern3 = re.compile(r"'href': '(.*?)'")
            uniqHref = re.findall(pattern3, i)
            if mids == uniq:  # match the mid from the JSON against the ID parsed from this line
                urlone = 'http:' + uniqHref[0]
                urltwo = urlone + '/' + tids + '/trim.html'
                return urltwo

def getmid():
    for mid in Uniq():
        url = 'http://db.auto.sohu.com/api/model/select/trims_' + mid['uniq'] + '.json'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
        html = requests.get(url, headers=headers).text
        pattern1 = re.compile(r'"tid":(\d{6})')
        pattern2 = re.compile(r'"mid":(\d{4})')
        mids = re.findall(pattern2, html)
        tids = re.findall(pattern1, html)
        for i in tids:
            perfer = getallyrl(mids, i)
            yield {
                'all_url': perfer
            }

with open('allurls.txt', 'w', encoding='utf-8') as f:
    for i in getmid():
        f.write(str(i) + '\n')
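Since the trims_<mid>.json endpoint returns JSON, parsing it with the json module would be more robust than regexes. I haven't documented the exact structure of the payload, so this sketch simply walks the whole tree and collects every 'tid' it finds, which is what the regexes above extract:

import json
import requests

def get_tids(mid):
    """Fetch the trim list for one model ID and pull out every 'tid' value.
    Assumes the payload nests objects carrying 'tid' keys, as the regexes imply."""
    url = 'http://db.auto.sohu.com/api/model/select/trims_%s.json' % mid
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = json.loads(requests.get(url, headers=headers).text)

    tids = []
    def walk(node):  # recurse through dicts/lists since the exact nesting isn't documented
        if isinstance(node, dict):
            if 'tid' in node:
                tids.append(str(node['tid']))
            for v in node.values():
                walk(v)
        elif isinstance(node, list):
            for v in node:
                walk(v)
    walk(data)
    return tids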
Step 3:
Iterate over the links in the allurls.txt obtained in step 2 to get the final car model parameter data.
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
from random import randint
from selenium.webdriver.chrome.options import Options

start = time.perf_counter()  # time.clock() is deprecated (removed in Python 3.8)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
# browser = webdriver.Chrome()

def getHTML(url):
    browser.get(url)
    time.sleep(randint(2, 5) * 2)  # wait 4-10 s so the page can render (and to go easy on the server)
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    strtext = soup.find_all(class_='th2')
    return strtext

def rules(strtext):
    count = str(strtext).count('"th2"')  # determine the number of th2 cells
    return count

def getstep(count):
    stepnum = int(count / 208)  # 208 appears to be the number of spec rows per page; used as the stride below
    return stepnum

def getUrl():
    with open('allurls_new.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"'all_url': '(.*?)'}")
            urls = re.findall(pattern, i)
            yield {
                'url': urls[0]
            }

def clearFile():  # used later when importing into Excel
    with open('cartest.txt', 'w', encoding='utf-8'):
        pass

def clearUFile():  # used later when importing into Excel
    with open('allurls_new.txt', 'w', encoding='utf-8'):
        pass

def main():
    clearFile()
    for i in getUrl():
        try:
            strs = []
            url = i['url']
            strtext = getHTML(url)
            count = rules(strtext)
            stepnum = getstep(count)  # 0 if the page didn't load fully; range() then raises and the page is skipped
            pattern = re.compile('>(.*?)<')
            with open('cartest.txt', 'a+', encoding='utf-8') as f:
                for j in range(0, count, stepnum):
                    strmiddata = re.findall(pattern, str(strtext[j]))
                    strs.append(strmiddata)
                strs.append(url)  # record the page URL so you know which model you got to if something breaks
                for line in strs:
                    f.write(str(line) + '\n')
        except Exception:
            continue
    clearUFile()

main()
browser.quit()
elapsed = (time.perf_counter() - start) / 60
print("Time used: %d minutes" % elapsed)
Step 4:
Compare the full list of fetched links against the ones already scraped (stored in allurls0.txt), write any new links to allurls_new.txt, and append them to allurls0.txt so that the next run only crawls the new models.
str1 = []
str2 = []

with open('allurls.txt', 'r', encoding='utf-8') as f:
    for i in f:
        str1.append(i)

with open('allurls0.txt', 'r', encoding='utf-8') as a:
    for i in a:
        str2.append(i)

with open('allurls_new.txt', 'w', encoding='utf-8') as s:
    for i in str1:
        if i not in str2:  # links not seen before are new models
            s.write(i)

with open('allurls0.txt', 'a+', encoding='utf-8') as m:
    with open('allurls_new.txt', 'r', encoding='utf-8') as s:
        for i in s:
            m.write(i)  # append the new links so the next run's diff starts from here
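One note on this step: `if i not in str2` rescans the whole list for every line, which gets slow with 30,000+ links. Converting the old links to a set makes each lookup constant-time; a minimal equivalent sketch:

with open('allurls0.txt', 'r', encoding='utf-8') as f:
    seen = set(f)  # set membership checks are O(1), versus scanning a list per line

with open('allurls.txt', 'r', encoding='utf-8') as f, \
     open('allurls_new.txt', 'w', encoding='utf-8') as out:
    for line in f:
        if line not in seen:  # only lines not seen before are treated as new models
            out.write(line)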
Most of the time actually goes into step 3, because there are more than 30,000 links to iterate through, plus the delay set between each link. Emotionally I didn't want any delay, but rationally I had to set one, since I was afraid Sohu would block me. So you can imagine how much time it took... so very long...
I also wrote this up on Jianshu; the content is the same, but feel free to take a look.
Jianshu link: https://www.jianshu.com/p/e2b54c7eefb1 (not updated)
--------2018.8.8-----------
Changes:
1. I didn't check the code when I originally pasted it, which caused errors when it was copied into PyCharm and run; the code has been corrected.
2. Added each model's link to cartest.txt in step 3, so that when an error occurs you know which model you had already crawled up to.
-----2018.8.10------------
Additions:
1. Added step 4, which re-matches all the fetched links and stores any new ones in a new TXT file.
2. With the new links, iterate again to get the new models' data.