Scraping Sohu Car Model Data with Python

Main tools: Python + Selenium + Excel

I needed car specification data for work, and none of the tutorials and articles I found online did what I wanted, so I slowly worked out this code myself. Some of it may be clumsy and unnecessary, but I'm only a beginner who knows a little; if you have any ways to optimize it, advice is welcome~

If, like me, you need car specification data, you can follow my approach. One drawback of Selenium is that it is very slow, and sometimes it fails to grab the data; but if you are patient like me and have a decent connection, the data can definitely be scraped, perhaps in less time than it took me. Collecting it all took me a whole week. Sigh, my hopeless internet speed.

The data to be collected is shown below:

(figure: screenshot of the scraped car parameter data)

The Python part

The Sohu car model database pages can't be scraped with requests alone (the response doesn't contain the spec data), so Selenium is needed to grab the data after the page has been rendered.
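A quick way to check whether a page really needs browser rendering is to fetch it with requests and search for the CSS class the scraper will rely on. A minimal sketch (the trim URL here is hypothetical, following the pattern built in step 2):

import requests

# made-up brand/mid/tid; real URLs follow http://db.auto.sohu.com/<brand>/<mid>/<tid>/trim.html
url = 'http://db.auto.sohu.com/somebrand/1044/123456/trim.html'
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(url, headers=headers).text

# the spec cells carry class "th2" (see step 3); if this prints 0,
# the data is filled in by JavaScript and Selenium is needed
print(html.count('"th2"'))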

Step 1:

First open the Sohu model database page, press F12 to enter developer tools, and click the Doc entries under Network to inspect the page source; there you can see the link for each brand.

(figure: DevTools screenshot of the brand links in the page source)

import requests
import re
from bs4 import BeautifulSoup


def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# def getCCfirm(html):  # get each manufacturer's name and link
#     soup = BeautifulSoup(html, 'lxml')
#     string = soup.find_all(class_='con_tit')  # the manufacturer blocks
#     pattern = re.compile('<a href="(.*?)".*?>(.*?)</a>', re.S)  # extract each URL and name
#     items = re.findall(pattern, str(string))
#     for item in items:
#         yield {
#             'href': item[0],
#             'name': item[1].split()
#         }


def getCarModel(html):
    soup = BeautifulSoup(html, 'lxml')
    string = soup.find_all(class_='model-a')  # the brand blocks of the model database
    pattern = re.compile('<a.*?href="(.*?)".*?</em>(.*?)<span>', re.S)  # extract each brand's URL and name
    items = re.findall(pattern, str(string))
    for item in items:
        yield {
            'href': item[0],
            'name': item[1]
        }


url = 'http://db.auto.sohu.com/home/'
html = getHtml(url)
with open('Model_ID.txt', 'w', encoding='utf-8') as f:  # the with block closes the file automatically
    for i in getCarModel(html):
        f.write(str(i) + '\n')
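Each line of Model_ID.txt is the str() of a dict, roughly {'href': '//db.auto.sohu.com/somebrand/1044', 'name': 'XX'} (these values are made up for illustration); step 2 parses the mid and href back out of those lines with regular expressions. If you prefer, writing JSON lines makes that round trip sturdier. A sketch of the variant, reusing getCarModel and html from the script above:

import json

# variant of the write loop above: one JSON object per line instead of str(dict)
with open('Model_ID.txt', 'w', encoding='utf-8') as f:
    for i in getCarModel(html):
        f.write(json.dumps(i, ensure_ascii=False) + '\n')

# a later step can then read it back without any regular expressions
with open('Model_ID.txt', 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['href'], record['name'])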

Step 2:

Iterate over the links in the Model_ID.txt produced in step 1 to get the trim-page link for each model; each final link has the form http:<brand href>/<tid>/trim.html.

import re
import requests


# extract the brand ID (mid) from each line of Model_ID.txt
def Uniq():
    with open('Model_ID.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"/(\d{4})'")
            uniq = re.findall(pattern, i)
            yield {
                'uniq': uniq[0]
            }


def getallurl(mids, tids):
    # look the mid up in Model_ID.txt again and build the trim-page URL from its href
    with open('Model_ID.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"/(\d{4})'")
            uniq = re.findall(pattern, i)
            pattern3 = re.compile(r"'href': '(.*?)'")
            uniqHref = re.findall(pattern3, i)
            if uniq and uniq[0] in mids:  # match this line's mid against the mids found in the trims JSON
                urlone = 'http:' + uniqHref[0]
                urltwo = urlone + '/' + tids + '/trim.html'
                return urltwo


def getmid():
    for mid in Uniq():
        url = 'http://db.auto.sohu.com/api/model/select/trims_' + mid['uniq'] + '.json'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
        html = requests.get(url, headers=headers).text
        pattern1 = re.compile(r'"tid":(\d{6})')  # six-digit trim IDs
        pattern2 = re.compile(r'"mid":(\d{4})')  # four-digit model IDs
        mids = re.findall(pattern2, html)
        tids = re.findall(pattern1, html)
        for i in tids:
            full_url = getallurl(mids, i)
            yield {
                'all_url': full_url
            }


with open('allurls.txt', 'w', encoding='utf-8') as f:
    for i in getmid():
        f.write(str(i) + '\n')
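Since trims_<mid>.json is a JSON endpoint, an alternative to regexing "mid" and "tid" out of the raw text is to parse it with the json module and walk the structure. A minimal sketch, assuming only that the response is valid JSON with "mid" and "tid" keys somewhere inside (the exact layout is not relied on, and the mid in the URL is made up):

import json
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}


def collect_ids(node, key, found):
    # walk nested dicts and lists, gathering every value stored under `key`
    if isinstance(node, dict):
        for k, v in node.items():
            if k == key:
                found.append(str(v))
            collect_ids(v, key, found)
    elif isinstance(node, list):
        for item in node:
            collect_ids(item, key, found)
    return found


url = 'http://db.auto.sohu.com/api/model/select/trims_1044.json'  # 1044 is a made-up mid
data = json.loads(requests.get(url, headers=HEADERS).text)
mids = collect_ids(data, 'mid', [])
tids = collect_ids(data, 'tid', [])
print(mids, tids)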

Step 3:

Iterate over the links obtained in step 2 to get the final car specification data. (The code below reads allurls_new.txt, which step 4 produces; on a first run, when allurls0.txt is empty, it has the same content as allurls.txt.)

from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
from random import randint
from selenium.webdriver.chrome.options import Options

start = time.perf_counter()  # time.clock() is deprecated and was removed in Python 3.8
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
# browser = webdriver.Chrome()


def getHTML(url):
    browser.get(url)
    time.sleep(randint(2, 5) * 2)  # pause 4-10 s per page so Sohu does not block the crawl
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    strtext = soup.find_all(class_="th2")
    return strtext


def rules(strtext):
    count = str(strtext).count('"th2"')  # total number of th2 cells on the page
    return count


def getstep(count):
    stepnum = int(count / 208)  # the trim pages appear to have 208 parameter rows, so this is the cells per row
    return stepnum


def getUrl():
    with open('allurls_new.txt', 'r', encoding='utf-8') as f:
        for i in f:
            pattern = re.compile(r"'all_url': '(.*?)'}")
            urls = re.findall(pattern, i)
            yield {
                'url': urls[0]
            }


def clearFile():  # needed later when importing into Excel
    with open('cartest.txt', 'w', encoding='utf-8'):
        pass


def clearUFile():  # needed later when importing into Excel
    with open('allurls_new.txt', 'w', encoding='utf-8'):
        pass


def main():
    clearFile()
    for item in getUrl():
        try:
            strs = []
            url = item['url']
            strtext = getHTML(url)
            count = rules(strtext)
            stepnum = getstep(count)
            pattern = re.compile('>(.*?)<')
            with open('cartest.txt', 'a+', encoding='utf-8') as f:
                for j in range(0, count, stepnum):  # take one cell per parameter row
                    strmiddata = re.findall(pattern, str(strtext[j]))
                    strs.append(strmiddata)
                strs.append(url)  # record the URL so a failed run shows how far it got
                for line in strs:
                    f.write(str(line) + '\n')
        except Exception:  # malformed pages (e.g. too few cells) are skipped
            continue
    clearUFile()


main()
browser.quit()  # close() only closes the window; quit() ends the driver session
elapsed = (time.perf_counter() - start) / 60
print("Time used: %d minutes" % elapsed)
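The fixed time.sleep(randint(2, 5) * 2) waits 4 to 10 seconds on every page even when it renders in one. Selenium's explicit waits return as soon as the element you need appears; here is a sketch of a drop-in body for getHTML, reusing the browser instance and the th2 class from the script above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def getHTML(url):
    browser.get(url)
    # wait up to 10 s for the first th2 cell instead of sleeping a fixed time
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'th2'))
    )
    soup = BeautifulSoup(browser.page_source, 'lxml')
    return soup.find_all(class_='th2')

A short random pause between pages is still worth keeping so the crawl stays polite.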

Step 4:

Compare all the links from step 2 against the links that have already been crawled (stored in allurls0.txt); any new ones are written to allurls_new.txt so that only new models are scraped next time, and then appended to allurls0.txt.

str1 = []
str2 = []
with open('allurls.txt', 'r', encoding='utf-8') as f:  # every link from step 2
    for i in f:
        str1.append(i)
with open('allurls0.txt', 'r', encoding='utf-8') as a:  # links already crawled (create an empty file on the first run)
    for i in a:
        str2.append(i)
with open('allurls_new.txt', 'w', encoding='utf-8') as s:  # keep only the new links
    for i in str1:
        if i not in str2:
            s.write(i)
with open('allurls0.txt', 'a+', encoding='utf-8') as m:  # then mark the new links as crawled
    with open('allurls_new.txt', 'r', encoding='utf-8') as s:
        for i in s:
            m.write(i)
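With 30,000-odd links, the "if i not in str2" test rescans the whole list for every link. Loading the crawled links into a set makes each membership test constant-time; a sketch of the same diffing step:

# read the already-crawled links into a set for O(1) membership tests
with open('allurls0.txt', 'r', encoding='utf-8') as a:
    crawled = set(a)

with open('allurls.txt', 'r', encoding='utf-8') as f, \
        open('allurls_new.txt', 'w', encoding='utf-8') as s:
    for line in f:
        if line not in crawled:
            s.write(line)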

In fact most of the time goes into step 3, because more than 30,000 links have to be traversed, with a pause between each one. Emotionally I didn't want any delay, but rationally it has to be there, since I was afraid Sohu would ban me. So you can imagine how long it takes... very long. (With a 4-10 second pause per page, the sleeps alone average about 7 s × 30,000 ≈ 58 hours.)

I have also posted this on Jianshu, so feel free to take a look there, ahem, although the content is the same.

Jianshu link: https://www.jianshu.com/p/e2b54c7eefb1 (not updated)

--------2018.8.8-----------

Fixes:

1. I didn't check the code when I originally pasted it in, so it raised errors when copied into PyCharm and run; the code has been corrected.

2. Added each model's link to cartest.txt in step 3, so that when an error occurs I know which model the crawl had reached.

-----2018.8.10------------

Added:

1. Added step 4, which re-matches all the collected links and stores any new ones in a new TXT file.

2. The new links are then traversed to get the data for the new models.


 
