Scraping Guazi used-car data with Python (requests + parsel)

1 Fetch the content of the list pages
2 Parse the list-page content (see the parsel sketch below)
3 Extract the detail-page links from the list-page content
4 Request each detail-page link
5 Parse the detail-page content (including image OCR)
6 Write the collected data to a file.
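
Steps 2 and 5 rely on parsel for the XPath parsing. Here is a tiny self-contained sketch of the parsel API used throughout the script (Selector, .xpath(), .get()/.extract_first()), run against made-up HTML rather than the real site:

import parsel

html = '<ul><li><a href="/bj/abc.htm"><h2>Sample Car</h2></a></li></ul>'
sel = parsel.Selector(html)
# .get() and .extract_first() are equivalent: both return the first match, or None
print(sel.xpath('//li[1]/a/@href').get())                 # /bj/abc.htm
print(sel.xpath('//li[1]/a/h2/text()').extract_first())   # Sample Car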

Before running the script, replace the Cookie value in the headers with a fresh one copied from your own browser session; the hardcoded one below expires quickly.
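
If you prefer not to hardcode the cookie at all, one option is to read it from an environment variable instead. A minimal sketch, assuming a hypothetical GUAZI_COOKIE variable exported before running:

import os

# GUAZI_COOKIE is a hypothetical variable name; set it first, e.g.
#   export GUAZI_COOKIE='antipas=...; uuid=...'
cookie = os.environ.get('GUAZI_COOKIE')
if not cookie:
    raise SystemExit('Set the GUAZI_COOKIE environment variable before running.')
# Then, inside GuaziCrawler.__init__: self.headers['Cookie'] = cookie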

# -*- coding: utf-8 -*-
"""
Created on Wed May  6 22:41:39 2020
Scrape used-car data from guazi.com
@author: Administrator
"""
# Modules for fetching pages and parsing HTML
import requests
import time
import random
import parsel

# The two modules below are used to download and OCR images from the site
from urllib.request import urlretrieve
import subprocess

class GuaziCrawler:
    # Initialize URLs and request headers
    def __init__(self):
        self.list_url='https://www.guazi.com/bj/buy/o1/#bread'
        self.info_url='https://www.guazi.com/bj/6e122216f475b400x.htm#fr_page=index&fr_pos=rec&fr_no=1'
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                        'Cache-Control': 'max-age=0',
                        'Connection': 'keep-alive',
                        'Cookie': 'antipas=0i700287c4261831904729401355; uuid=f1872dd7-7d53-4a0f-8d6f-d08753ece825; clueSourceCode=%2A%2300; ganji_uuid=4877704952668124855730; sessionid=ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22self%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f1872dd7-7d53-4a0f-8d6f-d08753ece825%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%22ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26%22%7D; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A82964674616%7D; cityDomain=nn; user_city_id=142; preTime=%7B%22last%22%3A1590769222%2C%22this%22%3A1590764207%2C%22pre%22%3A1590764207%7D; lng_lat=116.366903_39.942624; gps_type=1; close_finance_popup=2020-05-30',
                        'DNT': '1',
                        'Host': 'www.guazi.com',
                        'Sec-Fetch-Dest': 'document',
                        'Sec-Fetch-Mode': 'navigate',
                        'Sec-Fetch-Site': 'none',
                        'Sec-Fetch-User': '?1',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
                                    
    
    # Fetch the single sample detail page (standalone test helper; not used by the pipelines below)
    def crawler_info_fun(self):
        time.sleep(random.randint(1,3))
        resp = requests.get(self.info_url,headers=self.headers)
        time.sleep(random.randint(1,3))
        self.req_con = resp.text
        # Optionally dump the raw page for offline inspection:
        # with open('guazi_info.txt','wb') as f:
        #     f.write(resp.content)
    # Fetch every list page and store its HTML in self.pages
    def crawler_list_fun(self):
        self.pages =[]
        for page in range(1,51):  # list pages 1-50
            self.list_url='https://www.guazi.com/bj/buy/o'+str(page)+'/#bread'
            time.sleep(random.randint(1,2))
            resp = requests.get(self.list_url,headers=self.headers)
            time.sleep(random.randint(1,2))
            self.req_con = resp.text
            self.pages.append(self.req_con)
            #print(self.req_con)
            print('Fetched list page {}'.format(page))
    
    # Extract the detail-page links from every stored list page
    def get_info_url(self):
        self.url_list=[]
        for page in self.pages:
            html_par = parsel.Selector(page)
            for i in range(1,41):  # up to 40 cars per list page
                url = html_par.xpath('/html/body/div[6]/ul/li['+str(i)+']/a/@href').get()
                # guard against empty slots so None never reaches the URL join below
                if url:
                    self.url_list.append(url)

        #print(self.url_list)
    # Fetch each detail page, parse its fields, and write everything to a file
    def _info_data(self):
        self.info_list=[]
        for info_url in self.url_list:
            url_tmp = 'https://www.guazi.com'+info_url
            try:
                resp = requests.get(url_tmp,headers=self.headers)
            except Exception as e:
                print(e)
                continue  # skip this car instead of aborting the whole run
            if resp.status_code != 200:
                continue
            print(url_tmp,'response status',resp.status_code)
            time.sleep(random.randint(1,2))
            html_par = parsel.Selector(resp.text)
            self.info_dic={}
            
            title = html_par.xpath('/html/body/div[4]/div[3]/div[2]/h2/text()').extract_first()
            self.info_dic['title'] = title.strip()

            # The registration date is rendered as an image, so it has to be OCR'd:
            # grab the image address from the src attribute ...
            img_xpath = '/html/body/div[4]/div[3]/div[2]/ul/li[1]/span/img/@src'
            img_url = html_par.xpath(img_xpath).get()
            #print(img_url)
            # ... download it locally as page.jpg ...
            urlretrieve(img_url, "page.jpg")
            # ... and run tesseract on it, capturing stdout/stderr via subprocess.PIPE
            p = subprocess.Popen(["tesseract", "page.jpg", "page"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            p.wait()
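            # Note: tesseract appends ".txt" to the output base name ("page"),
            # so the OCR result lands in page.txt, which is read back in below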
            # Read the OCR output back in
            f = open("page.txt", "r")
            # registration date (上牌時間)
            self.info_dic['spsj'] = f.read().strip()
            # car source code (車源編碼)
            self.info_dic['cheyuan_code'] = html_par.xpath('/html/body/div[4]/div[2]/div[2]/text()').extract_first().strip()
            # displayed mileage (表顯里程)
            self.info_dic['bxlc'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[2]/span/text()').extract_first().strip()
            # registration place (上牌地)
            self.info_dic['spd'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[3]/span/text()').extract_first().strip()
            # engine displacement (排量)
            self.info_dic['pl'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[4]/span/text()').extract_first().strip()
            # transmission (變速箱)
            self.info_dic['bsx'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[5]/span/text()').extract_first()
            # full cash price (全款價)
            self.info_dic['qkj'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/div[1]/div[2]/span[1]/text()').extract_first().strip()
            self.info_list.append(self.info_dic)
            f.close()
        # Write one repr(dict) line per car
        f = open('guazi_page_info.txt','w+',encoding='utf-8')
        for li in self.info_list:
            f.write(str(li)+'\n')
        f.close()
    
    # Parse the summary fields out of every stored list page and save them
    def get_list_data(self):
        self.li_list=[]
        for page in self.pages:
            html_par = parsel.Selector(page)
            for i in range(1,41):  # up to 40 cars per list page
                self.lia_dic={}
                # title, year (niandu), mileage (gongli), current price (xianjia), original price (yuanjia)
                self.lia_dic["title"]= html_par.xpath('/html/body/div[6]/ul/li['+str(i)+']/a/h2/text()').extract_first()
                self.lia_dic["niandu"]= html_par.xpath('/html/body/div[6]/ul/li['+str(i)+']/a/div[1]/text()[1]').extract_first()
                self.lia_dic["gongli"]= html_par.xpath('/html/body/div[6]/ul/li['+str(i)+']/a/div[1]/text()[2]').extract_first()
                self.lia_dic["xianjia"]= html_par.xpath('/html/body/div[6]/ul/li['+str(i)+']/a/div[2]/p/text()').extract_first()
                self.lia_dic["yuanjia"]= html_par.xpath('/html/body/div[6]/ul/li['+str(i)+']/a/div[2]/em/text()').extract_first()
                self.li_list.append(self.lia_dic)
        
        f = open('guazi_list_info.txt','w+',encoding='utf-8')
        for li in self.li_list:
            f.write(str(li)+'\n')
        f.close()
         
    # Full pipeline: list pages -> detail links -> detail-page data
    def run_info_scraper(self):
        self.crawler_list_fun()
        self.get_info_url()
        self._info_data()

    # Pipeline for the list-page summaries only
    def run_list_scraper(self):
        self.crawler_list_fun()
        self.get_list_data()

gc = GuaziCrawler()

gc.run_info_scraper() # scrape the detail pages
#gc.run_list_scraper() # scrape only the list-page summaries
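
Step 6 writes one repr(dict) line per car to guazi_page_info.txt. A minimal post-processing sketch (not part of the original script) that converts that file into CSV afterwards, assuming every non-empty line is the repr of a dict as written above:

import ast
import csv

# Parse each repr(dict) line back into a dict
rows = []
with open('guazi_page_info.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            rows.append(ast.literal_eval(line))

# Write all rows to CSV, using the keys of the first row as the header
if rows:
    with open('guazi_page_info.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)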

The steps above complete the script. It is shared for learning purposes only; please do not use it for large-scale crawling.
