1 Fetch the content of the list pages
2 Parse the list-page content
3 Extract the detail-page links from the parsed list pages
4 Request each detail-page link
5 Parse the detail-page content (including OCR of the embedded image)
6 Write the scraped data to a file.
Before running the script, replace the Cookie value in the headers with your own.
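The Cookie is hard-coded into self.headers below. If you would rather not edit the source every time the session expires, you can read it from an environment variable at startup instead; a minimal sketch (the variable name GUAZI_COOKIE is an assumption made for this example):

import os

# Set the variable in your shell first, e.g. export GUAZI_COOKIE='antipas=...; uuid=...'
cookie = os.environ.get('GUAZI_COOKIE')
if not cookie:
    raise SystemExit('Please set the GUAZI_COOKIE environment variable')
# then assign it into the headers dict: self.headers['Cookie'] = cookie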
# -*- coding: utf-8 -*-
"""
Created on Wed May 6 22:41:39 2020
Scrape used-car listings from guazi.com
@author: Administrator
"""
# modules for fetching and parsing the pages
import requests
import time
import random
import parsel
# the next two modules are used to OCR the image on the detail page
from urllib.request import urlretrieve
import subprocess

class GuaziCrawler:
    # initialize the URLs and the request headers
    def __init__(self):
        self.list_url = 'https://www.guazi.com/bj/buy/o1/#bread'
        self.info_url = 'https://www.guazi.com/bj/6e122216f475b400x.htm#fr_page=index&fr_pos=rec&fr_no=1'
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'antipas=0i700287c4261831904729401355; uuid=f1872dd7-7d53-4a0f-8d6f-d08753ece825; clueSourceCode=%2A%2300; ganji_uuid=4877704952668124855730; sessionid=ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22self%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f1872dd7-7d53-4a0f-8d6f-d08753ece825%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%22ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26%22%7D; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A82964674616%7D; cityDomain=nn; user_city_id=142; preTime=%7B%22last%22%3A1590769222%2C%22this%22%3A1590764207%2C%22pre%22%3A1590764207%7D; lng_lat=116.366903_39.942624; gps_type=1; close_finance_popup=2020-05-30',
            'DNT': '1',
            'Host': 'www.guazi.com',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

    # fetch a single detail page (kept here for testing/debugging)
    def crawler_info_fun(self):
        time.sleep(random.randint(1, 3))
        resp = requests.get(self.info_url, headers=self.headers)
        time.sleep(random.randint(1, 3))
        self.req_con = resp.text
        # with open('guazi_info.txt', 'wb') as f:
        #     f.write(resp.content)

    # fetch every list page and store the raw HTML in the pages list
    def crawler_list_fun(self):
        self.pages = []
        for page in range(1, 51):
            self.list_url = 'https://www.guazi.com/bj/buy/o' + str(page) + '/#bread'
            time.sleep(random.randint(1, 2))
            resp = requests.get(self.list_url, headers=self.headers)
            time.sleep(random.randint(1, 2))
            self.req_con = resp.text
            self.pages.append(self.req_con)
            print('Fetched list page {}'.format(page))

    # extract the detail-page links from the stored list pages
    def get_info_url(self):
        self.url_list = []
        for page in self.pages:
            html_par = parsel.Selector(page)
            # each list page holds 40 listings
            for i in range(1, 41):
                url = html_par.xpath('/html/body/div[6]/ul/li[' + str(i) + ']/a/@href').get()
                if url is not None:  # guard against missing slots on the page
                    self.url_list.append(url)

    # fetch each detail page, parse it (including OCR of the image) and store the result
    def _info_data(self):
        self.info_list = []
        for info_url in self.url_list:
            url_tmp = 'https://www.guazi.com' + info_url
            try:
                resp = requests.get(url_tmp, headers=self.headers)
            except Exception as e:
                print(e)
                continue  # skip this listing instead of aborting the whole run
            if resp.status_code != 200:
                continue
            print(url_tmp, 'status code', resp.status_code)
            time.sleep(random.randint(1, 2))
            html_par = parsel.Selector(resp.text)
            self.info_dic = {}
            title = html_par.xpath('/html/body/div[4]/div[3]/div[2]/h2/text()').extract_first()
            self.info_dic['title'] = title.strip()
            # the registration date is rendered as an image; read its address from the src attribute
            img_xpath = '/html/body/div[4]/div[3]/div[2]/ul/li[1]/span/img/@src'
            img_url = html_par.xpath(img_xpath).get()
            # download the image locally as page.jpg
            urlretrieve(img_url, "page.jpg")
            # run tesseract on the image; its output streams are captured via subprocess.PIPE
            p = subprocess.Popen(["tesseract", "page.jpg", "page"],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            p.wait()
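            # Note: the pytesseract package is an alternative to the subprocess
            # round-trip above (an assumption: it is a separate dependency, and
            # the tesseract binary must still be installed and on PATH). A sketch:
            #   import pytesseract
            #   from PIL import Image
            #   spsj = pytesseract.image_to_string(Image.open('page.jpg')).strip()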
            # tesseract writes its result to page.txt; read it back
            f = open("page.txt", "r", encoding="utf-8")
            # registration date
            self.info_dic['spsj'] = f.read().strip()
            # listing (car source) code
            self.info_dic['cheyuan_code'] = html_par.xpath('/html/body/div[4]/div[2]/div[2]/text()').extract_first().strip()
            # odometer mileage
            self.info_dic['bxlc'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[2]/span/text()').extract_first().strip()
            # registration place
            self.info_dic['spd'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[3]/span/text()').extract_first().strip()
            # engine displacement
            self.info_dic['pl'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[4]/span/text()').extract_first().strip()
            # transmission
            self.info_dic['bsx'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[5]/span/text()').extract_first()
            # full (non-financed) price
            self.info_dic['qkj'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/div[1]/div[2]/span[1]/text()').extract_first().strip()
            self.info_list.append(self.info_dic)
            f.close()
        # write the collected records to a file, one dict per line
        f = open('guazi_page_info.txt', 'w+', encoding='utf-8')
        for li in self.info_list:
            f.write(str(li) + '\n')
        f.close()

    # extract title, year, mileage and prices from the list pages and save them
    def get_list_data(self):
        self.li_list = []
        for page in self.pages:
            html_par = parsel.Selector(page)
            for i in range(1, 41):
                self.lia_dic = {}
                base = '/html/body/div[6]/ul/li[' + str(i) + ']/a'
                self.lia_dic["title"] = html_par.xpath(base + '/h2/text()').extract_first()
                # model year
                self.lia_dic["niandu"] = html_par.xpath(base + '/div[1]/text()[1]').extract_first()
                # mileage
                self.lia_dic["gongli"] = html_par.xpath(base + '/div[1]/text()[2]').extract_first()
                # current price
                self.lia_dic["xianjia"] = html_par.xpath(base + '/div[2]/p/text()').extract_first()
                # original price
                self.lia_dic["yuanjia"] = html_par.xpath(base + '/div[2]/em/text()').extract_first()
                self.li_list.append(self.lia_dic)
        f = open('guazi_list_info.txt', 'w+', encoding='utf-8')
        for li in self.li_list:
            f.write(str(li) + '\n')
        f.close()

    def run_info_scraper(self):
        self.crawler_list_fun()
        self.get_info_url()
        self._info_data()

    def run_list_scraper(self):
        self.crawler_list_fun()
        self.get_list_data()


if __name__ == '__main__':
    gc = GuaziCrawler()
    gc.run_info_scraper()    # scrape the detail pages
    # gc.run_list_scraper()  # scrape the list pages
The steps above complete the script. It is shared for learning purposes only; please do not use it for large-scale crawling.
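The script saves each record as the str() of a Python dict. If you want a file that other programs can load back reliably, JSON lines are a drop-in change; a sketch (the function name and the .jsonl filename are chosen here for illustration):

import json

def write_jsonl(records, path='guazi_page_info.jsonl'):
    # one JSON object per line; ensure_ascii=False keeps the Chinese text readable
    with open(path, 'w', encoding='utf-8') as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')

Reading the file back is then just a json.loads on each line.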