1 Fetch the content of the list pages
2 Parse the list-page content
3 Extract the detail-page links from the parsed list pages
4 Request each detail-page link
5 Parse the detail-page content (including OCR of the embedded image)
6 Write the scraped data to a file.
Before running the script, replace the Cookie value in the headers with your own.
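The Cookie is hard-coded into self.headers below. If you would rather not edit the source every time the session expires, you can read it from an environment variable at startup instead; a minimal sketch (the variable name GUAZI_COOKIE is an assumption made for this example):

import os

# Set the variable in your shell first, e.g. export GUAZI_COOKIE='antipas=...; uuid=...'
cookie = os.environ.get('GUAZI_COOKIE')
if not cookie:
    raise SystemExit('Please set the GUAZI_COOKIE environment variable')
# then assign it into the headers dict: self.headers['Cookie'] = cookie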
# -*- coding: utf-8 -*-
"""
Created on Wed May 6 22:41:39 2020
Scrape used-car listings from guazi.com
@author: Administrator
"""
# modules for fetching and parsing the pages
import requests
import time
import random
import parsel
# the next two modules are used to OCR the image on the detail page
from urllib.request import urlretrieve
import subprocess

class GuaziCrawler:
    # initialize the URLs and the request headers
    def __init__(self):
        self.list_url = 'https://www.guazi.com/bj/buy/o1/#bread'
        self.info_url = 'https://www.guazi.com/bj/6e122216f475b400x.htm#fr_page=index&fr_pos=rec&fr_no=1'
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'antipas=0i700287c4261831904729401355; uuid=f1872dd7-7d53-4a0f-8d6f-d08753ece825; clueSourceCode=%2A%2300; ganji_uuid=4877704952668124855730; sessionid=ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22self%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f1872dd7-7d53-4a0f-8d6f-d08753ece825%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%22ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26%22%7D; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A82964674616%7D; cityDomain=nn; user_city_id=142; preTime=%7B%22last%22%3A1590769222%2C%22this%22%3A1590764207%2C%22pre%22%3A1590764207%7D; lng_lat=116.366903_39.942624; gps_type=1; close_finance_popup=2020-05-30',
            'DNT': '1',
            'Host': 'www.guazi.com',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

    # fetch a single detail page (kept here for testing/debugging)
    def crawler_info_fun(self):
        time.sleep(random.randint(1, 3))
        resp = requests.get(self.info_url, headers=self.headers)
        time.sleep(random.randint(1, 3))
        self.req_con = resp.text
        # with open('guazi_info.txt', 'wb') as f:
        #     f.write(resp.content)

    # fetch every list page and store the raw HTML in the pages list
    def crawler_list_fun(self):
        self.pages = []
        for page in range(1, 51):
            self.list_url = 'https://www.guazi.com/bj/buy/o' + str(page) + '/#bread'
            time.sleep(random.randint(1, 2))
            resp = requests.get(self.list_url, headers=self.headers)
            time.sleep(random.randint(1, 2))
            self.req_con = resp.text
            self.pages.append(self.req_con)
            print('Fetched list page {}'.format(page))

    # extract the detail-page links from the stored list pages
    def get_info_url(self):
        self.url_list = []
        for page in self.pages:
            html_par = parsel.Selector(page)
            # each list page holds 40 listings
            for i in range(1, 41):
                url = html_par.xpath('/html/body/div[6]/ul/li[' + str(i) + ']/a/@href').get()
                if url is not None:  # guard against missing slots on the page
                    self.url_list.append(url)

    # fetch each detail page, parse it (including OCR of the image) and store the result
    def _info_data(self):
        self.info_list = []
        for info_url in self.url_list:
            url_tmp = 'https://www.guazi.com' + info_url
            try:
                resp = requests.get(url_tmp, headers=self.headers)
            except Exception as e:
                print(e)
                continue  # skip this listing instead of aborting the whole run
            if resp.status_code != 200:
                continue
            print(url_tmp, 'status code', resp.status_code)
            time.sleep(random.randint(1, 2))
            html_par = parsel.Selector(resp.text)
            self.info_dic = {}
            title = html_par.xpath('/html/body/div[4]/div[3]/div[2]/h2/text()').extract_first()
            self.info_dic['title'] = title.strip()
            # the registration date is rendered as an image; read its address from the src attribute
            img_xpath = '/html/body/div[4]/div[3]/div[2]/ul/li[1]/span/img/@src'
            img_url = html_par.xpath(img_xpath).get()
            # download the image locally as page.jpg
            urlretrieve(img_url, "page.jpg")
            # run tesseract on the image; its output streams are captured via subprocess.PIPE
            p = subprocess.Popen(["tesseract", "page.jpg", "page"],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            p.wait()
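            # Note: the pytesseract package is an alternative to the subprocess
            # round-trip above (an assumption: it is a separate dependency, and
            # the tesseract binary must still be installed and on PATH). A sketch:
            #   import pytesseract
            #   from PIL import Image
            #   spsj = pytesseract.image_to_string(Image.open('page.jpg')).strip()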
            # tesseract writes its result to page.txt; read it back
            f = open("page.txt", "r", encoding="utf-8")
            # registration date
            self.info_dic['spsj'] = f.read().strip()
            # listing (car source) code
            self.info_dic['cheyuan_code'] = html_par.xpath('/html/body/div[4]/div[2]/div[2]/text()').extract_first().strip()
            # odometer mileage
            self.info_dic['bxlc'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[2]/span/text()').extract_first().strip()
            # registration place
            self.info_dic['spd'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[3]/span/text()').extract_first().strip()
            # engine displacement
            self.info_dic['pl'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[4]/span/text()').extract_first().strip()
            # transmission
            self.info_dic['bsx'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[5]/span/text()').extract_first()
            # full (non-financed) price
            self.info_dic['qkj'] = html_par.xpath('/html/body/div[4]/div[3]/div[2]/div[1]/div[2]/span[1]/text()').extract_first().strip()
            self.info_list.append(self.info_dic)
            f.close()
        # write the collected records to a file, one dict per line
        f = open('guazi_page_info.txt', 'w+', encoding='utf-8')
        for li in self.info_list:
            f.write(str(li) + '\n')
        f.close()

    # extract title, year, mileage and prices from the list pages and save them
    def get_list_data(self):
        self.li_list = []
        for page in self.pages:
            html_par = parsel.Selector(page)
            for i in range(1, 41):
                self.lia_dic = {}
                base = '/html/body/div[6]/ul/li[' + str(i) + ']/a'
                self.lia_dic["title"] = html_par.xpath(base + '/h2/text()').extract_first()
                # model year
                self.lia_dic["niandu"] = html_par.xpath(base + '/div[1]/text()[1]').extract_first()
                # mileage
                self.lia_dic["gongli"] = html_par.xpath(base + '/div[1]/text()[2]').extract_first()
                # current price
                self.lia_dic["xianjia"] = html_par.xpath(base + '/div[2]/p/text()').extract_first()
                # original price
                self.lia_dic["yuanjia"] = html_par.xpath(base + '/div[2]/em/text()').extract_first()
                self.li_list.append(self.lia_dic)
        f = open('guazi_list_info.txt', 'w+', encoding='utf-8')
        for li in self.li_list:
            f.write(str(li) + '\n')
        f.close()

    def run_info_scraper(self):
        self.crawler_list_fun()
        self.get_info_url()
        self._info_data()

    def run_list_scraper(self):
        self.crawler_list_fun()
        self.get_list_data()


if __name__ == '__main__':
    gc = GuaziCrawler()
    gc.run_info_scraper()    # scrape the detail pages
    # gc.run_list_scraper()  # scrape the list pages
The steps above complete the script. It is shared for learning purposes only; please do not use it for large-scale crawling.
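The script saves each record as the str() of a Python dict. If you want a file that other programs can load back reliably, JSON lines are a drop-in change; a sketch (the function name and the .jsonl filename are chosen here for illustration):

import json

def write_jsonl(records, path='guazi_page_info.jsonl'):
    # one JSON object per line; ensure_ascii=False keeps the Chinese text readable
    with open(path, 'w', encoding='utf-8') as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')

Reading the file back is then just a json.loads on each line.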