1 獲取列表頁的內容
2 解析列表頁的內容
3 從列表頁的內容中獲取詳情頁的鏈接
4 請求詳情頁鏈接
5 獲取詳情頁內容(含圖片解析)
6 將獲取的數據寫入文件。
運行腳本前,請先替換一下 Cookie 值
# -*- coding: utf-8 -*-
"""
Created on Wed May 6 22:41:39 2020
獲取瓜子二手車的數據
@author: Administrator
"""
#引入爬取數據模塊
import requests
import time
import random
import parsel
#以下兩個模塊是對在線圖片的解析
from urllib.request import urlretrieve
import subprocess
class GuaziCrawler:
    """Crawler for used-car data on guazi.com (Beijing listings).

    Workflow:
      1. Fetch listing pages 1-50 and keep their raw HTML (``crawler_list_fun``).
      2. Either parse the listing fields directly (``get_list_data``), or
         extract each car's detail-page link (``get_info_url``) and scrape the
         detail pages (``_info_data``), OCR-ing the registration-date image
         with the external ``tesseract`` binary.
      3. Write the collected records, one dict per line, to a text file.

    NOTE(review): requires the third-party packages ``requests`` and
    ``parsel`` plus a ``tesseract`` executable on PATH, and a fresh Cookie
    value pasted into ``self.headers`` before each run.
    """

    def __init__(self):
        # Seed URLs; list_url is rewritten per page inside crawler_list_fun().
        self.list_url = 'https://www.guazi.com/bj/buy/o1/#bread'
        self.info_url = 'https://www.guazi.com/bj/6e122216f475b400x.htm#fr_page=index&fr_pos=rec&fr_no=1'
        # Browser-captured headers; the Cookie expires quickly and must be
        # replaced with a fresh value before running the script.
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                        'Cache-Control': 'max-age=0',
                        'Connection': 'keep-alive',
                        'Cookie': 'antipas=0i700287c4261831904729401355; uuid=f1872dd7-7d53-4a0f-8d6f-d08753ece825; clueSourceCode=%2A%2300; ganji_uuid=4877704952668124855730; sessionid=ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26; lg=1; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22self%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22-%22%2C%22ca_campaign%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f1872dd7-7d53-4a0f-8d6f-d08753ece825%22%2C%22ca_city%22%3A%22bj%22%2C%22sessionid%22%3A%22ed6aa56e-b69d-4fa5-aa2f-7c20d710fa26%22%7D; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A82964674616%7D; cityDomain=nn; user_city_id=142; preTime=%7B%22last%22%3A1590769222%2C%22this%22%3A1590764207%2C%22pre%22%3A1590764207%7D; lng_lat=116.366903_39.942624; gps_type=1; close_finance_popup=2020-05-30',
                        'DNT': '1',
                        'Host': 'www.guazi.com',
                        'Sec-Fetch-Dest': 'document',
                        'Sec-Fetch-Mode': 'navigate',
                        'Sec-Fetch-Site': 'none',
                        'Sec-Fetch-User': '?1',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

    def _xpath_text(self, selector, xp):
        """Return the stripped text of the first node matching *xp*, or '' when missing.

        Guards against ``extract_first()`` returning None, which previously
        crashed the scrape with AttributeError on ``.strip()``.
        """
        value = selector.xpath(xp).extract_first()
        return value.strip() if value else ''

    def crawler_info_fun(self):
        """Fetch the single detail page at ``self.info_url`` into ``self.req_con``."""
        # Random delays before and after the request to look less bot-like.
        time.sleep(random.randint(1, 3))
        resp = requests.get(self.info_url, headers=self.headers)
        time.sleep(random.randint(1, 3))
        self.req_con = resp.text

    def crawler_list_fun(self):
        """Fetch listing pages 1-50 and store their raw HTML in ``self.pages``."""
        self.pages = []
        for page in range(1, 51):
            self.list_url = 'https://www.guazi.com/bj/buy/o' + str(page) + '/#bread'
            time.sleep(random.randint(1, 2))  # politeness delay
            resp = requests.get(self.list_url, headers=self.headers)
            time.sleep(random.randint(1, 2))
            self.req_con = resp.text
            self.pages.append(self.req_con)
            print('獲取第{}頁'.format(page))

    def get_info_url(self):
        """Extract every detail-page href from the stored listing pages into ``self.url_list``."""
        self.url_list = []
        for page in self.pages:
            html_par = parsel.Selector(page)
            for i in range(1, 41):  # up to 40 cars per listing page
                url = html_par.xpath('/html/body/div[6]/ul/li[' + str(i) + ']/a/@href').get()
                # BUGFIX: skip missing entries; a None href previously crashed
                # the later 'https://www.guazi.com' + url concatenation.
                if url:
                    self.url_list.append(url)

    def _info_data(self):
        """Fetch each detail page, parse its fields (OCR for the plate date), and save.

        Results accumulate in ``self.info_list`` and are written one dict per
        line to ``guazi_page_info.txt``.
        """
        self.info_list = []
        for info_url in self.url_list:
            url_tmp = 'https://www.guazi.com' + info_url
            try:
                resp = requests.get(url_tmp, headers=self.headers)
            except Exception as e:
                # BUGFIX: was a bare ``return`` which aborted the whole crawl
                # (and never wrote the output file) on one failed request.
                print(e)
                continue
            if resp.status_code != 200:
                continue  # BUGFIX: likewise, skip rather than abort
            print(url_tmp, '相應結果', resp.status_code)
            time.sleep(random.randint(1, 2))
            html_par = parsel.Selector(resp.text)
            self.info_dic = {}
            self.info_dic['title'] = self._xpath_text(html_par, '/html/body/div[4]/div[3]/div[2]/h2/text()')
            # The registration date is rendered as an image; download it and
            # OCR it with the external tesseract binary, which writes page.txt.
            img_url = html_par.xpath('/html/body/div[4]/div[3]/div[2]/ul/li[1]/span/img/@src').get()
            spsj = ''
            if img_url:
                urlretrieve(img_url, "page.jpg")
                subprocess.run(["tesseract", "page.jpg", "page"],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                with open("page.txt", "r") as f:
                    spsj = f.read().strip()
            # 上牌時間 (registration date, from OCR)
            self.info_dic['spsj'] = spsj
            # 車源編碼 (car-source code)
            self.info_dic['cheyuan_code'] = self._xpath_text(html_par, '/html/body/div[4]/div[2]/div[2]/text()')
            # 表顯里程 (displayed mileage)
            self.info_dic['bxlc'] = self._xpath_text(html_par, '/html/body/div[4]/div[3]/div[2]/ul/li[2]/span/text()')
            # 上牌地 (registration place)
            self.info_dic['spd'] = self._xpath_text(html_par, '/html/body/div[4]/div[3]/div[2]/ul/li[3]/span/text()')
            # 排量 (engine displacement)
            self.info_dic['pl'] = self._xpath_text(html_par, '/html/body/div[4]/div[3]/div[2]/ul/li[4]/span/text()')
            # 變速箱 (gearbox) -- now stripped like the other fields
            self.info_dic['bsx'] = self._xpath_text(html_par, '/html/body/div[4]/div[3]/div[2]/ul/li[5]/span/text()')
            # 全款價 (full price)
            self.info_dic['qkj'] = self._xpath_text(html_par, '/html/body/div[4]/div[3]/div[2]/div[1]/div[2]/span[1]/text()')
            self.info_list.append(self.info_dic)
        # Persist everything collected, one dict repr per line.
        with open('guazi_page_info.txt', 'w+') as f:
            for li in self.info_list:
                f.write(str(li) + '\n')

    def get_list_data(self):
        """Parse title/year/mileage/price fields from the stored listing pages and save.

        Writes one dict per line to ``guazi_list_info.txt``.
        """
        self.li_list = []
        for page in self.pages:
            html_par = parsel.Selector(page)
            for i in range(1, 41):
                base = '/html/body/div[6]/ul/li[' + str(i) + ']/a'
                self.lia_dic = {}
                self.lia_dic["title"] = html_par.xpath(base + '/h2/text()').extract_first()
                self.lia_dic["niandu"] = html_par.xpath(base + '/div[1]/text()[1]').extract_first()   # year
                self.lia_dic["gongli"] = html_par.xpath(base + '/div[1]/text()[2]').extract_first()   # mileage
                self.lia_dic["xianjia"] = html_par.xpath(base + '/div[2]/p/text()').extract_first()   # current price
                self.lia_dic["yuanjia"] = html_par.xpath(base + '/div[2]/em/text()').extract_first()  # original price
                self.li_list.append(self.lia_dic)
        with open('guazi_list_info.txt', 'w+') as f:
            for li in self.li_list:
                f.write(str(li) + '\n')

    def run_info_scraler(self):
        """Full detail-page pipeline: listing pages -> detail links -> detail data."""
        self.crawler_list_fun()
        self.get_info_url()
        self._info_data()

    def run_list_scraler(self):
        """Listing-page pipeline: fetch listing pages, then parse and save their fields."""
        self.crawler_list_fun()
        self.get_list_data()
if __name__ == "__main__":
    # Guarded entry point: importing this module must not trigger a crawl.
    gc = GuaziCrawler()
    gc.run_info_scraler()  # scrape detail pages
    #gc.run_list_scraler()  # scrape listing pages instead
經過以上步驟即完成腳本編寫。本腳本僅供大家學習使用,請勿進行大量爬取。