前不久需要一批手機數據做測試,所以就爬取了ZOL上關於手機的各項參數,現在把代碼分享出來,希望大家能夠多提改進意見。
ZOL手機信息
想要抓取ZOL關於手機的信息需要三個步驟:
手機商城列表頁 → 單個手機詳情頁 → 當前手機更多詳情頁面
爬蟲代碼
# -*- coding: gbk -*-
from scrapy.spiders import CrawlSpider
import scrapy
from urllib.parse import urljoin
class PhoneSpider(CrawlSpider):
    """Spider that collects mobile-phone parameters from detail.zol.com.cn.

    The crawl follows three hops: paginated list page -> single phone page ->
    the phone's full parameter page. For every parameter page one line is
    appended to ``output_file``: the phone title followed by the values of
    the parameters named in ``detail_fields``, comma-separated.

    NOTE(review): no ``rules`` are defined in this class, so a plain
    ``scrapy.Spider`` would presumably suffice -- confirm before changing
    the base class.
    """

    name = "phone"
    allowed_domains = ["detail.zol.com.cn"]

    # List pages 1..page_count are requested at start-up.
    page_count = 30
    list_url_template = (
        'http://detail.zol.com.cn/cell_phone_index/subcate57_list_{page}.html'
    )

    # Output file (opened in append mode) and the encoding used to write it.
    # The encoding is explicit so the result does not depend on the locale.
    output_file = 'phone_details.csv'
    output_encoding = 'utf-8'

    # Parameter labels to export, in output-column order. A row is kept only
    # when its header text equals one of these labels exactly.
    # NOTE(review): confirm these labels use the same script (Simplified vs
    # Traditional Chinese) as the text actually served by the site.
    detail_fields = (
        '上市日期',
        '出廠系統內核',
        '主屏分辨率',
        'CPU型號',
        'GPU型號',
        '電池容量',
        '質保時間',
        '手機重量',
    )

    def start_requests(self):
        """Yield one request per list page, numbered 1 to ``page_count``."""
        for page in range(1, self.page_count + 1):
            yield scrapy.Request(
                self.list_url_template.format(page=page),
                self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Handle a list page: follow the link of every phone entry.

        The link's ``title`` attribute travels with the request in
        ``meta['title']``.
        """
        entries = response.css('div.pic-mode-box').css('ul li')
        for entry in entries:
            link = entry.css('h3 a[href]')
            href = link.css('a::attr(href)').extract_first()
            if not href:
                # urljoin() with an empty URL returns the list page itself,
                # which would then be fetched again with the wrong callback.
                continue
            yield scrapy.Request(
                urljoin(response.url, href),
                self.get_phone_page,
                dont_filter=False,
                meta={
                    'title': link.css('a::attr(title)').extract_first(),
                },
            )

    def get_phone_page(self, response):
        """Handle a single phone page: follow its "more parameters" link."""
        sections = response.css('div.section div.section-content')
        more_link = sections.css('a._j_MP_more')
        detail_url = more_link.css('a::attr(href)').extract_first()
        if not detail_url:
            # Without a link there is nothing to follow; do not re-request
            # the current page under a different callback.
            self.logger.debug('No parameter-page link on %s', response.url)
            return
        yield scrapy.Request(
            urljoin(response.url, detail_url),
            self.get_details,
            dont_filter=False,
            meta={
                'title': response.meta.get('title'),
            },
        )

    def get_details(self, response):
        """Handle a parameter page: append one comma-separated line to the file.

        Columns are the title followed by ``detail_fields`` in order; a
        parameter that is absent from the page is written as an empty string.
        """
        title = response.meta.get('title') or ''
        values = dict.fromkeys(self.detail_fields, '')

        for row in response.css('div.detailed-parameters').css('tr'):
            label = self._cell_text(row, 'th')
            if label not in values:
                continue
            text = self._cell_text(row, 'td')
            if text is not None:
                values[label] = self._csv_safe(text)

        columns = [self._csv_safe(title)]
        columns.extend(values[field] for field in self.detail_fields)
        write_line = ','.join(columns)

        # The with-block closes the file; no explicit close() is needed.
        with open(self.output_file, 'a', encoding=self.output_encoding) as f:
            f.write(write_line + '\n')
        print('Write : ' + write_line)

    @staticmethod
    def _cell_text(row, tag):
        """Return the first text node of the row's ``tag`` cell, or None.

        When the cell contains a link, the link's text is used instead.
        """
        scope = tag + ' a' if row.css(tag + ' a') else tag
        return row.css(scope + ' ::text').extract_first()

    @staticmethod
    def _csv_safe(text):
        """Replace commas with semicolons so a value cannot split a column."""
        return text.replace(',', ';')