1、創建Scrapy項目
scrapy startproject PosProductRedis
2、進入項目目錄,使用命令genspider創建Spider
scrapy genspider posproductredis XXXX.com
3、定義要抓取的數據(處理items.py文件)
# -*- coding: utf-8 -*-
import scrapy
class PosproductredisItem(scrapy.Item):
    """One POS product record: one specification (sku) of one on-sale item."""
    # row number in the listing table
    number_list = scrapy.Field()
    # item ID
    id_list = scrapy.Field()
    # merchant name
    qiye_list = scrapy.Field()
    # product category
    product_list = scrapy.Field()
    # product name
    product_name_list = scrapy.Field()
    # sales status
    sale_list = scrapy.Field()
    # sales title (may contain spaces)
    sales_title = scrapy.Field()
    # sales area
    sales_area = scrapy.Field()
    # specification (size)
    product_size = scrapy.Field()
    # minimum order quantity
    product_quantity = scrapy.Field()
    # retail price
    retail_price = scrapy.Field()
    # promotional retail price
    promotion_price = scrapy.Field()
    # the skuid could be stored too, but it is unused downstream
    # skuid = scrapy.Field()
4、編寫提取item數據的Spider(在spiders文件夾下:posproductredis.py)
# -*- coding: utf-8 -*-
# 利用scrapy_redis將pos後臺數據包含價格、規格、起訂量、銷售區域等信息全部保存到excel中
import scrapy
from PosProductRedis.items import PosproductredisItem
from scrapy_redis.spiders import RedisSpider
import re
class PosproductredisSpider(RedisSpider):
    """scrapy-redis distributed spider for the POS back-office.

    Logs in interactively, then walks the on-sale item list pages seeded
    through redis, following each item-detail page and each sku price page,
    and finally emits one fully populated ``PosproductredisItem`` per sku.
    """
    name = 'posproductredis'
    allowed_domains = ['XXXX.com']
    # The master seeds the queue, e.g.:
    # lpush PosproductredisSpider:start_urls https://pos.XXX.com/item/itemonlist.html?d-49489-p=1
    redis_key = "PosproductredisSpider:start_urls"
    login_page = "https://pos.XXXX.com/login.html"

    # Pre-compiled, loop-invariant regexes. Raw strings fix the original
    # non-raw "\d+" literal (an invalid escape sequence in modern Python).
    _page_re = re.compile(r"/?d-49489-p=(\d+)")
    _digits_re = re.compile(r"\d+")

    # Province lists used to recognise "nationwide" sales areas. The two
    # no-HMT variants differ only in the spelling of the last province.
    _AREA_NO_HMT = "北京市,天津市,河北省,山西省,內蒙古,遼寧省,吉林省,黑龍江省,上海市,江蘇省,浙江省,安徽省,福建省,江西省,山東省,河南省,湖北省,湖南省,廣東省,廣西,海南省,重慶市,四川省,貴州省,雲南省,西藏,陝西省,甘肅省,青海省,寧夏,新疆"
    _AREA_NO_HMT_ALT = "北京市,天津市,河北省,山西省,內蒙古,遼寧省,吉林省,黑龍江省,上海市,江蘇省,浙江省,安徽省,福建省,江西省,山東省,河南省,湖北省,湖南省,廣東省,廣西,海南省,重慶市,四川省,貴州省,雲南省,西藏,陝西省,甘肅省,青海省,寧夏,新疆省"
    _AREA_FULL = "北京市,天津市,河北省,山西省,內蒙古,遼寧省,吉林省,黑龍江省,上海市,江蘇省,浙江省,安徽省,福建省,江西省,山東省,河南省,湖北省,湖南省,廣東省,廣西,海南省,重慶市,四川省,貴州省,雲南省,西藏,陝西省,甘肅省,青海省,寧夏,新疆,臺灣省,香港,澳門"

    def start_requests(self):
        # Always visit the login page first; everything else needs a session.
        yield scrapy.Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Submit the login form with credentials read from the console."""
        self.username = input("請輸入賬號:")
        self.password = input("請輸入密碼:")
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"j_username": self.username, "j_password": self.password},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        """Check the post-login page and tell the operator what to push."""
        body = response.body.decode('utf-8')
        if "loginerror" in body:
            print("登錄失敗,錯誤的手機號或密碼!")
        if "</span>首頁" in body:
            print("歡迎您'%s',成功登錄POS管理系統!" % (self.username))
            print("請在slaver端(爬蟲程序執行端)輸入:lpush %s 爬取列表頁網址" % (self.redis_key))
        # After a successful login the list-page URLs arrive via redis.
        # yield scrapy.Request(response.url, callback=self.parse)

    def parse(self, response):
        """Parse one list page: queue item-detail requests and pagination."""
        items = []
        # Pagination links, relative to .../item/itemonlist.html
        next_url_list = response.xpath('//body//div//div/span/span[@class="paginate_button"]/a/@href').extract()
        for table in response.xpath('//div[@class="dataTables_wrapper"]'):
            number_list = table.xpath('.//td[1]/text()').extract()
            id_list = table.xpath('.//tbody//tr//td//input[@onclick="homeShow(this)"]/@value').extract()
            qiye_list = table.xpath('.//td[2]/text()').extract()
            product_list = table.xpath('.//td[4]/text()').extract()
            product_name_list = table.xpath('.//td[3]/a/text()').extract()
            for i in range(len(id_list)):
                item = PosproductredisItem()
                item['number_list'] = number_list[i].strip()
                item['id_list'] = id_list[i]
                item['qiye_list'] = qiye_list[i].strip()
                item['product_list'] = product_list[i].strip()
                item['product_name_list'] = product_name_list[i].strip()
                items.append(item)
        for item in items:
            id_url = "https://pos.XXXX.com/item/showitem.html?item.id=" + item['id_list']
            yield scrapy.Request(url=id_url, meta={'meta_1': item}, callback=self.parse_id)
        for url in next_url_list:
            page_no = self._page_re.search(url).group(1)
            print("第%s頁數據處理中...." % page_no)
            fullurl = 'https://pos.XXXX.com/item/itemonlist.html' + str(url)
            yield scrapy.Request(url=fullurl, callback=self.parse)

    def _area_label(self, sales_area):
        """Collapse the raw sales-area strings into one display label."""
        if len(sales_area) > 1:
            # Multiple areas are joined with semicolons.
            return ";".join(sales_area)
        if len(sales_area) == 1:
            if sales_area[0] in (self._AREA_NO_HMT, self._AREA_NO_HMT_ALT):
                return "全國(不含港澳臺)"
            if sales_area[0] == self._AREA_FULL:
                return "全國"
            return sales_area[0]
        return "無區域"

    def parse_id(self, response):
        """Parse an item-detail page and queue one price request per sku."""
        meta_1 = response.meta['meta_1']
        sales_title = response.xpath('//div[@id="tabs-1"]/p[8]/span[@class="field"]/text()').extract()
        sale_list = response.xpath('//div[@id="tabs-1"]/p[6]/span/text()').extract()
        sales_area = response.xpath('//div[@id="tabs-6"]/table/tbody[@id="review_list"]/tr/td[2]/text()').extract()
        product_size = response.xpath('//div[@id="tabs"]/div[@id="tabs-5"]/table/tbody/tr/td[1]/text()').extract()
        product_quantity = response.xpath('//div[@id="tabs"]/div[@id="tabs-5"]/table/tbody/tr/td[8]/text()').extract()
        # The href values look like javascript:show('688'); the digits are the skuid.
        skuid_list = response.xpath('//div[@id="tabs"]/div[@id="tabs-5"]/table/tbody/tr/td[9]/a/@href').extract()
        # The label is the same for every sku of this item: compute it once.
        area_label = self._area_label(sales_area)
        # Size and minimum order quantity are mandatory for on-sale items
        # and map 1:1, so product_size drives the loop.
        for i in range(len(product_size)):
            # BUG FIX: the original created ONE item before this loop and
            # mutated it on every iteration; since the yielded requests are
            # handled asynchronously, every parse_skuid callback saw the
            # same (last-written) size/quantity. A fresh item per sku fixes it.
            item = PosproductredisItem()
            item['product_size'] = product_size[i]
            item['product_quantity'] = product_quantity[i]
            item['sales_area'] = area_label
            item['sales_title'] = sales_title[0].strip()
            item['sale_list'] = sale_list[0].strip()
            for key in ('number_list', 'id_list', 'qiye_list',
                        'product_list', 'product_name_list'):
                item[key] = meta_1[key]
            skuid_number = self._digits_re.search(skuid_list[i]).group()
            skuid_url = "https://pos.XXXX.com/item/showitemprice.html?sku.id=" + skuid_number
            yield scrapy.Request(url=skuid_url, meta={'meta_2': item}, callback=self.parse_skuid)

    @staticmethod
    def _clean_prices(raw_prices, zero_label, bad_label):
        """De-duplicate a price list and replace the 0.0/0.00 placeholders.

        Returns the semicolon-joined result, or ``None`` for an empty list
        (the original code left the field unset in that case).
        """
        prices = list(set(raw_prices))
        for i in range(len(prices)):
            if prices[i] == "0.0":
                prices[i] = zero_label
            elif prices[i] == "0.00":
                prices[i] = bad_label
        # ";".join of a single element is that element, so this reproduces
        # the original two-branch (>1 / ==1) logic.
        return ";".join(prices) if prices else None

    def parse_skuid(self, response):
        """Parse a sku price page and emit the fully populated item."""
        meta_2 = response.meta['meta_2']
        item = PosproductredisItem()
        retail = self._clean_prices(
            response.xpath('//div[@id="tabs-1"]/table[@id="item"]/tbody/tr/td[2]/text()').extract(),
            '零售價待定', '零售價數據0.00有誤')
        if retail is not None:
            item['retail_price'] = retail
        promotion = self._clean_prices(
            response.xpath('//div[@id="tabs-1"]/table[@id="item"]/tbody/tr/td[3]/text()').extract(),
            '無促銷價', '促銷價數據0.00有誤')
        if promotion is not None:
            item['promotion_price'] = promotion
        # Copy everything gathered by the earlier callbacks.
        for key in ('number_list', 'id_list', 'qiye_list', 'product_list',
                    'product_name_list', 'sales_title', 'sale_list',
                    'product_size', 'product_quantity', 'sales_area'):
            item[key] = meta_2[key]
        yield item
5、處理pipelines管道文件保存數據,可將結果保存到文件中(pipelines.py)
# -*- coding: utf-8 -*-
import json
from openpyxl import Workbook
import time
# 轉碼操作
class MyEncoder(json.JSONEncoder):
    """JSON encoder that additionally accepts ``bytes`` (decoded as UTF-8)."""

    def default(self, o):
        # Decode raw bytes to text; defer everything else to the base class,
        # which raises TypeError for unsupported types.
        if isinstance(o, bytes):
            return o.decode('utf-8')
        return super().default(o)
class PosproductredisPipeline(object):
    """Collect items into an in-memory workbook and save it on close."""

    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row, in the same order as the fields written below.
        self.ws.append(['序號', 'ID', '商家名稱', '產品分類',
                        '產品名稱', '銷售標題', '銷售情況', '零售價',
                        '促銷價', '規格', '起訂量', '銷售區域'
                        ])

    def process_item(self, item, spider):
        # One worksheet row per item, column order matching the header.
        row = [item[key] for key in (
            'number_list', 'id_list', 'qiye_list', 'product_list',
            'product_name_list', 'sales_title', 'sale_list', 'retail_price',
            'promotion_price', 'product_size', 'product_quantity',
            'sales_area')]
        self.ws.append(row)
        return item

    def close_spider(self, spider):
        # Stamp the output file name with today's date (YYYY-MM-DD).
        today = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save("pos_product_redis" + today + '.xlsx')
        print("數據處理完成,謝謝使用!")
6、配置settings文件(settings.py)
# Use the scrapy-redis dupefilter instead of Scrapy's default one
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler instead of Scrapy's default one
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pause/resume: pending redis requests are kept across restarts
SCHEDULER_PERSIST = True
# When unset, scrapy-redis connects to the local redis server
# REDIS_HOST = "192.168.0.109"
# REDIS_PORT = 6379
# Default scrapy-redis request queue (a priority queue)
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# FIFO queue; NOTE(review): reportedly raised "Unhandled error in Deferred" here
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# LIFO (stack) queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
# Configure item pipelines: the local excel pipeline plus RedisPipeline
ITEM_PIPELINES = {
    'PosProductRedis.pipelines.PosproductredisPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Obey robots.txt rules; see https://blog.csdn.net/z564359805/article/details/80691677
ROBOTSTXT_OBEY = False
# Override the default request headers: add a User-Agent
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Optionally write the log to a local file
LOG_FILE = "posproductredis.log"
LOG_LEVEL = "DEBUG"
# Redirect stdout (including print output) into the log
LOG_STDOUT = True
7、參照以下鏈接打開redis數據庫:
https://blog.csdn.net/z564359805/article/details/80808155
8、以上設置完畢,進行爬取:進入到spiders文件夾下執行項目命令,啓動Spider:
scrapy runspider posproductredis.py
9、在Master端(核心服務器)的redis-cli輸入push指令,參考格式:
輸入:lpush PosproductredisSpider:start_urls https://pos.XXXX.com/item/itemonlist.html?d-49489-p=1