2019叮噹貓雙11店鋪預售統計

1. 獲取店鋪的商品列表

import os
import re
import time
import random
import logging
import pathlib
import requests
from lxml import etree
from pymongo import MongoClient

# Shop to crawl and the logged-in session cookie sent with every request.
# Both can be overridden through the environment (TMALL_SHOP_ID / TMALL_COOKIE)
# so that switching shops or refreshing an expired session needs no source
# edit; the literals below remain the defaults.
# NOTE(review): a session cookie is a credential -- prefer supplying it via
# TMALL_COOKIE and keeping the real value out of version control.
shopId = int(os.environ.get('TMALL_SHOP_ID', 6))
cookie = os.environ.get('TMALL_COOKIE', 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; tk_trace=1; dnk=pengjun%5Cu674E; uc1=cookie14=UoTaEcMIdvG%2F2Q%3D%3D&lng=zh_CN&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&existShop=false&cookie21=URm48syIYn73&tag=8&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; uc3=vt3=F8dByuK6XCe%2FyobG9RM%3D&nk2=E6EQ1CLKS%2FnL&id2=VWeT3jqq6jDz&lg2=W5iHLLyFOGW7aA%3D%3D; tracknick=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; uc4=id4=0%40V8Zo2exYFQXrRTZfa2A8fWgCJ%2B0%3D&nk4=0%40EbhmhLlrKdq9uf0H4heNaPV%2BwIo%3D; lgc=pengjun%5Cu674E; csg=b9af8b15; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; whl=-1%260%260%260; cq=ccp%3D1; swfstore=171740; _bl_uid=Oekw81wkrjto8X9ddpwz59Lo2byd; pnm_cku822=; _m_h5_tk=b6a19e8985356b467ddaa2fba0d073e9_1571740652530; _m_h5_tk_enc=715e1141f3a7515e95f7ff83e824eca0; isg=BH19CXaG_C6KPlgX7xg1CrZpjNnbCuFDleKNlz_CuVQDdp2oB2rBPEsgIOqVdskk; l=dBjfAG8qqFtnSVAFBOCwourza77OSIRAguPzaNbMi_5B16L1Dn7OkZk0OFp6VjWftt8B4-YhSFe9-etkid-Jth7djawTBxDc.')

# Configure root logging once: timestamp, level, logger name, message and the
# emitting file:line.  The time part is spelled out as %H:%M:%S instead of %T
# because %T is not among the strftime directives Python documents as
# portable across platforms.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)

# Crawl settings for each supported shop, keyed by shop id.  All six entries
# share one URL shape and differ only in sub-domain, JSONP callback name and
# widget id, so the table is generated from those three values.  The trailing
# "{}" left in every list_url is the page-number placeholder.
_LIST_URL_TEMPLATE = (
    "https://{shop}.tmall.com/i/asynSearch.htm?callback={callback}"
    "&mid=w-{widget}-0&wid={widget}&path=/search.htm&search=y&pageNo={{}}"
)
_SHOP_PARAMS = (
    # (shop id, sub-domain / shop name, JSONP callback, widget id)
    (1, "purcotton", "jsonp693", "14440378953"),
    (2, "miansen", "jsonp363", "16800593356"),
    (3, "zichu", "jsonp363", "14977327192"),
    (4, "babycaremy", "jsonp125", "14913709402"),
    (5, "jianrou", "jsonp125", "16603479881"),
    (6, "qingshenghuorh", "jsonp116", "14896201470"),
)
shop_list = {
    shop_id: {
        "shop_name": name,
        "list_url": _LIST_URL_TEMPLATE.format(shop=name, callback=callback, widget=widget),
        "referer": "https://{}.tmall.com/search.htm".format(name),
    }
    for shop_id, name, callback, widget in _SHOP_PARAMS
}

# Headers sent with every list-page request: a desktop Chrome user agent, the
# selected shop's search page as referer, and the session cookie.
# 'br' is deliberately not advertised in accept-encoding: requests/urllib3
# only decode Brotli when an optional brotli package is installed, and an
# undecoded body would never match the JSONP pattern applied to the response.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Referer' : shop_list[shopId]['referer'],
    'Cookie' : cookie,
    'accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding' : 'gzip, deflate',
    'accept-language' : 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'
}

# Directory holding today's cached pages for the selected shop:
#   D:\scrapy\double11\<YYYY-MM-DD>\<shop_name>
now = time.time()
date_str = time.strftime('%Y-%m-%d', time.localtime(now))
# Raw string: "\s", "\d" and "\{" are invalid escape sequences in a normal
# literal (currently a warning); the resulting path text is unchanged.
shop_dir = r"D:\scrapy\double11\{}\{}".format(date_str, shop_list[shopId]['shop_name'])
# exist_ok replaces the separate exists() test and its check-then-create race.
os.makedirs(shop_dir, exist_ok=True)

# Pulls the payload out of a JSONP wrapper of the form jsonp<digits>("...").
# DOTALL lets the captured payload span line breaks.
search_pattern = re.compile(r'jsonp\d*\(\"(.*?)\"\)', flags=re.DOTALL)

# Open the MongoDB collection that stores the goods documents.
_mongo_target = {"host": "172.16.250.238", "port": 27017}
m = MongoClient(**_mongo_target)
test_db = m["test"]
db = test_db['tmallGoodsEntity']

# Mark every stored item of this shop as disabled before crawling; items that
# are seen again get re-enabled, so whatever stays disabled is treated as
# delisted.
db.update_many(filter={"shopId": shopId}, update={'$set': {"enabled": False}})

goods_list = []  # ids of every goods item seen during this run

# Pagination state.  At most 15 network requests are issued per run so the
# crawler is less likely to trip the site's anti-crawling checks.
currentPage = 1
hasNext = True
requests_count = 15

# Encoding used both to decode responses and for the on-disk page cache.
charSet = "gbk"

# One HTTP session shared by all page requests.
s = requests.Session()

while hasNext and requests_count > 0:
    logger.info("開始爬取第 {} 頁".format(currentPage))
    # Explicit "\\": the previous "\page_..." relied on an invalid escape.
    pageFilePath = shop_dir + "\\page_{}.txt".format(currentPage)
    pagePath = pathlib.Path(pageFilePath)

    # Reset for every page.  Without this, a response that does not match the
    # JSONP pattern either raised NameError (first page) or silently
    # re-processed the previous page's HTML.
    html_str = None
    if pagePath.exists():
        logger.info("使用本地文件。。。")
        with open(pageFilePath, 'r', encoding=charSet) as f:
            html_str = f.read()
    else:
        logger.info("發送網絡請求。。。")
        url = shop_list[shopId]['list_url'].format(currentPage)
        # Random 1-4 s pause between requests.
        time.sleep(1 + random.randint(0, 3))
        requests_count = requests_count - 1
        try:
            # A timeout keeps a stalled connection from hanging the run.
            response = s.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            # Only network errors end the crawl here; KeyboardInterrupt and
            # programming errors are no longer swallowed.
            logger.exception("request for page %s failed", currentPage)
            break

        ret_str = response.content.decode(charSet, "ignore")
        # Strip the escaped quotes that would otherwise break HTML parsing.
        format_str = ret_str.replace('=\\\"', '=')
        format_str = format_str.replace('\\\" ', ' ')
        format_str = format_str.replace('\\\">', '>')
        search_ret = search_pattern.search(format_str)
        if search_ret:
            html_str = search_ret.group(1)
            # Cache the page so a re-run on the same day needs no request.
            with open(pageFilePath, 'w', encoding=charSet) as f:
                f.write(html_str)
        else:
            logger.warning("page %s: response did not match the JSONP pattern; skipping",
                           currentPage)

    if html_str:
        # Build the element tree for XPath queries.
        html = etree.HTML(html_str)

        # Stop after this page when there is no "next page" link.
        # NOTE(review): the literals "下一頁" and "評價: " below must match the
        # page text exactly -- confirm Simplified vs Traditional characters
        # against a live response.
        next_page = html.xpath('//p[@class="ui-page-s"]/a[@title="下一頁"]')
        if not next_page:
            hasNext = False

        # One <dl class="item"> per goods item; the position() filter drops
        # the last rows of the listing container.
        items = html.xpath('//div[@class="J_TItems"]/div[(@class="item5line1" or @class="item4line1") and position()<last()-2]/dl[@class="item"]')
        for item in items:
            sku_id = int(item.xpath('./@data-id')[0])
            img_url = item.xpath('./dt[@class="photo"]/a[@class="J_TGoldData"]/img/@data-ks-lazyload')[0]
            item_name = item.xpath('./dd[@class="detail"]/a[@class="item-name"]/text()')[0].strip()
            attr_item = item.xpath('./dd[@class="detail"]/div[@class="attribute"]')[0]

            sku_price_str = attr_item.xpath('./div[@class="cprice-area"]/span[@class="c-price"]/text()')[0].strip()
            # Sales and rating counts may be absent; they then default to 0.
            sale_list = attr_item.xpath('./div[@class="sale-area"]/span[@class="sale-num"]/text()')
            sale_count_str = sale_list[0].strip() if sale_list else 0
            rate_list = item.xpath('./dd[@class="rates"]/div/h4/a/span/text()')
            rate_count_str = rate_list[0].strip().replace("評價: ", "") if rate_list else 0

            fields = {
                'imgUrl': img_url,
                'title': item_name,
                'price': float(sku_price_str),
                'totalSaleCount': int(sale_count_str),
                'rateCount': int(rate_count_str),
                'enabled': True,
                'updateTime': int(now),
            }
            # Single atomic upsert instead of find_one + insert/update:
            # shopId is written only when the document is first created,
            # exactly as the separate insert branch did.
            db.update_one(
                {'_id': sku_id},
                {'$set': fields, '$setOnInsert': {'shopId': shopId}},
                upsert=True,
            )
            goods_list.append(sku_id)
    currentPage = currentPage + 1
print(goods_list)

# 數據示例,需關注字段類型
"""
{
    "_id": 554949386593, 
    "imgUrl": "//img.alicdn.com/bao/uploaded/i3/430490406/O1CN01MfS3gC1ErzMAMswd5_!!0-item_pic.jpg_180x180.jpg", 
    "title": "全棉時代擦臉巾洗臉巾女一次性潔面巾棉柔純棉抽取式實惠盒裝紙巾", 
    "price": 89.9, 
    "totalSaleCount": 359972, 
    "rateCount": 68780, 
    "enabled": true, 
    "updateTime": 1571987992, 
    "shopId": 1
}
"""

2. 獲取店鋪的商品數據

import os
import re
import json
import time
import random
import logging
import pathlib
import requests
from pymongo import MongoClient

# Shop to crawl and the logged-in session cookie sent with every request.
# Both can be overridden through the environment (TMALL_SHOP_ID / TMALL_COOKIE)
# so that switching shops or refreshing an expired session needs no source
# edit; the literals below remain the defaults.
# NOTE(review): a session cookie is a credential -- prefer supplying it via
# TMALL_COOKIE and keeping the real value out of version control.
shopId = int(os.environ.get('TMALL_SHOP_ID', 6))
cookie = os.environ.get('TMALL_COOKIE', 'cna=84DqFV4SPyYCATo9kTLfT6u+; t=aa42477f58c7f2322f00dfb5a1eb3ecc; _tb_token_=7e51fd1e5e1e7; cookie2=156089853e8f3a6eeb0f7920d1963fc3; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; tk_trace=1; dnk=pengjun%5Cu674E; uc1=cookie14=UoTaEcMIdvG%2F2Q%3D%3D&lng=zh_CN&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&existShop=false&cookie21=URm48syIYn73&tag=8&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; uc3=vt3=F8dByuK6XCe%2FyobG9RM%3D&nk2=E6EQ1CLKS%2FnL&id2=VWeT3jqq6jDz&lg2=W5iHLLyFOGW7aA%3D%3D; tracknick=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; uc4=id4=0%40V8Zo2exYFQXrRTZfa2A8fWgCJ%2B0%3D&nk4=0%40EbhmhLlrKdq9uf0H4heNaPV%2BwIo%3D; lgc=pengjun%5Cu674E; csg=b9af8b15; enc=HPXzwVBtnTh2ZKD7IdgorhLo07qNH2rA9jqbXScJDYdMLIFeET66f7y07GgZfiMfpKBC%2BItvWd2MLhSwCstmeA%3D%3D; whl=-1%260%260%260; cq=ccp%3D1; swfstore=171740; _bl_uid=Oekw81wkrjto8X9ddpwz59Lo2byd; pnm_cku822=; _m_h5_tk=b6a19e8985356b467ddaa2fba0d073e9_1571740652530; _m_h5_tk_enc=715e1141f3a7515e95f7ff83e824eca0; l=dBjfAG8qqFtnSY8MBOCNqQKXiCQOSIRAguSJGwSBi_5aX6L6_x7OkZlPoFp6VjWftt8B4-YhSFe9-etkid-Jth7djawTBxDc.; isg=BAwM2l0Hve3W56lEVtfUqc-u3WqSVeDAnEVcZGbNGLda8az7jlWAfwJHkbnsuehH')

# Configure root logging once: timestamp, level, logger name, message and the
# emitting file:line.  The time part is spelled out as %H:%M:%S instead of %T
# because %T is not among the strftime directives Python documents as
# portable across platforms.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)

# Supported shops keyed by shop id (ids are assigned 1..6 in listing order).
# Only the shop name is needed here; it names the cache directory.
_SHOP_NAMES = (
    "purcotton",
    "miansen",
    "zichu",
    "babycare",
    "jianrou",
    "qingshenghuorh",
)
shop_list = {
    shop_id: {"shop_name": name}
    for shop_id, name in enumerate(_SHOP_NAMES, start=1)
}

# Base headers for every detail request: a desktop Chrome user agent and the
# session cookie.
# 'br' is deliberately not advertised in accept-encoding: requests/urllib3
# only decode Brotli when an optional brotli package is installed, and an
# undecoded body would never match the JSONP pattern applied to the response.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Cookie' : cookie,
    'accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding' : 'gzip, deflate',
    'accept-language' : 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'
}

# Directory holding today's cached responses for the selected shop:
#   D:\scrapy\double11\<YYYY-MM-DD>\<shop_name>
now = time.time()
date_str = time.strftime('%Y-%m-%d', time.localtime(now))
# Raw string: "\s", "\d" and "\{" are invalid escape sequences in a normal
# literal (currently a warning); the resulting path text is unchanged.
shop_dir = r"D:\scrapy\double11\{}\{}".format(date_str, shop_list[shopId]['shop_name'])
# exist_ok replaces the separate exists() test and its check-then-create race.
os.makedirs(shop_dir, exist_ok=True)

# Open the MongoDB collection that stores the goods documents.
_mongo_target = {"host": "172.16.250.238", "port": 27017}
m = MongoClient(**_mongo_target)
test_db = m["test"]
db = test_db['tmallGoodsEntity']

# Cursor over every stored goods document of the selected shop.
goods_list = db.find(filter={"shopId": shopId})

# Encoding used both to decode responses and for the on-disk cache files.
charSet = "gbk"

# One HTTP session shared by all detail requests.
s = requests.Session()

# Captures the JSON argument of the trailing "setMdskip (...)" JSONP call.
search_pattern = re.compile(r'setMdskip\s\((.*?)\)$', re.S)

# Detail endpoint; the two placeholders are the item id and a millisecond
# timestamp.  The referer imitates the item's detail page.
info_url = "https://mdskip.taobao.com/core/initItemDetail.htm?isUseInventoryCenter=false&cartEnable=false&service3C=false&isApparel=false&isSecKill=false&tmallBuySupport=true&isAreaSell=false&tryBeforeBuy=false&offlineShop=false&itemId={}&showShopProm=false&isPurchaseMallPage=false&isRegionLevel=false&household=false&sellerPreview=false&queryMemberRight=true&addressLevel=2&isForbidBuyItem=false&callback=setMdskip&timestamp={}"
referer_base= "https://detail.tmall.com/item.htm?id={}"

for goods in goods_list:
    goods_id = goods["_id"]
    logger.info("開始爬取商品: {}".format(goods_id))
    itemFilePath = shop_dir + "\\info_{}.txt".format(goods_id)
    pagePath = pathlib.Path(itemFilePath)

    # Reset for every item.  Without this, a response that does not match the
    # pattern either raised NameError (first item) or silently re-applied the
    # previous item's JSON to this item.
    json_str = None
    if pagePath.exists():
        logger.info("使用本地文件。。。")
        with open(itemFilePath, 'r', encoding=charSet) as f:
            json_str = f.read()
    else:
        logger.info("發送網絡請求。。。")
        # Random 1-4 s pause between requests.
        time.sleep(1 + random.randint(0, 3))
        headers['Referer'] = referer_base.format(goods_id)
        now = time.time()
        url = info_url.format(goods_id, int(round(now * 1000)))
        try:
            # A timeout keeps a stalled connection from hanging the run.
            response = s.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            # Only network errors end the crawl here; KeyboardInterrupt and
            # programming errors are no longer swallowed.
            logger.exception("request for goods %s failed", goods_id)
            break

        ret_str = response.content.decode(charSet, "ignore")
        search_ret = search_pattern.search(ret_str)
        if search_ret:
            json_str = search_ret.group(1)
            # Cache the payload so a re-run on the same day needs no request.
            with open(itemFilePath, 'w', encoding=charSet) as f:
                f.write(json_str)

    if not json_str:
        # No usable payload: stop the whole run, as before.
        logger.warning("goods %s: no usable detail payload; stopping", goods_id)
        break

    json_obj = json.loads(json_str)
    default_model = json_obj['defaultModel']

    # Monthly sales may be missing; '-' marks that case.
    sell_count_do = default_model['sellCountDO']
    monthSaleCount = sell_count_do['sellCount'] if 'sellCount' in sell_count_do else '-'
    totalQuantity = default_model['inventoryDO']['totalQuantity']
    price_info = default_model['itemPriceResultDO']['priceInfo']

    if 'def' in price_info:
        def_info = price_info['def']
    else:
        # No 'def' entry: fall back to the last entry, as before.  Starting
        # from None keeps an empty priceInfo from raising NameError or reusing
        # the previous item's price data.
        def_info = None
        for key in price_info:
            def_info = price_info[key]
    if not def_info:
        continue

    # Fields written for every item, pre-sale or not.
    fields = {
        'monthSaleCount': monthSaleCount,
        'totalQuantity': int(totalQuantity),
        'updateTime': int(now),
    }
    if 'wrtInfo' in def_info:
        # Pre-sale item: record deposit, reservation count and projected total.
        wrt_info = def_info['wrtInfo']
        pre_count = int(wrt_info['groupUC'])
        fields['originalPrice'] = float(def_info['promotionList'][0]['price'])
        fields['preSaleCash'] = int(wrt_info['price']) / 100
        fields['preSaleCount'] = pre_count
        fields['preSale'] = True
        fields['preSaleTotal'] = goods["price"] * pre_count
    else:
        fields['originalPrice'] = float(def_info['price'])
        fields['preSale'] = False
    db.update_one({'_id': int(goods_id)}, {'$set': fields})


# 數據示例,需關注字段類型
"""
{
    "_id": 20159694203, 
    "imgUrl": "//img.alicdn.com/bao/uploaded/i4/430490406/O1CN01BzX29B1ErzMxI791m_!!0-item_pic.jpg_180x180.jpg", 
    "title": "全棉時代 產婦一次性內褲女士純棉孕婦產後月子待產用品旅行 25條", 
    "price": 96, 
    "totalSaleCount": 631868, 
    "rateCount": 40631, 
    "enabled": true, 
    "updateTime": 1571993200, 
    "shopId": 1, 
    "monthSaleCount": "2.5萬+", 
    "originalPrice": 194, 
    "preSale": true, 
    "preSaleCash": 10, 
    "preSaleCount": 26135, 
    "preSaleTotal": 2508960, 
    "totalQuantity": 18423
}
"""

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章