Python爬蟲系列之爬取某奢侈品小程序店鋪商品數據

Python爬蟲系列之爬取某奢侈品小程序店鋪商品數據

小程序爬蟲接單、app爬蟲接單、網頁爬蟲接單、接口定製、網站開發、小程序開發 > 點擊這裏聯繫我們 <

微信請掃描下方二維碼

在這裏插入圖片描述

代碼僅供學習交流,請勿用於非法用途

一、準備數據庫

# Create the schema used by the crawler and make it the current database.
create database zr;

use zr;

# Product table: one row per crawled product. `pid` holds the product id
# reported by the API and is unique; the crawler uses it to decide between
# insert and update. Every scraped value is stored as text.
create table zr_goodslist(
	id int primary key auto_increment comment 'id',
	pid varchar(30) unique comment 'pid',
	sku varchar(30) default null comment 'sku',
    name varchar(50) default null comment 'name',
    sellingPoint varchar(200) default null comment 'sellingPoint',
    # Filled from the `desc` field of the product detail (column name is spelled
    # `descption` and the crawler's SQL uses that spelling).
    descption text default null comment 'desc',
    # First image URL, and the full list of image URLs serialised as text.
    mainimg text default null comment 'mainimg',
    imageList text default null comment 'imageList',
    video text default null comment 'video',
    brand varchar(30) default null comment 'brand',
    status varchar(8) default null comment 'status',
    stock varchar(10) default null comment 'stock',
    source varchar(10) default null comment 'source',
    refDetail text default null comment 'refDetail',
    convert_size varchar(100) default null comment 'convert_size',
    marketPrice varchar(15) default null comment 'marketPrice',
    salePrice varchar(15) default null comment 'salePrice',
    price varchar(15) default null comment 'price',
    discount varchar(15) default null comment 'discount',
    marketingDesc varchar(300) default null comment 'marketingDesc',
    grade varchar(10) default null comment 'grade',
    brandType varchar(15) default null comment 'brandType',
    categoryOne varchar(20) default null comment 'categoryOne',
    categoryTwo varchar(20) default null comment 'categoryTwo',
    categoryThree varchar(20) default null comment 'categoryThree',
    viewNumStatus varchar(10) default null comment 'viewNumStatus',
    openBargain varchar(30) default null comment 'openBargain',
    directDesc text default null comment 'directDesc',
    degree text default null comment 'degree',
    degreeDesc text default null comment 'degreeDesc',
    degreeExt text default null comment 'degreeExt',
    coefficient text default null comment 'coefficient',
    firstPutOn varchar(50) default null comment 'firstPutOn',
    proc_view_num varchar(15) default null comment 'proc_view_num',
    correctNum varchar(15) default null comment 'correctNum',
    bargainBasePrice varchar(15) default null comment 'bargainBasePrice',
    onSale varchar(10) default null comment 'onSale',
    onSaleCountDown varchar(15) default null comment 'onSaleCountDown',
    bargainLock varchar(50) default null comment 'bargainLock',
    bargainDownTime varchar(35) default null comment 'bargainDownTime',
    isBargain varchar(10) default null comment 'isBargain',
    bargainPrice varchar(15) default null comment 'bargainPrice',
    bargainNum varchar(15) default null comment 'bargainNum',
    # NOTE(review): the remaining columns are not populated from the product
    # `detail` object by the crawler code shown below; presumably they come from
    # the `productAttr` part of the response - confirm against the full crawler.
    color_forming varchar(30) default null comment 'color_forming',
    tile_size varchar(30) default null comment 'tile_size',
    overall_weight varchar(30) default null comment 'overall_weight',
    size_prompt varchar(30) default null comment 'size_prompt',
    defect text default null comment 'defect',
    style text default null comment 'style',
    accessories text default null comment 'accessories',
    material text default null comment 'material',
    lengths text default null comment 'lengths',
    main_material text default null comment 'main_material',
    sizes text default null comment 'sizes',
    fabric text default null comment 'fabric'
)engine=INNODB charset=utf8;


二、代碼實現

# -*- coding:utf-8 -*-
import requests
from queue import Queue
import threading
import json
import MySQLdb
import configparser

# Module-level counter; no active code in view updates it.
totals = 0
cf = configparser.ConfigParser()
# ConfigParser.read() does NOT raise when the file is missing: it silently
# skips unreadable files and returns the list of files it actually parsed.
# The return value therefore has to be checked to detect a missing config.ini;
# exceptions only signal a file that exists but cannot be parsed/decoded.
try:
    _loaded_files = cf.read("config.ini")
except (configparser.Error, UnicodeDecodeError) as e:
    print(e)
    _loaded_files = []
if not _loaded_files:
    print("程序目錄下不存在config.ini配置文件~")
    exit(0)


def getConf(sec, key):
    """Return the value stored under ``key`` in section ``sec`` of the config.

    When the lookup fails for any reason, the section/key pair is printed and
    the program terminates via ``exit(0)``.
    """
    try:
        value = cf.get(sec, key)
    except Exception:
        print("".join(("未得到以下配置:", sec, " - ", key)))
        exit(0)
    else:
        return value

# -------------------------------------------------
# Runtime settings. Values read through getConf() come from config.ini;
# getConf() terminates the program when a key cannot be read.
# Number of spider threads started by main()
threadNums = int(getConf("app-sys", "threadNums"))
# Number of attempts postHtml()/getHtml() make per request
retry = 3
# Value passed as the `timeout` argument of every requests call
timeout = 20
# Database account
mysql_user = getConf("Mysql-Database", "user")
# Database password
mysql_password = getConf("Mysql-Database", "password")
# Database name
mysql_database = getConf("Mysql-Database", "database")
# Table name
mysql_table = getConf("Mysql-Database", "table")
# HTTP headers sent with every request
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; DUK-AL20 Build/LMY48Z; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Safari/537.36 MicroMessenger/7.0.10.1580(0x27000A59) Process/appbrand3 NetType/WIFI Language/zh_CN ABI/arm32",
    "content-type": "application/json;charset=utf-8",
}
# Prefix prepended to the image paths returned in a product's imageList
host = "https://img.*******.com/"
# Shared list intended for collected attribute names; no active code in view
# appends to it.
attrsList = []


class zrSpider(threading.Thread):
    """Worker thread that crawls brands from a shared queue into MySQL.

    Each worker takes brand dicts from ``brandQueue``, pages through the
    brand's product list, fetches the detail of every product and inserts or
    updates the matching row (keyed by ``pid``) in the configured table.

    Args:
        brandQueue: queue of brand dicts; each dict must carry an ``id``.
        index: worker number, only used in the start-up message.
    """

    _LIST_URL = "https://search.*******.com/V4.7.0/product/list"
    _DETAIL_URL = "https://api.*******.com/V5.3.0/product/newDetail"

    # Items requested per listing page; also used to derive the page count.
    _PAGE_SIZE = 20

    # Parameters appended to every request payload.
    _COMMON_PARAMS = {
        "version": "5.3.0",
        "debug": "false",
        "mt": "WX-micro",
        "inWechat": 1,
        "from": "micro",
        "deviceId": "deviceId",
    }

    # Table columns filled from the product ``detail`` object, in the order
    # the row values are produced by getGoodsDetail().
    _DETAIL_COLUMNS = (
        "pid", "sku", "name", "sellingPoint", "descption", "mainimg",
        "imageList", "video", "brand", "status", "stock", "source",
        "refDetail", "convert_size", "marketPrice", "salePrice", "price",
        "discount", "marketingDesc", "grade", "brandType", "categoryOne",
        "categoryTwo", "categoryThree", "viewNumStatus", "openBargain",
        "directDesc", "degree", "degreeDesc", "degreeExt", "coefficient",
        "firstPutOn", "proc_view_num", "correctNum", "bargainBasePrice",
        "onSale", "onSaleCountDown", "bargainLock", "bargainDownTime",
        "isBargain", "bargainPrice", "bargainNum",
    )

    # Remaining table columns. NOTE(review): presumably derived from the
    # response's ``productAttr`` list, but that mapping is not part of this
    # file, so getGoodsDetail() does not produce values for them. The SQL
    # builders accept rows with or without these trailing values.
    _ATTR_COLUMNS = (
        "color_forming", "tile_size", "overall_weight", "size_prompt",
        "defect", "style", "accessories", "material", "lengths",
        "main_material", "sizes", "fabric",
    )

    _COLUMNS = _DETAIL_COLUMNS + _ATTR_COLUMNS

    # Columns whose key in the ``detail`` object differs from the column name.
    _DETAIL_KEYS = {"pid": "id", "descption": "desc"}

    def __init__(self, brandQueue, index, *args, **kwargs):
        super(zrSpider, self).__init__(*args, **kwargs)
        self.brandQueue = brandQueue
        self.index = index

    # ------------------------------------------------------------ helpers

    @classmethod
    def _listPayload(cls, brandId, page):
        """Build the request payload for one page of a brand's product list."""
        payload = {
            "page": page,
            "pageSize": cls._PAGE_SIZE,
            "sort": "",
            "ppath": "4:" + str(brandId),
            "newShare": 0,
            "selfbiz": 1,
        }
        payload.update(cls._COMMON_PARAMS)
        return payload

    @classmethod
    def _pageCount(cls, count):
        """Return the number of pages needed for ``count`` items (ceiling)."""
        return -(-count // cls._PAGE_SIZE)

    @staticmethod
    def _pick(container, key):
        """Return ``container[key]``, or ``""`` when the lookup is impossible."""
        try:
            return container[key]
        except (KeyError, IndexError, TypeError):
            return ""

    @classmethod
    def _buildRow(cls, detail, imgHost):
        """Turn a product ``detail`` object into a list of column values.

        The result has one entry per name in ``_DETAIL_COLUMNS``; a field
        missing from ``detail`` becomes ``""``. Image paths are prefixed with
        ``imgHost``; the image list is stored as a JSON array string.
        """
        try:
            mainImg = imgHost + detail['imageList'][0]
        except (KeyError, IndexError, TypeError):
            mainImg = ""
        try:
            allImgs = json.dumps([imgHost + image for image in detail['imageList']],
                                 ensure_ascii=False)
        except (KeyError, IndexError, TypeError):
            allImgs = ""

        row = []
        for column in cls._DETAIL_COLUMNS:
            if column == "mainimg":
                row.append(mainImg)
            elif column == "imageList":
                row.append(allImgs)
            else:
                row.append(cls._pick(detail, cls._DETAIL_KEYS.get(column, column)))
        return row

    @classmethod
    def _insertStatement(cls, table, data):
        """Return ``(sql, params)`` inserting ``data`` into ``table``.

        Values are paired positionally with ``_COLUMNS``; a shorter row simply
        leaves the trailing columns at their defaults. Values are bound as
        parameters (converted to ``str`` as before) instead of being
        interpolated into the SQL text.
        """
        params = [str(value) for value in data[:len(cls._COLUMNS)]]
        columns = cls._COLUMNS[:len(params)]
        sql = "insert into %s (%s) values (%s)" % (
            table, ", ".join(columns), ", ".join(["%s"] * len(params)))
        return sql, params

    @classmethod
    def _updateStatement(cls, table, data):
        """Return ``(sql, params)`` updating the row whose pid is ``data[0]``.

        Raises:
            ValueError: when ``data`` holds no value besides the pid.
        """
        values = [str(value) for value in data[:len(cls._COLUMNS)]]
        if len(values) < 2:
            raise ValueError("update needs the pid and at least one column value")
        columns = cls._COLUMNS[1:len(values)]
        assignments = ", ".join(column + " = %s" for column in columns)
        sql = "update %s set %s where pid = %%s" % (table, assignments)
        return sql, values[1:] + [values[0]]

    def _runSql(self, sql, params, fetch=False):
        """Execute one statement on a fresh connection and always close it.

        Returns the fetched rows when ``fetch`` is true; otherwise commits and
        returns ``None``.
        """
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password,
                               database=mysql_database, charset='utf8')
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql, params)
                if fetch:
                    return cursor.fetchall()
                conn.commit()
                return None
            finally:
                cursor.close()
        finally:
            conn.close()

    # ------------------------------------------------------------ crawling

    def getGoodsList(self, brandId, page):
        """Return the product list of one listing page, or ``None`` on failure."""
        resp = postHtml(self._LIST_URL, self._listPayload(brandId, page))
        if not resp:
            return None
        try:
            return resp['data']['list']
        except (KeyError, IndexError, TypeError):
            return None

    def getGoodsDetail(self, id):
        """Fetch one product and return its values in ``_DETAIL_COLUMNS`` order.

        Returns ``None`` when the request fails, the response code is not
        ``100000``, or the response lacks ``detail`` / ``productAttr``.
        """
        payload = {"id": str(id)}
        payload.update(self._COMMON_PARAMS)
        resp = postHtml(self._DETAIL_URL, payload)
        if not resp:
            return None
        try:
            if str(resp['code']) != "100000":
                return None
            body = resp['data']
            detail = body['detail']
            # The original required productAttr to be present; keep that
            # precondition even though its contents are not mapped here.
            if 'productAttr' not in body:
                return None
        except (KeyError, IndexError, TypeError):
            return None
        return self._buildRow(detail, host)

    def getTotalPage(self, brandId):
        """Return how many listing pages the brand has (1 if it is unknown)."""
        resp = postHtml(self._LIST_URL, self._listPayload(brandId, 1))
        if resp:
            try:
                return self._pageCount(int(resp['data']['count']))
            except (KeyError, IndexError, TypeError, ValueError):
                pass
        return 1

    # ------------------------------------------------------------ storage

    def pipLine(self, data):
        """Insert one product row; database errors are printed, not raised."""
        print("------------------------- insert ------------------------- ")
        print(data)
        print("---------------------------------------------------------- ")
        if not data:
            return
        try:
            sql, params = self._insertStatement(mysql_table, data)
            self._runSql(sql, params)
        except Exception as e:
            # Best-effort boundary: one bad row must not stop the worker.
            print(e)

    def checkGoodsExists(self, pid):
        """Return True when a row with this ``pid`` is already stored.

        A database error is printed and reported as "not stored".
        """
        sql = "select 1 from " + mysql_table + " where pid = %s limit 1"
        try:
            return bool(self._runSql(sql, (str(pid),), fetch=True))
        except Exception as e:
            print(e)
            return False

    def update(self, data):
        """Update the stored row whose pid is ``data[0]``; errors are printed."""
        print("------------------------- update ------------------------- ")
        print(data)
        print("---------------------------------------------------------- ")
        if not data:
            return
        try:
            sql, params = self._updateStatement(mysql_table, data)
            self._runSql(sql, params)
        except Exception as e:
            # Best-effort boundary: one bad row must not stop the worker.
            print(e)

    def run(self):
        """Process brands until the shared queue is drained."""
        from queue import Empty

        print("線程:%d 啓動~" % self.index)
        if self.brandQueue is None:
            return
        while True:
            # get_nowait() instead of empty()+get(): another worker may take
            # the last item between the two calls, leaving get() blocked.
            try:
                brand = self.brandQueue.get_nowait()
            except Empty:
                break
            try:
                brand_id = str(brand['id'])
            except (KeyError, IndexError, TypeError):
                continue
            totalPage = self.getTotalPage(brand_id)
            for page in range(1, totalPage + 1):
                goodsList = self.getGoodsList(brand_id, page)
                if not goodsList:
                    continue
                for goods in goodsList:
                    try:
                        goodsId = goods['id']
                    except (KeyError, IndexError, TypeError):
                        continue
                    datas = self.getGoodsDetail(goodsId)
                    if not datas:
                        # Detail unavailable: nothing to store for this product.
                        continue
                    if self.checkGoodsExists(goodsId):
                        self.update(datas)
                    else:
                        self.pipLine(datas)


def postHtml(url, data):
    """POST ``data`` as a JSON body and return the decoded JSON response.

    The request is attempted up to ``retry`` times with the module-level
    ``headers`` and ``timeout``.

    Args:
        url: endpoint to post to.
        data: JSON-serialisable payload.

    Returns:
        The parsed JSON document, or ``None`` when the payload cannot be
        serialised or every attempt failed.
    """
    # Serialise once; the body is identical for every attempt.
    try:
        body = json.dumps(data)
    except (TypeError, ValueError):
        return None
    for _ in range(retry):
        try:
            # Only `data=` is passed: requests ignores `json=` whenever
            # `data=` is supplied, so the former `json=data` had no effect.
            resp = requests.post(url, data=body, headers=headers, timeout=timeout)
            return json.loads(resp.content.decode("utf-8"))
        except (requests.RequestException, ValueError):
            # Network failure, or a body that is not valid UTF-8 JSON
            # (UnicodeDecodeError / JSONDecodeError are ValueErrors): retry.
            continue
    return None


def getHtml(url):
    """GET ``url`` and return the decoded JSON response.

    Up to ``retry`` attempts are made with the module-level ``headers`` and
    ``timeout``; any error triggers the next attempt. Returns ``None`` when no
    attempt succeeds.
    """
    attempts_left = retry
    while attempts_left > 0:
        attempts_left -= 1
        try:
            raw = requests.get(url, headers=headers, timeout=timeout).content
            text = raw.decode("utf-8")
            return json.loads(text)
        except Exception:
            continue
    return None


def getBrandQueue():
    """Fetch the current brand list and return it as a queue of brand dicts.

    Returns:
        A ``Queue`` holding every entry of the response's ``data.list``. The
        queue is empty when the request failed or the response has no usable
        list, so callers always receive a queue and never ``None``.
    """
    brandQueue = Queue(0)
    url = "https://api.*******.com/V5.3.0/site/currentBrand"
    data = {
        "version": "5.3.0",
        "debug": "false",
        "mt": "WX-micro",
        "inWechat": 1,
        "from": "micro",
        "deviceId": "deviceId"
    }
    resp = postHtml(url, data)
    if not resp:
        return brandQueue
    try:
        brandList = resp['data']['list']
    except (KeyError, IndexError, TypeError):
        # Malformed response: hand back the (empty) queue instead of None,
        # which the worker threads would fail on.
        return brandQueue
    # `or []` tolerates a null list in the response.
    for brand in brandList or []:
        brandQueue.put(brand)
    return brandQueue


def main():
    """Load the brand queue and start ``threadNums`` spider threads on it."""
    print("初始化爬蟲~")
    pending_brands = getBrandQueue()
    print("類目獲取完畢~")
    worker_no = 0
    while worker_no < threadNums:
        # Each worker is created and started before the next one is built.
        zrSpider(pending_brands, worker_no).start()
        worker_no += 1


# Run the crawler only when executed as a script.
if __name__ == '__main__':
    main()

小程序爬蟲接單、app爬蟲接單、網頁爬蟲接單、接口定製、網站開發、小程序開發 > 點擊這裏聯繫我們 <

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章