Python爬蟲系列之爬取某社區團微信小程序店鋪商品數據

Python爬蟲系列之爬取某社區團微信小程序店鋪商品數據

如有問題,請透過 QQ 聯繫:> 點擊這裏聯繫我們 <

微信請掃描下方二維碼

在這裏插入圖片描述

代碼僅供學習交流,請勿用於非法用途

  • 數據庫僅用於去重,數據主要存儲於 Excel

一、準備數據庫

-- Client character set for this session.
-- NOTE(review): the table below is declared utf8mb4 while this uses utf8 -- confirm this is intended.
set names utf8;
-- Recreate the `sqt` database from scratch; any existing `sqt` database is dropped first.
drop database if exists sqt;
create database sqt;

-- Everything that follows is created inside `sqt`.
use sqt;

-- Goods table used by the crawler below.
-- `goods_id` is the key the crawler looks rows up by (`where goods_id = ...`)
-- before deciding between update and insert, so it gets a secondary index.
-- The index is intentionally non-unique so that existing data still loads.
-- The `xs_nums` column no longer carries a COLLATE clause: it is an integer
-- column and collations only apply to character types.
-- NOTE(review): the crawler's update statement also sets columns that are not
-- declared here (`ckbz_dw`, `shop_tc`, `shujuhd3`, `c_address`, `hd_attrs`,
-- `pq_beizhu`) -- confirm which side is authoritative.
CREATE TABLE `goods_list` (
  `id` int(10) NOT NULL AUTO_INCREMENT COMMENT 'ID',
  `goods_id` bigint(20) NOT NULL COMMENT '唯一ID',
  `sj_area` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '上架區域',
  `goods_brand` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '商品品牌',
  `goods_code` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '商品編碼',
  `spu_id` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'SPU-ID',
  `gys_code` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '供應商編碼(留空)',
  `gys_name` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '供應商簡稱',
  `goods_name` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '商品名稱',
  `attrs` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '規格',
  `sc_price` decimal(10,2) DEFAULT NULL COMMENT '市場價',
  `ysj_price` decimal(10,2) DEFAULT NULL COMMENT '預售價',
  `pt_fei` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '平臺費(=預售價*10%,小數點1位)',
  `bzj` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT '3年' COMMENT '保證金(留空)',
  `gys_js_price` decimal(10,2) DEFAULT NULL COMMENT '供應商結算價=預售價-門店提成-平臺費',
  `shop_ghj_price` decimal(10,2) DEFAULT NULL COMMENT '門店供貨價=預售價-門店提成',
  `GMV` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'GMV=預售價*限定數量',
  `sc_riqi` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT '2020年' COMMENT '生產日期(默認2020年)',
  `zcfs` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '貯存方式(默認值:01常溫)',
  `bzq` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '保質期(默認值:3年)',
  `ghfs` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT '次日' COMMENT '供貨方式(默認值:次日)',
  `xd_num` int(10) DEFAULT NULL COMMENT '限定數量',
  `xg_num` int(10) DEFAULT NULL COMMENT '',
  `sj_bq` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '設計標籤(採集分類)',
  `cate1` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '一級分類(同設計標籤)',
  `cate2` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '二級分類',
  `guanzhu_num` int(10) DEFAULT NULL COMMENT '關注人數',
  `xs_nums` int(10) DEFAULT NULL COMMENT '銷售數量',
  `xs_e_price` decimal(10,2) DEFAULT NULL COMMENT '銷售額(銷售數量*預售價)',
  `sq_time` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '售罄時間(分鐘)',
  `sj_time` int(10) DEFAULT NULL COMMENT '上架時間',
  `xj_time` int(10) DEFAULT NULL COMMENT '下架時間',
  `start_time` int(10) DEFAULT NULL COMMENT '開始銷售時間',
  `end_time` int(10) DEFAULT NULL COMMENT '結束銷售時間',
  `qy_address` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '區域',
  `imageb_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '詳情頁地址(域名+ID)',
  `sy_image` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '首頁圖片',
  `haibao_image` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '海報圖片地址(2張滾動的圖片)',
  `images` text COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '詳情圖片',
  `sp_image` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '首頁視頻',
  `state` enum('0','1') COLLATE utf8mb4_unicode_ci DEFAULT '0' COMMENT '狀態值:0=下架,1=上架',
--  `prTitle` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'prTitle',
--  `prDetail` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'prDetail',
--  `tmBuyStart` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'tmBuyStart',
--  `tmPickUp` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'tmPickUp',
  `createtime`  int(10) DEFAULT NULL COMMENT '創建時間',
  PRIMARY KEY (`id`),
  KEY `idx_goods_id` (`goods_id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='完整商品表';

二、代碼實現

# -*- coding:utf-8 -*-
import requests
import json
from queue import Queue
import threading
import os
import time
import configparser
import MySQLdb
from bs4 import BeautifulSoup


# Number of attempts getHtml/postHtml make before giving up.
retry = 3
# Per-request timeout handed to requests.
timeout = 20
# Headers sent with every API request; the two placeholder values must be
# replaced with the user's own before the crawler can work.
headers = {
    "content-type": "application/json",
    "authorization": "請替換爲自己的authorization",
    "ver": "2.20.0",
    "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.11(0x17000b21) NetType/WIFI Language/zh_CN",
    "referer": "https://servicewechat.com/wxbbdca62c011eeb38/202/page-frame.html",
    "x-tingyun-id": "請替換爲自己的x-tingyun-id",
}
cf = configparser.ConfigParser()
# Interval start check (not referenced by any code visible in this file).
intervalStartTime = 29
confPath = os.getcwd() + "/conf.ini"
try:
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so a missing conf.ini has to be
    # detected from the return value, not from an exception.
    confLoaded = bool(cf.read(confPath, encoding="utf-8-sig"))
except Exception as e:
    # Raised when the file exists but cannot be decoded or parsed.
    print(e)
    confLoaded = False
if not confLoaded:
    print("程序目錄下不存在conf.ini配置文件~")
    exit(0)


def getConf(sec, key):
    """Return option ``key`` of section ``sec`` from the loaded configuration.

    Any lookup failure is reported on stdout and ends the program through
    ``exit(0)``, so callers only ever see a successfully read value.
    """
    try:
        value = cf.get(sec, key)
    except Exception as err:
        print(err)
        print("未得到以下配置:" + sec + " - " + key)
        exit(0)
    else:
        return value


# Crawl targets, comma separated; parser() expects each entry to look like
# "area-latitude-longitude".
try:
    keywords = getConf("app-sys", "keywords").split(",")
except Exception:
    print("keywords參數錯誤!")
    exit(0)

# Upper bound on the number of worker threads.
try:
    threadNums = int(getConf("app-sys", "threadNums"))
except Exception:
    print("threadNums參數錯誤!")
    exit(0)

# Start time points, comma separated.
startTime = getConf("app-sys", "start")
try:
    startTimes = startTime.split(",")
except Exception:
    startTimes = []

# Category titles that getCategoryQueue() leaves out, comma separated.
unexcept = getConf("app-sys", "unexcept")
try:
    unexcepts = unexcept.split(",")
except Exception:
    unexcepts = []

# Database account, password, database name, host address and port, read in
# that order from the [Mysql-Database] section.
mysql_user, mysql_password, mysql_database, mysql_host, mysql_port = (
    getConf("Mysql-Database", option)
    for option in ("user", "password", "database", "host", "port")
)


def execSQl(sql):
    """Run one write statement on a fresh connection and commit it.

    Args:
        sql: Complete SQL statement text.

    Returns:
        True when the statement was executed and committed, False on any
        failure. Failures are printed and never raised to the caller.
    """
    conn = None
    try:
        conn = MySQLdb.connect(
            user=mysql_user,
            password=mysql_password,
            host=mysql_host,
            # The port is read from conf.ini but was never passed on, so a
            # non-default port silently fell back to the driver default.
            port=int(mysql_port),
            database=mysql_database,
            charset='utf8',
        )
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        return True
    except Exception as e:
        # Best effort by design, but say why the statement was dropped.
        print(e)
        return False
    finally:
        # A new connection is opened per call; close it so repeated calls
        # do not pile up open connections on the server.
        if conn is not None:
            conn.close()


def querySQL(sql):
    """Run one read statement on a fresh connection and return all rows.

    Args:
        sql: Complete SQL statement text.

    Returns:
        The result of ``cursor.fetchall()`` on success, or False on any
        failure. Failures are printed and never raised to the caller.
    """
    conn = None
    try:
        conn = MySQLdb.connect(
            user=mysql_user,
            password=mysql_password,
            host=mysql_host,
            # The port is read from conf.ini but was never passed on, so a
            # non-default port silently fell back to the driver default.
            port=int(mysql_port),
            database=mysql_database,
            charset='utf8',
        )
        cursor = conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception as e:
        # Best effort by design, but say why the query produced nothing.
        print(e)
        return False
    finally:
        # A new connection is opened per call; close it so repeated calls
        # do not pile up open connections on the server.
        if conn is not None:
            conn.close()


def getHtml(url):
    """GET ``url`` and return its body parsed as JSON.

    Up to ``retry`` attempts are made; any error during a request or while
    decoding simply triggers the next attempt. Returns None when every
    attempt failed.
    """
    for _attempt in range(retry):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            body = response.content.decode("utf-8")
            return json.loads(body)
        except Exception:
            pass
    return None


def postHtml(url, data):
    """POST ``data`` as a JSON body to ``url`` and return the parsed JSON reply.

    Up to ``retry`` attempts are made; any error during a request or while
    decoding simply triggers the next attempt. Returns None when every
    attempt failed.
    """
    for _attempt in range(retry):
        try:
            payload = json.dumps(data)
            response = requests.post(url, headers=headers, data=payload, timeout=timeout)
            body = response.content.decode("utf-8")
            return json.loads(body)
        except Exception:
            pass
    return None


def getCurrDate():
    """Return today's local date formatted like ``2020年01月31日``."""
    # The CJK unit characters are filled in after strftime, which only sees
    # ASCII placeholders.
    template = time.strftime('%Y{y}%m{m}%d{d}')
    units = {"y": '年', "m": '月', "d": '日'}
    return template.format(**units)


def dateTots(s):
    """Convert a ``YYYY/MM/DD HH:MM:SS`` local-time string to a Unix timestamp.

    Returns 0 when ``s`` cannot be parsed or converted.
    """
    pattern = "%Y/%m/%d %H:%M:%S"
    try:
        parsed = time.strptime(s, pattern)
        seconds = int(time.mktime(parsed))
    except Exception:
        seconds = 0
    return seconds


class shtSpider(threading.Thread):
    """Worker thread that crawls the goods of the queued categories.

    Each worker takes category dicts from a shared queue, pages through the
    category's goods list, fetches every item's detail and then updates or
    adds the resulting row in ``goods_list``.
    """

    def __init__(self, categoryQueue, index, city, partnerId, grouponId, *args, **kwargs):
        """Store the shared queue and the identifiers sent with every request.

        Args:
            categoryQueue: Queue of category dicts shared by all workers.
            index: Position of this worker among its siblings.
            city: Area name used to build the area columns of each row.
            partnerId: Partner id sent with every API request.
            grouponId: Groupon id sent with every API request.
            *args, **kwargs: Forwarded to ``threading.Thread``.
        """
        super(shtSpider, self).__init__(*args, **kwargs)
        self.categoryQueue = categoryQueue
        self.index = index
        self.city = city
        # These were accepted but never stored, so every request builder
        # below failed with AttributeError on self.partnerId / self.grouponId.
        self.partnerId = partnerId
        self.grouponId = grouponId

    def getTotalPages(self, categoryId):
        """Return the page count of a category, or None when it cannot be read."""
        url = "https://api.*****.net/mc/diamondV2/list-merchandise"
        data = {
            "diamondId": str(categoryId),
            "grouponId": self.grouponId,
            "partnerId": self.partnerId,
            "p": "1",
            "size": "10"
        }
        resp = postHtml(url, data)
        try:
            return int(resp['data']['totalPages'])
        except (KeyError, TypeError, ValueError):
            # No reply, unexpected reply shape or a non-numeric page count.
            return None

    def getGoodsList(self, categoryId, page):
        """Return the goods entries of one page of a category, or None on failure."""
        url = "https://api.*****.net/mc/diamondV2/list-merchandise"
        data = {
            "diamondId": str(categoryId),
            "grouponId": self.grouponId,
            "partnerId": self.partnerId,
            "p": int(page),
            "size": 10
        }
        resp = postHtml(url, data)
        try:
            return resp['data']['grouponMerchandiseList']
        except (KeyError, TypeError):
            # No reply or unexpected reply shape.
            return None

    @staticmethod
    def _bracketBrand(title):
        """Wrap the leading space-separated token of ``title`` in 【】 unless already bracketed."""
        if not title or title.startswith("【"):
            return title
        brand, sep, rest = title.partition(" ")
        if not sep or not brand:
            # Single-token titles and titles starting with a space are kept as is.
            return title
        # Only the leading token is wrapped; str.replace() used to rewrite
        # every later occurrence of that token as well.
        return "【" + brand + "】" + sep + rest

    @staticmethod
    def _field(detail, key, default):
        """Return ``detail[key]``, or ``default`` when the payload does not provide it."""
        try:
            return detail[key]
        except (KeyError, TypeError, IndexError):
            return default

    @staticmethod
    def _price(detail, key):
        """Read ``detail[key]`` as a float, falling back to 0.00 when missing or malformed."""
        try:
            return float(detail[key])
        except (KeyError, TypeError, ValueError, OverflowError):
            return 0.00

    def getGoodsDetail(self, merchandiseId, merchtypeId, categoryName):
        """Fetch one item's detail and map it to a dict of row values.

        Args:
            merchandiseId: Item id taken from the goods list entry.
            merchtypeId: Item type id taken from the goods list entry.
            categoryName: Title of the category being crawled (currently unused).

        Returns:
            A dict keyed by column name, or None when the request failed, the
            reply had no ``data`` member or the goods id could not be built.
            Individual fields that cannot be read fall back to an empty or
            zero value instead of failing the whole item.
        """
        url = "https://api.*****.net/mc/merchandise/detail"
        payload = {
            "grouponId": self.grouponId,
            "partnerId": self.partnerId,
            "merchandiseId": str(merchandiseId),
            "merchtypeId": str(merchtypeId)
        }
        resp = postHtml(url, payload)
        if not resp:
            return None
        try:
            data = resp['data']
            datas = {}
            try:
                # NOTE(review): `appflag` is not defined in the visible part of
                # this file; while it is missing every item is skipped here.
                datas['goods_id'] = int(appflag + str(merchtypeId) + str(merchandiseId))
            except Exception:
                return None
            try:
                datas['sj_area'] = self.city + "十|薈}團"
            except TypeError:
                datas['sj_area'] = ""
            try:
                datas['goods_name'] = self._bracketBrand(str(data['title']))
            except (KeyError, TypeError):
                datas['goods_name'] = ""
            datas['attrs'] = ""
            datas['ckbz_dw'] = "大件"
            datas['sc_price'] = self._price(data, 'originprice')
            datas['ysj_price'] = self._price(data, 'activityprice')
            presale = datas['ysj_price']
            # Store commission and platform fee are both 10% of the presale
            # price, rounded to one decimal and kept as text.
            fee = str(float('%.1f' % (presale * 0.1)))
            datas['shop_tc'] = fee
            datas['pt_fei'] = fee
            datas['gys_js_price'] = float("%.2f" % (presale - float(fee) - float(fee)))
            datas['shop_ghj_price'] = presale - float(fee)
            try:
                datas['xs_e_price'] = float("%.2f" % (int(data['waterQuantity']) * presale))
            except (KeyError, TypeError, ValueError, OverflowError):
                datas['xs_e_price'] = 0.00
            # Shelf times and sale times are read from the same two fields;
            # dateTots() maps anything unparsable to 0.
            startTs = dateTots(self._field(data, 'startTime', None))
            endTs = dateTots(self._field(data, 'endTime', None))
            datas['sj_time'] = startTs
            datas['xj_time'] = endTs
            datas['start_time'] = startTs
            datas['end_time'] = endTs
            try:
                datas['qy_address'] = self.city + "十?薈d團"
            except TypeError:
                datas['qy_address'] = ""
            try:
                # NOTE(review): `detailPre` is not defined in the visible part
                # of this file; while it is missing this column stays empty.
                datas['imageb_url'] = detailPre + str(datas['goods_id'])
            except Exception:
                datas['imageb_url'] = ""
            datas['sy_image'] = self._field(data, 'itemimage', "")
            try:
                posterUrls = []
                # Only the first two carousel entries are kept.
                for image in data['carouselFileList'][:2]:
                    try:
                        # Keep the URL without its query string.
                        posterUrls.append(image['url'].split("?", 1)[0])
                    except (KeyError, TypeError, IndexError, AttributeError):
                        # Entry is not a dict with a text `url`; keep it as is.
                        posterUrls.append(image)
                datas['haibao_image'] = ",".join(posterUrls)
            except (KeyError, TypeError):
                datas['haibao_image'] = ""
            try:
                soup = BeautifulSoup(data['description'], "html.parser")
                # <img> tags without a src attribute are skipped.
                detailImgs = [img['src'] for img in soup.find_all("img") if img.has_attr('src')]
                datas['images'] = ",".join(detailImgs)
            except Exception:
                datas['images'] = ""
            datas['state'] = "0"
            datas['createtime'] = int(time.time())
            datas['gys_name'] = self._field(data, 'supplierName', "")
            return datas
        except Exception as e:
            # Best effort: one unreadable item must not stop the worker.
            print(e)
            return None

    def checkGoodsExists(self, pid):
        """Return True when a row with goods id ``pid`` is already stored.

        A non-numeric ``pid`` and a failed query both count as "not stored".
        """
        try:
            sql = "select * from goods_list where goods_id = %d" % int(pid)
        except (TypeError, ValueError):
            return False
        # querySQL() yields False on failure and an empty result when no row
        # matches; both are falsy.
        return bool(querySQL(sql))

    def update(self, data):
        """Overwrite the stored row whose goods id equals ``data['goods_id']``.

        Errors (including missing keys in ``data``) are printed, not raised.
        """
        print("update ----------------------------------------------------")
        print(data)
        # NOTE(review): this statement needs keys that getGoodsDetail() never
        # sets (e.g. 'shujuhd3', 'GMV', 'c_address', 'xd_num', 'xs_nums'), so
        # with the code visible here it always stops at the KeyError below.
        # NOTE(review): values fetched from the remote API are spliced into the
        # SQL text unescaped; a quote in any of them breaks the statement.
        # execSQl() only accepts finished SQL text, so switching to a
        # parameterized query needs a change there as well.
        try:
            sql = "update goods_list set `sj_area` = '%s', `goods_name` = '%s', `attrs` = '%s', `ckbz_dw` = '%s', `sc_price` = %f, `ysj_price` = %f, `shop_tc` = '%s', `pt_fei` = '%s', `gys_js_price` = %f, `shop_ghj_price` = %f, `shujuhd3` = '%s', `GMV` = '%s', `c_address` = '%s', `sc_riqi` = '%s', `zcfs` = '%s', `bzq` = '%s', `ghfs` = '%s', `xd_num` = %d, `xg_num` = %d, `hd_attrs` = '%s', `sj_bq` = '%s', `cate1` = '%s', `cate2` = '%s', `pq_beizhu` = '%s', `xs_nums` = %d, `xs_e_price` = %f, `sj_time` = %d, `xj_time` = %d, `start_time` = %d, `end_time` = %d, `qy_address` = '%s', `imageb_url` = '%s', `sy_image` = '%s', `haibao_image` = '%s', `images` = '%s', `state` = '%s', `createtime` = %d, `gys_name` = '%s' where goods_id = %d" % (
                data['sj_area'], data['goods_name'], data['attrs'], data['ckbz_dw'],
                data['sc_price'], data['ysj_price'], data['shop_tc'], data['pt_fei'],
                data['gys_js_price'], data['shop_ghj_price'], data['shujuhd3'], data['GMV'],
                data['c_address'], data['sc_riqi'], data['zcfs'], data['bzq'],
                data['ghfs'], data['xd_num'], data['xg_num'], data['hd_attrs'],
                data['sj_bq'], data['cate1'], data['cate2'], data['pq_beizhu'],
                data['xs_nums'], data['xs_e_price'], data['sj_time'], data['xj_time'],
                data['start_time'], data['end_time'], data['qy_address'], data['imageb_url'],
                data['sy_image'], data['haibao_image'], data['images'], data['state'],
                data['createtime'], data['gys_name'], data['goods_id'])
            execSQl(sql)
        except Exception as e:
            print(e)

    def run(self):
        """Drain the shared category queue, storing every item found."""
        # Imported here because the module itself only imports Queue.
        from queue import Empty

        while True:
            try:
                # Non-blocking: checking empty() and then calling get() could
                # block forever once another worker took the last category.
                category = self.categoryQueue.get_nowait()
            except Empty:
                break
            categoryName = category['title']
            totalPage = self.getTotalPages(category['categoryId'])
            if not totalPage:
                continue
            for page in range(1, totalPage + 1):
                goodsList = self.getGoodsList(category['categoryId'], page)
                if not goodsList:
                    continue
                for goods in goodsList:
                    data = self.getGoodsDetail(goods['merchandiseid'], goods['merchtypeid'], categoryName)
                    if not data:
                        continue
                    if self.checkGoodsExists(data['goods_id']):
                        self.update(data)
                    else:
                        # NOTE(review): no `add` method is defined in the
                        # visible part of this class -- confirm where it lives.
                        self.add(data)


def getCategoryQueue(partnerId, grouponId):
    """Fetch the category list and return a queue of the categories to crawl.

    Categories whose title appears in ``unexcepts`` are left out. If the reply
    cannot be read, a "login expired" message is printed and the program exits
    after a ten second pause.
    """
    url = "https://api.*****.net/mc/groupClassify/v3/categoryList"
    payload = {
        "partnerId": str(partnerId),
        "grouponId": str(grouponId),
        "isPartner": 0
    }
    resp = postHtml(url, payload)
    pending = Queue(0)
    try:
        for category in resp['data']:
            if category['title'] in unexcepts:
                continue
            pending.put(category)
    except Exception:
        print("登錄過期~")
        time.sleep(10)
        exit(0)
    return pending


def getKeysList():
    """Return a new list holding every configured keyword (empty when none are set)."""
    if not keywords:
        return []
    return [keyword for keyword in keywords]


def getNearTeam(lat, lng):
    """Look up the first partner listed for a coordinate.

    Returns a ``(partnerId, grouponId)`` tuple of strings, or None when the
    reply does not contain one.
    """
    url = "https://api.*****.net/partner/near"
    res = postHtml(url, {"lat": lat, "lng": lng})
    try:
        nearest = res['data']['list'][0]
        return str(nearest['partnerId']), str(nearest['grouponId'])
    except Exception:
        return None


def parser():
    """Crawl every configured location with a pool of worker threads.

    Each keyword is expected to look like ``area-latitude-longitude``. For
    every keyword the nearest partner is resolved, its categories are queued
    and at most ``threadNums`` workers drain that queue. A failure for one
    keyword is reported on stdout and the next keyword is processed.
    """
    keys = getKeysList()
    if not keys:
        return
    for key in keys:
        try:
            city, lat, lng = key.split("-")
            partnerId, grouponId = getNearTeam(lat, lng)
            categoryQueue = getCategoryQueue(partnerId, grouponId)
            # Per-location worker count. The configured threadNums must stay
            # untouched: overwriting it let one small (or empty) location
            # shrink the pool, possibly to zero, for every later location.
            workerCount = min(threadNums, categoryQueue.qsize())
            workers = [
                shtSpider(categoryQueue, i, city, partnerId, grouponId)
                for i in range(workerCount)
            ]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()
        except Exception as e:
            # Not every failure is a format problem (for example no partner
            # was found), so show the actual error before the usual hint.
            print(e)
            print("關鍵詞:%s 格式錯誤,正確格式爲:地區-緯度-經度" % str(key))


def getCurrTime():
    """Return the current local time of day as ``HH:MM``."""
    now = time.localtime()
    return time.strftime('%H:%M', now)


def main():
    """Print the start-up banner, then run one crawl over all configured keywords."""
    banner = "啓動時任務爬蟲!"
    print(banner)
    parser()


# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()

小程序爬蟲接單、app爬蟲接單、網頁爬蟲接單、接口定製、網站開發、小程序開發 > 點擊這裏聯繫我們 <

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章