Python爬蟲系列之爬取某社區團微信小程序店鋪商品數據
如有問題,請通過QQ聯繫我們(點擊這裏)
微信請掃描下方二維碼
代碼僅供學習交流,請勿用於非法用途
- 數據庫僅用於去重使用,數據主要存儲於excel
一、準備數據庫
-- Schema for the MySQL database the crawler below uses to de-duplicate products.
-- WARNING: this script drops any existing `sqt` database before recreating it.
set names utf8;
drop database if exists sqt;
create database sqt;
use sqt;
-- goods_list holds one row per crawled product; the crawler looks rows up by goods_id.
-- NOTE(review): the crawler's UPDATE statement also writes the columns ckbz_dw, shop_tc,
-- shujuhd3, c_address, hd_attrs and pq_beizhu, none of which are declared below --
-- confirm whether this schema or that statement is out of date.
CREATE TABLE `goods_list` (
`id` int(10) NOT NULL AUTO_INCREMENT COMMENT 'ID',
`goods_id` bigint(20) NOT NULL COMMENT '唯一ID',
`sj_area` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '上架區域',
`goods_brand` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '商品品牌',
`goods_code` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '商品編碼',
`spu_id` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'SPU-ID',
`gys_code` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '供應商編碼(留空)',
`gys_name` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '供應商簡稱',
`goods_name` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '商品名稱',
`attrs` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '規格',
`sc_price` decimal(10,2) DEFAULT NULL COMMENT '市場價',
`ysj_price` decimal(10,2) DEFAULT NULL COMMENT '預售價',
`pt_fei` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '平臺費(=預售價*10%,小數點1位)',
-- NOTE(review): the column comment on bzj says "leave empty" but its DEFAULT is '3年',
-- while bzq further down says "default 3年" and has DEFAULT NULL -- the two defaults look swapped; confirm.
`bzj` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT '3年' COMMENT '保證金(留空)',
`gys_js_price` decimal(10,2) DEFAULT NULL COMMENT '供應商結算價=預售價-門店提成-平臺費',
`shop_ghj_price` decimal(10,2) DEFAULT NULL COMMENT '門店供貨價=預售價-門店提成',
`GMV` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'GMV=預售價*限定數量',
`sc_riqi` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT '2020年' COMMENT '生產日期(默認2020年)',
`zcfs` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '貯存方式(默認值:01常溫)',
`bzq` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '保質期(默認值:3年)',
`ghfs` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT '次日' COMMENT '供貨方式(默認值:次日)',
`xd_num` int(10) DEFAULT NULL COMMENT '限定數量',
-- NOTE(review): xg_num has an empty column comment; its meaning is not documented here.
`xg_num` int(10) DEFAULT NULL COMMENT '',
`sj_bq` varchar(20) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '設計標籤(採集分類)',
`cate1` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '一級分類(同設計標籤)',
`cate2` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '二級分類',
`guanzhu_num` int(10) DEFAULT NULL COMMENT '關注人數',
-- NOTE(review): a COLLATE clause on an integer column has no purpose; it looks like a copy/paste leftover.
`xs_nums` int(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '銷售數量',
`xs_e_price` decimal(10,2) DEFAULT NULL COMMENT '銷售額(銷售數量*預售價)',
`sq_time` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '售罄時間(分鐘)',
-- The four time columns below are integers; the crawler fills them with Unix timestamps.
`sj_time` int(10) DEFAULT NULL COMMENT '上架時間',
`xj_time` int(10) DEFAULT NULL COMMENT '下架時間',
`start_time` int(10) DEFAULT NULL COMMENT '開始銷售時間',
`end_time` int(10) DEFAULT NULL COMMENT '結束銷售時間',
`qy_address` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '區域',
`imageb_url` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '詳情頁地址(域名+ID)',
`sy_image` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '首頁圖片',
`haibao_image` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '海報圖片地址(2張滾動的圖片)',
`images` text COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '詳情圖片',
`sp_image` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '首頁視頻',
`state` enum('0','1') COLLATE utf8mb4_unicode_ci DEFAULT '0' COMMENT '狀態值:0=下架,1=上架',
-- The next four columns are disabled (commented out) in this version of the schema.
-- `prTitle` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'prTitle',
-- `prDetail` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'prDetail',
-- `tmBuyStart` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'tmBuyStart',
-- `tmPickUp` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT 'tmPickUp',
`createtime` int(10) DEFAULT NULL COMMENT '創建時間',
PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='完整商品表';
二、代碼實現
# -*- coding:utf-8 -*-
import requests
import json
from queue import Queue
import threading
import os
import time
import configparser
import MySQLdb
from bs4 import BeautifulSoup
# Number of attempts getHtml()/postHtml() make before giving up on a request.
retry = 3
# Per-request timeout handed to requests, in seconds.
timeout = 20
# HTTP headers sent with every API request. The "authorization" and
# "x-tingyun-id" values are placeholders that must be replaced before running.
headers = {
    "content-type": "application/json",
    "authorization": "請替換爲自己的authorization",
    "ver": "2.20.0",
    "user-agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
        "MicroMessenger/7.0.11(0x17000b21) NetType/WIFI Language/zh_CN"
    ),
    "referer": "https://servicewechat.com/wxbbdca62c011eeb38/202/page-frame.html",
    "x-tingyun-id": "請替換爲自己的x-tingyun-id",
}
# Shared parser holding the settings loaded from conf.ini; getConf() reads from it.
cf = configparser.ConfigParser()
# Interval for the scheduled-start check (translated from the original comment).
# NOTE(review): not referenced anywhere else in this file and its unit is not stated -- confirm.
intervalStartTime = 29
# conf.ini is looked up in the current working directory.
_confPath = os.getcwd() + "/conf.ini"
try:
    # "utf-8-sig" lets a file saved with a UTF-8 byte-order mark be read as well.
    _loadedConfFiles = cf.read(_confPath, encoding="utf-8-sig")
except Exception as e:
    # read() only raises for a file that exists but cannot be decoded or parsed.
    print("conf.ini配置文件解析失敗: %s" % e)
    exit(0)
# read() silently skips files it cannot open and returns the list of files it
# actually parsed, so a missing conf.ini shows up as an empty list, not an exception.
if not _loadedConfFiles:
    print("程序目錄下不存在conf.ini配置文件~")
    exit(0)
def getConf(sec, key):
    """Return option ``key`` of section ``sec`` from the loaded configuration.

    When the lookup fails the error and the missing section/option are
    printed and the program exits; the function only returns on success.
    """
    try:
        value = cf.get(sec, key)
    except Exception as err:
        print(err)
        print("未得到以下配置:" + sec + " - " + key)
        exit(0)
    return value
# Crawl targets from [app-sys] keywords, comma-separated; parser() expects each
# entry in the form "region-latitude-longitude".
try:
    keywords = getConf("app-sys", "keywords").split(",")
except Exception:
    print("keywords參數錯誤!")
    exit(0)

# Upper bound on the number of crawler threads started per target.
try:
    threadNums = int(getConf("app-sys", "threadNums"))
except Exception:
    print("threadNums參數錯誤!")
    exit(0)

# Start time points (comma-separated in the configuration).
startTime = getConf("app-sys", "start")
try:
    startTimes = startTime.split(",")
except Exception:
    startTimes = []

# Category titles to leave out of the crawl (comma-separated in the configuration).
unexcept = getConf("app-sys", "unexcept")
try:
    unexcepts = unexcept.split(",")
except Exception:
    unexcepts = []
# MySQL connection settings from the [Mysql-Database] section of conf.ini,
# read in this order: account, password, database name, host address, port.
mysql_user, mysql_password, mysql_database, mysql_host, mysql_port = [
    getConf("Mysql-Database", option)
    for option in ("user", "password", "database", "host", "port")
]
def execSQl(sql):
    """Run one write statement on a fresh connection and commit it.

    Args:
        sql: Complete SQL statement text to execute.

    Returns:
        True when the statement was executed and committed, False on any error
        (the error is printed rather than raised).
    """
    conn = None
    try:
        conn = MySQLdb.connect(
            user=mysql_user,
            password=mysql_password,
            host=mysql_host,
            # The configured port used to be read but never passed on.
            port=int(mysql_port),
            database=mysql_database,
            # NOTE(review): the table is declared utf8mb4; 'utf8' here may reject
            # 4-byte characters -- confirm before changing.
            charset='utf8',
        )
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # A new connection is opened on every call, so it must be released here.
        if conn is not None:
            conn.close()
def querySQL(sql):
    """Run one read statement on a fresh connection and return all rows.

    Args:
        sql: Complete SQL statement text to execute.

    Returns:
        The result of ``cursor.fetchall()`` on success, or False on any error
        (the error is printed rather than raised).
    """
    conn = None
    try:
        conn = MySQLdb.connect(
            user=mysql_user,
            password=mysql_password,
            host=mysql_host,
            # The configured port used to be read but never passed on.
            port=int(mysql_port),
            database=mysql_database,
            charset='utf8',
        )
        cursor = conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception as e:
        print(e)
        return False
    finally:
        # A new connection is opened on every call, so it must be released here.
        if conn is not None:
            conn.close()
def getHtml(url):
    """GET ``url`` and return the decoded JSON body.

    The request is attempted up to ``retry`` times; any failure (network error,
    bad encoding, invalid JSON) moves on to the next attempt. Returns None
    when every attempt failed.
    """
    attempt = 0
    while attempt < retry:
        attempt += 1
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            body = response.content.decode("utf-8")
            return json.loads(body)
        except Exception:
            pass
    return None
def postHtml(url, data):
    """POST ``data`` as a JSON body to ``url`` and return the decoded JSON reply.

    The request is attempted up to ``retry`` times; any failure (network error,
    bad encoding, invalid JSON) moves on to the next attempt. Returns None
    when every attempt failed.
    """
    attempt = 0
    while attempt < retry:
        attempt += 1
        try:
            response = requests.post(url, headers=headers, data=json.dumps(data), timeout=timeout)
            body = response.content.decode("utf-8")
            return json.loads(body)
        except Exception:
            pass
    return None
def getCurrDate():
    """Return today's local date formatted as ``YYYY年MM月DD日``."""
    now = time.localtime()
    year = time.strftime('%Y', now)
    month = time.strftime('%m', now)
    day = time.strftime('%d', now)
    return year + '年' + month + '月' + day + '日'
def dateTots(s):
    """Convert a ``YYYY/MM/DD HH:MM:SS`` local-time string to a Unix timestamp.

    Returns the timestamp as an int, or 0 when ``s`` cannot be parsed.
    """
    try:
        parsed = time.strptime(s, "%Y/%m/%d %H:%M:%S")
        seconds = time.mktime(parsed)
    except Exception:
        return 0
    return int(seconds)
class shtSpider(threading.Thread):
    """Worker thread that crawls the categories taken from a shared queue.

    For every category it pages through the merchandise list, fetches each
    product's detail record, and then either updates the matching
    ``goods_list`` row or hands the record to ``self.add`` for a new product.

    NOTE(review): ``add`` is called by ``run`` but is not defined in the
    visible source; likewise ``appflag`` and ``detailPre`` (used by
    ``getGoodsDetail``) are not defined in this file. They must be supplied
    before the crawler can store anything.
    """

    # (column, value placeholder) pairs in the order used by the UPDATE statement.
    _UPDATE_COLUMNS = (
        ('sj_area', "'%s'"),
        ('goods_name', "'%s'"),
        ('attrs', "'%s'"),
        ('ckbz_dw', "'%s'"),
        ('sc_price', "%f"),
        ('ysj_price', "%f"),
        ('shop_tc', "'%s'"),
        ('pt_fei', "'%s'"),
        ('gys_js_price', "%f"),
        ('shop_ghj_price', "%f"),
        ('shujuhd3', "'%s'"),
        ('GMV', "'%s'"),
        ('c_address', "'%s'"),
        ('sc_riqi', "'%s'"),
        ('zcfs', "'%s'"),
        ('bzq', "'%s'"),
        ('ghfs', "'%s'"),
        ('xd_num', "%d"),
        ('xg_num', "%d"),
        ('hd_attrs', "'%s'"),
        ('sj_bq', "'%s'"),
        ('cate1', "'%s'"),
        ('cate2', "'%s'"),
        ('pq_beizhu', "'%s'"),
        ('xs_nums', "%d"),
        ('xs_e_price', "%f"),
        ('sj_time', "%d"),
        ('xj_time', "%d"),
        ('start_time', "%d"),
        ('end_time', "%d"),
        ('qy_address', "'%s'"),
        ('imageb_url', "'%s'"),
        ('sy_image', "'%s'"),
        ('haibao_image', "'%s'"),
        ('images', "'%s'"),
        ('state', "'%s'"),
        ('createtime', "%d"),
        ('gys_name', "'%s'"),
    )

    def __init__(self, categoryQueue, index, city, partnerId, grouponId, *args, **kwargs):
        """Create a worker.

        Args:
            categoryQueue: Queue of category dicts shared by all workers.
            index: Position of this worker among the started workers.
            city: Region name used to label the stored records.
            partnerId: Partner id sent with every API request.
            grouponId: Groupon id sent with every API request.
            *args, **kwargs: Forwarded to ``threading.Thread``.
        """
        super(shtSpider, self).__init__(*args, **kwargs)
        self.categoryQueue = categoryQueue
        self.index = index
        self.city = city
        # Every request payload below reads these two attributes; they were
        # previously accepted but never stored, so the first request failed.
        self.partnerId = partnerId
        self.grouponId = grouponId

    def getTotalPages(self, categoryId):
        """Return the number of result pages for a category, or None on failure."""
        url = "https://api.*****.net/mc/diamondV2/list-merchandise"
        data = {
            "diamondId": str(categoryId),
            "grouponId": self.grouponId,
            "partnerId": self.partnerId,
            "p": "1",
            "size": "10"
        }
        resp = postHtml(url, data)
        try:
            return int(resp['data']['totalPages'])
        except Exception:
            # Covers a failed request (resp is None) and an unexpected reply shape.
            return

    def getGoodsList(self, categoryId, page):
        """Return the merchandise list of one result page, or None on failure."""
        url = "https://api.*****.net/mc/diamondV2/list-merchandise"
        data = {
            "diamondId": str(categoryId),
            "grouponId": self.grouponId,
            "partnerId": self.partnerId,
            "p": int(page),
            "size": 10
        }
        resp = postHtml(url, data)
        try:
            return resp['data']['grouponMerchandiseList']
        except Exception:
            return

    def getGoodsDetail(self, merchandiseId, merchtypeId, categoryName):
        """Fetch one product's detail record and map it to ``goods_list`` fields.

        Args:
            merchandiseId: Product id taken from the list entry.
            merchtypeId: Product type id taken from the list entry.
            categoryName: Title of the category being crawled (currently unused).

        Returns:
            A dict of column values, or None when the request failed or the
            unique ``goods_id`` could not be built. Every other field falls
            back to an empty or zero value when it cannot be derived.
        """
        url = "https://api.*****.net/mc/merchandise/detail"
        payload = {
            "grouponId": self.grouponId,
            "partnerId": self.partnerId,
            "merchandiseId": str(merchandiseId),
            "merchtypeId": str(merchtypeId)
        }
        goods_id = str(merchtypeId) + str(merchandiseId)
        resp = postHtml(url, payload)
        if resp:
            try:
                data = resp['data']
                datas = {}
                try:
                    # NOTE(review): appflag is not defined in the visible source;
                    # until it is, this lookup fails and the method returns None.
                    datas['goods_id'] = int(appflag + str(goods_id))
                except Exception:
                    return
                try:
                    datas['sj_area'] = self.city + "十|薈}團"
                except Exception:
                    datas['sj_area'] = ""
                try:
                    goods_name = str(data['title'])
                    if "【" != goods_name[0]:
                        pname = goods_name.split(" ")
                        # Wrap only the leading token in brackets. Replacing every
                        # occurrence mangled titles that start with a space (empty
                        # first token) or that repeat the leading token later on.
                        if len(pname) > 1 and pname[0]:
                            goods_name = goods_name.replace(pname[0], "【" + pname[0] + "】", 1)
                    datas['goods_name'] = goods_name
                except Exception:
                    datas['goods_name'] = ""
                datas['attrs'] = ""
                datas['ckbz_dw'] = "大件"
                try:
                    datas['sc_price'] = float(data['originprice'])
                except Exception:
                    datas['sc_price'] = 0.00
                try:
                    datas['ysj_price'] = float(data['activityprice'])
                except Exception:
                    datas['ysj_price'] = 0.00
                # Store commission and platform fee are both 10% of the presale
                # price, rounded to one decimal and kept as strings.
                try:
                    datas['shop_tc'] = str(float('%.1f' % float(datas['ysj_price'] * 0.1)))
                except Exception:
                    datas['shop_tc'] = "0.0"
                try:
                    datas['pt_fei'] = str(float('%.1f' % float(datas['ysj_price'] * 0.1)))
                except Exception:
                    datas['pt_fei'] = "0.0"
                try:
                    # Supplier settlement price = presale price - commission - platform fee.
                    datas['gys_js_price'] = float(
                        "%.2f" % float(datas['ysj_price'] - float(datas['shop_tc']) - float(datas['pt_fei'])))
                except Exception:
                    datas['gys_js_price'] = 0.00
                try:
                    # Store supply price = presale price - commission.
                    datas['shop_ghj_price'] = float(datas['ysj_price'] - float(datas['shop_tc']))
                except Exception:
                    datas['shop_ghj_price'] = 0.00
                try:
                    # Sales amount = the reply's waterQuantity * presale price.
                    datas['xs_e_price'] = float("%.2f" % (int(data['waterQuantity']) * datas['ysj_price']))
                except Exception:
                    datas['xs_e_price'] = 0.00
                # Listing and sales windows are both taken from the same two fields.
                try:
                    datas['sj_time'] = dateTots(data['startTime'])
                except Exception:
                    datas['sj_time'] = 0
                try:
                    datas['xj_time'] = dateTots(data['endTime'])
                except Exception:
                    datas['xj_time'] = 0
                try:
                    datas['start_time'] = dateTots(data['startTime'])
                except Exception:
                    datas['start_time'] = 0
                try:
                    datas['end_time'] = dateTots(data['endTime'])
                except Exception:
                    datas['end_time'] = 0
                try:
                    datas['qy_address'] = self.city + "十?薈d團"
                except Exception:
                    datas['qy_address'] = ""
                try:
                    # NOTE(review): detailPre is not defined in the visible source,
                    # so this currently always falls back to "".
                    datas['imageb_url'] = detailPre + str(datas['goods_id'])
                except Exception:
                    datas['imageb_url'] = ""
                try:
                    datas['sy_image'] = data['itemimage']
                except Exception:
                    datas['sy_image'] = ""
                try:
                    # Keep the first two carousel images, without their query strings.
                    images = data['carouselFileList'][:2]
                    imagesList = []
                    for image in images:
                        try:
                            rrr = image['url']
                            if "?" in rrr:
                                rrr = rrr[:rrr.find("?")]
                            imagesList.append(rrr)
                        except Exception:
                            # Entry is not a dict with a 'url' key; keep it as-is.
                            imagesList.append(image)
                    datas['haibao_image'] = ",".join(imagesList)
                except Exception:
                    datas['haibao_image'] = ""
                try:
                    # Collect the src of every <img> in the HTML description.
                    description = data['description']
                    descriptionImgs = []
                    descriptionSoup = BeautifulSoup(description, "html.parser")
                    descriptionSImgs = descriptionSoup.find_all("img")
                    for descriptionSImg in descriptionSImgs:
                        try:
                            descriptionImgs.append(descriptionSImg['src'])
                        except Exception:
                            pass
                    datas['images'] = ",".join(descriptionImgs)
                except Exception:
                    datas['images'] = ""
                datas['state'] = "0"
                datas['createtime'] = int(time.time())
                try:
                    datas['gys_name'] = data['supplierName']
                except Exception:
                    datas['gys_name'] = ""
                return datas
            except Exception:
                pass
        return

    def checkGoodsExists(self, pid):
        """Return True when a ``goods_list`` row with ``goods_id == pid`` exists.

        Any failure (non-numeric ``pid``, failed query) is reported as False.
        """
        try:
            sql = "select * from goods_list where goods_id = %d" % int(pid)
            res = querySQL(sql)
            return len(res) > 0
        except Exception:
            return False

    def _buildUpdateSql(self, data):
        """Build the UPDATE statement for ``data``, or None if nothing to set.

        Only columns that are present in ``data`` are assigned, in the fixed
        order of ``_UPDATE_COLUMNS``; with every column present the statement
        is identical to the original hard-coded one.

        NOTE(review): values are interpolated into the SQL text without
        escaping, so a quote in crawled text breaks the statement; switching
        to parameterized execution needs a change to ``execSQl`` as well.
        """
        assignments = [
            ("`%s` = " + placeholder) % (column, data[column])
            for column, placeholder in self._UPDATE_COLUMNS
            if column in data
        ]
        if not assignments:
            return None
        return "update goods_list set %s where goods_id = %d" % (", ".join(assignments), data['goods_id'])

    def update(self, data):
        """Write the crawled values in ``data`` to the existing ``goods_list`` row.

        The previous implementation indexed columns that ``getGoodsDetail``
        never produces and therefore always failed with a KeyError.
        Errors are printed, never raised.
        """
        print("update ----------------------------------------------------")
        print(data)
        try:
            sql = self._buildUpdateSql(data)
            if sql:
                execSQl(sql)
        except Exception as e:
            print(e)

    def run(self):
        """Process categories until the shared queue is drained."""
        from queue import Empty

        while True:
            # A separate empty() check followed by a blocking get() can hang
            # forever when another worker takes the last item in between.
            try:
                category = self.categoryQueue.get_nowait()
            except Empty:
                break
            categoryName = category['title']
            totalPage = self.getTotalPages(category['categoryId'])
            if not totalPage:
                continue
            for i in range(1, totalPage + 1):
                goodsList = self.getGoodsList(category['categoryId'], i)
                if not goodsList:
                    continue
                for goods in goodsList:
                    merchandiseId = goods['merchandiseid']
                    merchtypeid = goods['merchtypeid']
                    data = self.getGoodsDetail(merchandiseId, merchtypeid, categoryName)
                    if not data:
                        continue
                    if self.checkGoodsExists(data['goods_id']):
                        self.update(data)
                    else:
                        # NOTE(review): add() is not defined in the visible source.
                        self.add(data)
def getCategoryQueue(partnerId, grouponId):
    """Fetch the category list and return it as a queue of category dicts.

    Categories whose title is listed in ``unexcepts`` are left out. When the
    reply cannot be read, a "login expired" message is printed and the
    program exits after a 10 second pause.
    """
    pending = Queue(0)
    payload = {
        "partnerId": str(partnerId),
        "grouponId": str(grouponId),
        "isPartner": 0
    }
    resp = postHtml("https://api.*****.net/mc/groupClassify/v3/categoryList", payload)
    try:
        for entry in resp['data']:
            if entry['title'] in unexcepts:
                continue
            pending.put(entry)
    except Exception:
        print("登錄過期~")
        time.sleep(10)
        exit(0)
    return pending
def getKeysList():
    """Return the configured ``keywords`` as a new list (empty when none are set)."""
    if not keywords:
        return []
    return [entry for entry in keywords]
def getNearTeam(lat, lng):
    """Look up the partner nearest to the given coordinates.

    Returns a ``(partnerId, grouponId)`` tuple of strings taken from the first
    entry of the reply's list, or None when the request failed or the reply
    has no usable entry.
    """
    coordinates = {"lat": lat, "lng": lng}
    res = postHtml("https://api.*****.net/partner/near", coordinates)
    try:
        nearest = res['data']['list'][0]
        return str(nearest['partnerId']), str(nearest['grouponId'])
    except Exception:
        return None
def parser():
    """Crawl every configured location, one after another.

    Each keyword must have the form ``region-latitude-longitude``. For every
    keyword the nearest partner is looked up, its categories are queued, and
    up to ``threadNums`` worker threads drain the queue before the next
    keyword is started. A failure for one keyword is printed and does not
    stop the remaining keywords.
    """
    for key in getKeysList():
        try:
            city, lat, lng = key.split("-")
        except ValueError:
            # Wrong number of "-"-separated parts.
            print("關鍵詞:%s 格式錯誤,正確格式爲:地區-緯度-經度" % str(key))
            continue
        try:
            nearest = getNearTeam(lat, lng)
            if not nearest:
                # Previously this surfaced as a misleading "format error".
                print("關鍵詞:%s 未獲取到附近的團點" % str(key))
                continue
            partnerId, grouponId = nearest
            categoryQueue = getCategoryQueue(partnerId, grouponId)
            # Use a local worker count: overwriting the global threadNums capped
            # every later location at the smallest queue size seen so far.
            workerCount = min(threadNums, categoryQueue.qsize())
            workers = []
            for i in range(workerCount):
                worker = shtSpider(categoryQueue, i, city, partnerId, grouponId)
                workers.append(worker)
                worker.start()
            for worker in workers:
                worker.join()
        except Exception as e:
            print("關鍵詞:%s 處理失敗:%s" % (str(key), e))
def getCurrTime():
    """Return the current local time of day formatted as ``HH:MM``."""
    now = time.localtime()
    return "%02d:%02d" % (now.tm_hour, now.tm_min)
def main():
    """Announce the start of the crawl and run it once over all keywords."""
    print("啓動時任務爬蟲!")
    parser()


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()