Python爬蟲系列之爬取某奢侈品小程序店鋪商品數據
小程序爬蟲接單、app爬蟲接單、網頁爬蟲接單、接口定製、網站開發、小程序開發> 點擊這裏聯繫我們 <
微信請掃描下方二維碼
代碼僅供學習交流,請勿用於非法用途
一、準備數據庫
-- Schema used by the crawler below.
-- Both statements are idempotent so the script can be re-run safely.
-- utf8mb4 is used instead of utf8 (an alias of the 3-byte utf8mb3) so that
-- 4-byte characters in scraped text do not make an insert fail.
create database if not exists zr default character set utf8mb4;
use zr;
-- Product table: one row per product, keyed by the shop's product id (pid).
-- Every scraped value is stored as text because the crawler writes str(value)
-- for each field; columns color_forming .. fabric hold product attributes.
create table if not exists zr_goodslist(
    id int primary key auto_increment comment 'id',
    pid varchar(30) unique comment 'pid',
    sku varchar(30) default null comment 'sku',
    name varchar(50) default null comment 'name',
    sellingPoint varchar(200) default null comment 'sellingPoint',
    descption text default null comment 'desc',
    mainimg text default null comment 'mainimg',
    imageList text default null comment 'imageList',
    video text default null comment 'video',
    brand varchar(30) default null comment 'brand',
    status varchar(8) default null comment 'status',
    stock varchar(10) default null comment 'stock',
    source varchar(10) default null comment 'source',
    refDetail text default null comment 'refDetail',
    convert_size varchar(100) default null comment 'convert_size',
    marketPrice varchar(15) default null comment 'marketPrice',
    salePrice varchar(15) default null comment 'salePrice',
    price varchar(15) default null comment 'price',
    discount varchar(15) default null comment 'discount',
    marketingDesc varchar(300) default null comment 'marketingDesc',
    grade varchar(10) default null comment 'grade',
    brandType varchar(15) default null comment 'brandType',
    categoryOne varchar(20) default null comment 'categoryOne',
    categoryTwo varchar(20) default null comment 'categoryTwo',
    categoryThree varchar(20) default null comment 'categoryThree',
    viewNumStatus varchar(10) default null comment 'viewNumStatus',
    openBargain varchar(30) default null comment 'openBargain',
    directDesc text default null comment 'directDesc',
    degree text default null comment 'degree',
    degreeDesc text default null comment 'degreeDesc',
    degreeExt text default null comment 'degreeExt',
    coefficient text default null comment 'coefficient',
    firstPutOn varchar(50) default null comment 'firstPutOn',
    proc_view_num varchar(15) default null comment 'proc_view_num',
    correctNum varchar(15) default null comment 'correctNum',
    bargainBasePrice varchar(15) default null comment 'bargainBasePrice',
    onSale varchar(10) default null comment 'onSale',
    onSaleCountDown varchar(15) default null comment 'onSaleCountDown',
    bargainLock varchar(50) default null comment 'bargainLock',
    bargainDownTime varchar(35) default null comment 'bargainDownTime',
    isBargain varchar(10) default null comment 'isBargain',
    bargainPrice varchar(15) default null comment 'bargainPrice',
    bargainNum varchar(15) default null comment 'bargainNum',
    color_forming varchar(30) default null comment 'color_forming',
    tile_size varchar(30) default null comment 'tile_size',
    overall_weight varchar(30) default null comment 'overall_weight',
    size_prompt varchar(30) default null comment 'size_prompt',
    defect text default null comment 'defect',
    style text default null comment 'style',
    accessories text default null comment 'accessories',
    material text default null comment 'material',
    lengths text default null comment 'lengths',
    main_material text default null comment 'main_material',
    sizes text default null comment 'sizes',
    fabric text default null comment 'fabric'
) engine=InnoDB default charset=utf8mb4;
二、代碼實現
# -*- coding:utf-8 -*-
import requests
from queue import Queue
import threading
import json
import MySQLdb
import configparser
# Module-level counter; nothing in the visible code updates it.
totals = 0
cf = configparser.ConfigParser()
try:
    # ConfigParser.read() does not raise for a missing file: it skips it and
    # returns the list of files it actually parsed. An empty result therefore
    # means config.ini was not found in the working directory.
    if not cf.read("config.ini"):
        print("程序目錄下不存在config.ini配置文件~")
        raise SystemExit(1)
except Exception as e:
    # The file exists but could not be decoded or parsed. SystemExit is not an
    # Exception subclass, so the exit above is not intercepted here.
    print(e)
    raise SystemExit(1)
def getConf(sec, key):
    """Return the string value of option ``key`` in section ``sec`` of config.ini.

    The value is looked up in the module-level parser ``cf``. When the section
    or option is missing (or its value cannot be interpolated) the missing
    setting is reported and the program terminates with a non-zero exit
    status, so that a misconfigured run is not mistaken for a successful one.

    :param sec: section name, e.g. ``"Mysql-Database"``
    :param key: option name inside that section
    :return: the raw option value as a string
    :raises SystemExit: with code 1 when the option cannot be read
    """
    try:
        return cf.get(sec, key)
    except configparser.Error:
        print("未得到以下配置:" + sec + " - " + key)
        raise SystemExit(1)
# ------------------------- runtime settings -------------------------
# Number of crawler threads to start, read from the [app-sys] section.
threadNums = int(getConf("app-sys", "threadNums"))
# Attempts per HTTP request, and the timeout value handed to requests.
retry, timeout = 3, 20
# MySQL account, password, database name and target table name, all read
# from the [Mysql-Database] section (looked up in exactly this order).
mysql_user, mysql_password, mysql_database, mysql_table = (
    getConf("Mysql-Database", option)
    for option in ("user", "password", "database", "table")
)
# Headers sent with every request; the User-Agent string is split across
# lines only for readability and concatenates to a single value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 5.1.1; DUK-AL20 Build/LMY48Z; wv) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/52.0.2743.100 Safari/537.36 "
        "MicroMessenger/7.0.10.1580(0x27000A59) Process/appbrand3 "
        "NetType/WIFI Language/zh_CN ABI/arm32"
    ),
    "content-type": "application/json;charset=utf-8",
}
# Prefix prepended to every image path found in a product detail.
host = "https://img.*******.com/"
# Scratch list kept for attribute-name exploration; not used by the visible code.
attrsList = []
class zrSpider(threading.Thread):
    """Crawler worker thread.

    Each worker pulls brand records from a shared queue, pages through the
    product list of that brand, downloads the detail of every product and
    writes it to MySQL: an update when a row with the same ``pid`` already
    exists, an insert otherwise.

    A product row is a list with one value per entry of ``_COLUMNS``, in that
    order. ``getGoodsDetail`` produces such rows; ``pipLine`` and ``update``
    consume them.
    """

    # Endpoint and page size shared by getGoodsList and getTotalPage.
    _LIST_URL = "https://search.*******.com/V4.7.0/product/list"
    PAGE_SIZE = 20

    # Keys of the ``detail`` object that precede the image fields.
    _HEAD_KEYS = ('id', 'sku', 'name', 'sellingPoint', 'desc')
    # Keys of the ``detail`` object that follow the image fields; each one is
    # also the name of the table column it is stored in.
    _TAIL_KEYS = (
        'video', 'brand', 'status', 'stock', 'source', 'refDetail',
        'convert_size', 'marketPrice', 'salePrice', 'price', 'discount',
        'marketingDesc', 'grade', 'brandType', 'categoryOne', 'categoryTwo',
        'categoryThree', 'viewNumStatus', 'openBargain', 'directDesc',
        'degree', 'degreeDesc', 'degreeExt', 'coefficient', 'firstPutOn',
        'proc_view_num', 'correctNum', 'bargainBasePrice', 'onSale',
        'onSaleCountDown', 'bargainLock', 'bargainDownTime', 'isBargain',
        'bargainPrice', 'bargainNum',
    )
    # Columns meant to hold product attributes.
    # NOTE(review): the code that maps the response's ``productAttr`` entries
    # onto these columns is not present in this file, so they are currently
    # written as empty strings. Fill them in getGoodsDetail once the attribute
    # names are known.
    _ATTR_COLUMNS = (
        'color_forming', 'tile_size', 'overall_weight', 'size_prompt',
        'defect', 'style', 'accessories', 'material', 'lengths',
        'main_material', 'sizes', 'fabric',
    )
    # Table columns in row order. Both SQL statements are generated from this
    # single list so that columns, placeholders and values cannot drift apart.
    _COLUMNS = (
        ('pid', 'sku', 'name', 'sellingPoint', 'descption', 'mainimg', 'imageList')
        + _TAIL_KEYS
        + _ATTR_COLUMNS
    )

    def __init__(self, brandQueue, index, *args, **kwargs):
        """
        :param brandQueue: queue of brand records (dicts carrying an ``id``)
            shared by all workers
        :param index: number of this worker, used only in the start-up message
        """
        super(zrSpider, self).__init__(*args, **kwargs)
        self.brandQueue = brandQueue
        self.index = index

    def _listPayload(self, brandId, page):
        """Build the request body of the product-list endpoint."""
        return {
            "page": page,
            "pageSize": self.PAGE_SIZE,
            "sort": "",
            "ppath": "4:" + str(brandId),
            "newShare": 0,
            "selfbiz": 1,
            "version": "5.3.0",
            "debug": "false",
            "mt": "WX-micro",
            "inWechat": 1,
            "from": "micro",
            "deviceId": "deviceId"
        }

    def getGoodsList(self, brandId, page):
        """Return the product list of one result page of a brand.

        :return: the ``data.list`` value of the response, or ``None`` when the
            request failed or the response lacks that field
        """
        resp = postHtml(self._LIST_URL, self._listPayload(brandId, page))
        try:
            return resp['data']['list']
        except (KeyError, TypeError):
            return None

    def getGoodsDetail(self, id):
        """Download one product and flatten it into a row.

        :param id: product id as found in the product list
        :return: a list with ``len(_COLUMNS)`` values in column order, or
            ``None`` when the request failed, the response code is not
            ``100000``, or ``detail`` / ``productAttr`` are missing. Fields
            absent from ``detail`` are returned as empty strings.
        """
        url = "https://api.*******.com/V5.3.0/product/newDetail"
        data = {
            "id": str(id),
            "version": "5.3.0",
            "debug": "false",
            "mt": "WX-micro",
            "inWechat": 1,
            "from": "micro",
            "deviceId": "deviceId"
        }
        resp = postHtml(url, data)
        if not resp:
            return None
        try:
            if str(resp['code']) != "100000":
                return None
            detail = resp['data']['detail']
            # Only its presence is required here; see the note on _ATTR_COLUMNS.
            productAttr = resp['data']['productAttr']
        except (KeyError, TypeError):
            return None
        if not isinstance(detail, dict):
            return None

        goods = [detail.get(key, "") for key in self._HEAD_KEYS]

        # Image paths are relative; prefix each one with the image host.
        try:
            imageUrls = [host + image for image in detail.get('imageList')]
        except TypeError:
            # imageList is missing, null, or holds non-string entries.
            imageUrls = None
        # Main image = first image of the list, empty when there is none.
        goods.append(imageUrls[0] if imageUrls else "")
        # The full list is stored as a JSON array; json.dumps escapes quotes
        # inside a URL correctly, unlike a textual quote replacement.
        goods.append("" if imageUrls is None else json.dumps(imageUrls, ensure_ascii=False))

        goods.extend(detail.get(key, "") for key in self._TAIL_KEYS)
        # Placeholders keep the row as wide as the statements expect.
        goods.extend("" for _ in self._ATTR_COLUMNS)
        return goods

    def _connect(self):
        """Open a new MySQL connection using the module-level settings."""
        return MySQLdb.connect(user=mysql_user, password=mysql_password,
                               database=mysql_database, charset='utf8')

    def _rowParams(self, data):
        """Return the row as a list of strings, or ``None`` if it is not a full row."""
        try:
            if len(data) != len(self._COLUMNS):
                return None
        except TypeError:
            return None
        # Values are stringified exactly as the original statements did.
        return [str(value) for value in data]

    def _execute(self, sql, params):
        """Run one write statement in its own connection and commit it.

        Errors are reported and swallowed so that one bad row does not stop
        the crawl; the connection is always closed.
        """
        conn = None
        try:
            conn = self._connect()
            cursor = conn.cursor()
            # Values are bound by the driver, never interpolated into the SQL
            # text, so quotes in scraped text cannot break the statement.
            cursor.execute(sql, params)
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            if conn is not None:
                conn.close()

    def pipLine(self, data):
        """Insert one product row (as returned by getGoodsDetail)."""
        print("------------------------- insert ------------------------- ")
        print(data)
        print("---------------------------------------------------------- ")
        params = self._rowParams(data)
        if params is None:
            print("數據字段數量不符,跳過寫入~")
            return
        # The table name comes from the local config file and cannot be bound
        # as a parameter, so it is concatenated.
        sql = ("insert into " + mysql_table
               + " (" + ", ".join(self._COLUMNS) + ")"
               + " values (" + ", ".join(["%s"] * len(self._COLUMNS)) + ")")
        self._execute(sql, params)

    def getTotalPage(self, brandId):
        """Return the number of list pages of a brand.

        The count reported by the first page is divided by the page size,
        rounding up. Returns 1 when the count cannot be obtained.
        """
        resp = postHtml(self._LIST_URL, self._listPayload(brandId, 1))
        try:
            count = int(resp['data']['count'])
        except (KeyError, TypeError, ValueError):
            return 1
        return -(-count // self.PAGE_SIZE)

    def checkGoodsExists(self, pid):
        """Return True when a row with this ``pid`` is already stored.

        Returns False when the lookup itself fails (the error is printed).
        """
        conn = None
        try:
            conn = self._connect()
            cursor = conn.cursor()
            cursor.execute("select 1 from " + mysql_table + " where pid = %s limit 1",
                           (str(pid),))
            return cursor.fetchone() is not None
        except Exception as e:
            print(e)
            return False
        finally:
            if conn is not None:
                conn.close()

    def update(self, data):
        """Overwrite the stored row whose ``pid`` equals ``data[0]``."""
        print("------------------------- update ------------------------- ")
        print(data)
        print("---------------------------------------------------------- ")
        params = self._rowParams(data)
        if params is None:
            print("數據字段數量不符,跳過寫入~")
            return
        assignments = ", ".join(column + " = %s" for column in self._COLUMNS[1:])
        sql = "update " + mysql_table + " set " + assignments + " where pid = %s"
        # Every column except pid is assigned; pid moves last for the WHERE clause.
        self._execute(sql, params[1:] + params[:1])

    def run(self):
        """Process brands from the queue until it is empty."""
        # Local import: the file only imports Queue from the queue module.
        from queue import Empty
        print("線程:%d 啓動~" % self.index)
        while True:
            # get_nowait() instead of empty()+get(): another worker may take
            # the last item between the two calls, and get() would then block
            # this thread forever.
            try:
                brand = self.brandQueue.get_nowait()
            except Empty:
                break
            try:
                brand_id = str(brand['id'])
            except (KeyError, TypeError):
                continue
            totalPage = self.getTotalPage(brand_id)
            for page in range(1, totalPage + 1):
                for goods in self.getGoodsList(brand_id, page) or []:
                    try:
                        goodsId = goods['id']
                    except (KeyError, TypeError):
                        continue
                    datas = self.getGoodsDetail(goodsId)
                    if datas is None:
                        # Detail unavailable: nothing to write for this product.
                        continue
                    if self.checkGoodsExists(goodsId):
                        self.update(datas)
                    else:
                        self.pipLine(datas)
def postHtml(url, data):
    """POST ``data`` as a JSON body and return the decoded JSON response.

    The request is attempted up to ``retry`` times with the module-level
    ``headers`` and ``timeout``.

    :param url: endpoint to call
    :param data: JSON-serialisable request payload
    :return: the parsed response, or ``None`` when every attempt failed
    """
    # Serialise once. Only ``data=`` is passed: requests ignores ``json=``
    # whenever ``data`` is supplied, so sending both was redundant.
    body = json.dumps(data)
    for _ in range(retry):
        try:
            resp = requests.post(url, data=body, headers=headers, timeout=timeout)
            return json.loads(resp.content.decode("utf-8"))
        except (requests.RequestException, ValueError):
            # Network failure, or a body that is not valid UTF-8 / JSON
            # (both decode errors derive from ValueError): try again.
            continue
    return None
def getHtml(url):
    """GET ``url`` and return the decoded JSON response.

    Uses the module-level ``headers`` and ``timeout`` and makes up to
    ``retry`` attempts; any failure of an attempt simply leads to the next
    one. Returns ``None`` when no attempt succeeded.
    """
    attempts_left = retry
    while attempts_left > 0:
        attempts_left -= 1
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            text = response.content.decode("utf-8")
            return json.loads(text)
        except Exception:
            continue
    return None
def getBrandQueue():
    """Fetch the current brand list and return it as a queue.

    :return: an unbounded ``Queue`` holding one brand record per entry of the
        response's ``data.list``. The queue is empty (never ``None``) when the
        request failed or the response has no usable list, so callers can
        always hand the result to the worker threads.
    """
    brandQueue = Queue(0)
    url = "https://api.*******.com/V5.3.0/site/currentBrand"
    data = {
        "version": "5.3.0",
        "debug": "false",
        "mt": "WX-micro",
        "inWechat": 1,
        "from": "micro",
        "deviceId": "deviceId"
    }
    resp = postHtml(url, data)
    try:
        # ``or []`` covers a response whose list field is null.
        brandList = resp['data']['list'] or []
    except (KeyError, TypeError):
        # No response at all, or one without data/list.
        return brandQueue
    for brand in brandList:
        brandQueue.put(brand)
    return brandQueue
def main():
    """Entry point: load the brand queue and run ``threadNums`` crawler threads.

    Returns without starting any thread when no brand could be loaded;
    otherwise returns once every worker has finished.
    """
    print("初始化爬蟲~")
    brandQueue = getBrandQueue()
    # Without brands the workers would have nothing to do (and a None queue
    # would crash each of them on first use).
    if brandQueue is None or brandQueue.empty():
        print("類目獲取失敗~")
        return
    print("類目獲取完畢~")
    workers = []
    for i in range(threadNums):
        z = zrSpider(brandQueue, i)
        z.start()
        workers.append(z)
    # Wait for the crawl to complete before returning to the caller.
    for z in workers:
        z.join()


if __name__ == '__main__':
    main()