# Import requests (HTTP client) and the re regular-expression library
import requests
import re
# Fetch a web page and return its text content.
def getHTMLText(url, loginheaders):
    """Fetch *url* with the given request headers and return the page text.

    Parameters:
        url: the full URL to request.
        loginheaders: dict of request headers (user-agent, cookie, ...).

    Returns:
        The decoded response body, or "" on any network/HTTP error so the
        caller can simply skip this page and continue.
    """
    try:
        r = requests.get(url, headers=loginheaders, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        # Use the content-sniffed encoding so CJK text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Catch only requests errors (connection, timeout, bad status);
        # a bare except here would also hide genuine bugs.
        return ""
# Parse one search-result page and append [price, title] records to ilt.
def parsePage(ilt, html):
    """Extract (price, title) pairs from a Taobao search-result page.

    Parameters:
        ilt: output list; each match is appended as [price, title]
             (both strings), mutated in place.
        html: raw HTML/JSON text of the search-result page.
    """
    try:
        # Prices appear as "view_price":"<digits and dots>".
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        # Titles appear as "raw_title":"<shortest run of any chars>".
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        # zip() stops at the shorter list, so mismatched match counts
        # cannot raise IndexError as indexed access did.
        for p, t in zip(plt, tlt):
            # Take the value after the first ':' and strip the surrounding
            # quotes. The original used eval(), which executes arbitrary
            # code from scraped (untrusted) content — a security hazard.
            price = p.split(':')[1].strip('"')
            title = t.split(':')[1].strip('"')
            ilt.append([price, title])
    except (AttributeError, TypeError):
        # html was not a string (e.g. None from a failed fetch); keep the
        # original best-effort behavior of printing a blank line.
        print("")
# Print the scraped goods as a numbered table.
def printGoodsList(ilt):
    """Print *ilt* as a table: row number, price, product title.

    Parameters:
        ilt: list of [price, title] pairs as produced by parsePage().
    """
    # Column layout: 4-wide index, 8-wide price, 16-wide title.
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序號", "價格", "商品名稱"))
    # enumerate replaces the range(len(...)) index loop; start=1 keeps
    # the original 1-based row numbering.
    for row, (price, title) in enumerate(ilt, start=1):
        print(tplt.format(row, price, title))
# Entry point: crawl several pages of Taobao search results and print them.
def main():
    """Crawl `depth` pages of Taobao results for `goods` and print a table."""
    goods = '書包'
    # depth is the crawl depth, i.e. how many result pages to fetch.
    depth = 3
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    # The headers never change between pages, so build them once outside
    # the loop instead of rebuilding the dict on every iteration.
    # NOTE(review): the cookie was captured from a logged-in browser
    # session and will expire — replace it with a fresh one before running.
    loginheaders = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        "cookie": "t=4ddf8b70981503ff445f594236c71e96; thw=cn; cookie2=12bdc5dde93e3514edca199a193f232f; _tb_token_=e45e8be4e50be; _samesite_flag_=true; enc=201s0rRJEHeguaLLCC6IAbLJao3k%2FWpbaR4FH6jpx1T6haa1auRivMShxlx1S0Ul3c3meKsTzPUcwTv3aEzt1Q%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; alitrackid=www.taobao.com; lastalitrackid=login.taobao.com; mt=ci=0_0; JSESSIONID=C7EA2D8019D08587402413BDBF38AFF0; cna=k8/5Fvt7vU8CAbfIPj2evd1M; l=eBxklgrHqCn48L6LBOfZourza77TlIRfguPzaNbMiT5P_2fH75cAWZjFnt8MCnGVnsZw-354uljQBrT8xyUBh6Yl3ZQ7XPQo3dTh.; isg=BFFRjiZN687i_QTKjM76nskzYF3rvsUwkMIhzjPmIpg42nAseQpbAHY4fK48Ul1o; tfstk=cwyCBJqLmeYQTTeeQzsZTKVPlrkPZj_sj6gLA5hd3EP90jECibvqlvj4ZFvtMc1..; sgcookie=EjxPeg5aM1t9jg2xwmUNw; unb=2639049752; uc1=cookie14=UoTUMtQjutczRQ%3D%3D&cookie21=VFC%2FuZ9ajC0X15Rzt0LhxQ%3D%3D&pas=0&existShop=false&cookie15=WqG3DMC9VAQiUQ%3D%3D&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D; uc3=nk2=2nZbzUmLMMCi2g%3D%3D&vt3=F8dBxGXMemSB7fdhNUQ%3D&lg2=V32FPkk%2Fw0dUvg%3D%3D&id2=UU6idYdXStUjdg%3D%3D; csg=8948e7ae; lgc=%5Cu90ED%5Cu5DDD%5Cu5DDD1998; cookie17=UU6idYdXStUjdg%3D%3D; dnk=%5Cu90ED%5Cu5DDD%5Cu5DDD1998; skt=b778092dd81708b9; existShop=MTU4ODQ3NzIxNQ%3D%3D; uc4=id4=0%40U2xvIZeyY044%2Fg4ssnvrOTWRM69N&nk4=0%402EwyHO%2FQs1K5Yt3PADSqX0DTequA; tracknick=%5Cu90ED%5Cu5DDD%5Cu5DDD1998; _cc_=V32FPkk%2Fhw%3D%3D; _l_g_=Ug%3D%3D; sg=82b; _nk_=%5Cu90ED%5Cu5DDD%5Cu5DDD1998; cookie1=BvGDAyiO3yivOSSiEiVryF%2FdX85RndH78rFBY0y3kOc%3D"
    }
    for i in range(depth):
        try:
            # Taobao paginates by item offset: 44 items per page.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url, loginheaders)
            parsePage(infoList, html)
        except Exception:
            # Best-effort crawl: skip a failed page and keep going, but do
            # not use a bare except that would also swallow KeyboardInterrupt.
            continue
    printGoodsList(infoList)

# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    main()