一.導入需要的包
import urllib.request
import re
import requests
import time
import json
import xlwt
from random import choice
import os
import socket
from lxml import etree
二.項目需求分析
1.打開天貓首頁,搜索手機,出現下面頁面
2.查看網頁源代碼,搜索蘋果,跳轉到如圖地方,
3.點擊前面的鏈接發現是每種品牌的手機頁面,這些鏈接也是我們要抓的第一類信息
4.在這個頁面,我們可以看見的信息有:商品LOGO、商品名、銷量、價格等,肯定還有跳轉到商詳頁的鏈接,這些都是我們接下來要抓取的東西
同理,打開源碼頁,這裏有個小技巧,可以搜索銷量,這樣定位更準確些,如上圖,在銷量信息周圍我們能找到其他需要的信息,得到商詳頁的鏈接後,我們可以進商詳頁看看有什麼可以爬取的了
5.個人覺得商詳頁最有價值的東西就是評論信息及評論圖片了,所以,F12打開調試器,點擊network,找到存放評論信息的鏈接,如下:
https://rate.tmall.com/list_detail_rate.htm?itemId=538869984042&spuId=382573494&sellerId=2616970884&order=3&
currentPage=1&append=0
&content=1&tagId=&posi=&picture=1&groupId=&ua=098%23E1hvwQvpvLhvUvCkvvvvvjiPR2z90jtURF
qZAjEUPmPO6jDvR2cZ6j1PPLSyAjnhRphvCvvvphmCvpvuARNjNjx4zYMNQ9FwibDug5%2B2195RQDIFG5K88d
%2Fb9cD3e48myECi97DtdphvmpmvGSX2vvmWbIwCvvpv9hCviQhvCvvvpZpCvpvVvUCvpvvvmphvLU2LJ5Ia%2
Bb8reEQaUExreCkKHkx%2F1WmK53hz8Z4ikC4AdX3l8PoxdX9OdegaQfV6R3pBOyKQD40OV8tYVVzheugcRoxL
Dwet%2B3oZfveEvpvVvpCmpYFyuphvmvvvpoNq4cp3Kphv8vvvphvvvvvvvvC2DpvvvJyvvhXVvvmCWvvvByOv
vUhwvvCVB9vv9BQPvpvhvv2MMsyCvvpvvhCv9phv2nM5WDQi7rMNzT2Qz2yCvvpvvhCvdphvmpmC6rN0vvvPR8
6Cvvyv98o3L9vvbbG%3D&itemPropertyId=&itemPropertyIndex=&userPropertyId=&
userPropertyIndex=&rateQuery=&location=&needFold=0&_ksTS=1541865686575_944
&callback=jsonp945
確實,很長的一段,但分析下,其實不用全部的,只需要有幾個關鍵的信息點就行了,筆者親測,可以縮減至如下
https://rate.tmall.com/list_detail_rate.htm?itemId=538869984042&sellerId=2616970884&order=3&currentPage=1&append=0&content=1&picture=1
其中我們需要修改的地方有:itemId、sellerId、currentPage三處,分別爲商品ID,商家ID和當前頁號
6.我們先打開上面的鏈接,看看他其實長這樣子
沒錯,一個JSON格式的文件,至於爲什麼我的頁面這麼好看,因爲我用了谷歌的一個插件,叫json-handle
好了,一目瞭然了,評論內容、評論時間、追評內容、追評時間、評論圖片都在這裏了,接下來就是代碼實現了
三.代碼實現
1.隨機獲取請求頭信息
def get_user_hearder():
    """Return a random ``('User-Agent', value)`` header pair.

    The result is meant to be placed in ``opener.addheaders``, which
    urllib expects to be a list of (name, value) pairs.

    Returns:
        tuple[str, str]: ``('User-Agent', <randomly chosen agent string>)``.
    """
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
        'Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ',
        'Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ',
        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7',
        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
    ]
    # BUG FIX: the original built the list ['User-Agent', ua]; addheaders
    # entries are conventionally (name, value) tuples — return one directly.
    return ('User-Agent', choice(user_agents))
2.使用代理獲取網頁源碼
def get_html_proxy_utf(url, lines):
    """Fetch *url* through a randomly chosen HTTP proxy from *lines*
    and return the response body decoded as UTF-8.

    Args:
        url: the URL to download.
        lines: pool of proxy addresses; one is picked at random per call.

    Returns:
        str: the page source, with undecodable bytes silently dropped.
    """
    proxy_handler = urllib.request.ProxyHandler({'http': choice(lines)})
    opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)
    # Spoof a random browser User-Agent so the request looks less like a bot.
    opener.addheaders = [get_user_hearder()]
    response = opener.open(url)
    return response.read().decode('utf-8', 'ignore')
3.獲取json文件
def getdata(html):
    """Extract and parse the JSON object embedded in a JSONP response.

    The Tmall rate endpoint wraps its JSON payload in a callback, e.g.
    ``jsonp945({...})``; this strips everything outside the outermost
    braces and parses the remainder.

    Args:
        html: raw response text containing exactly one JSON object.

    Returns:
        dict: the parsed payload.

    Raises:
        AttributeError: if no ``{...}`` span is found in *html*.
        json.JSONDecodeError: if the extracted span is not valid JSON.
    """
    # BUG FIX: added re.S — review text may contain newlines, and '.'
    # does not cross line boundaries without it, so multi-line payloads
    # previously failed to match.
    jsondata = re.search(r'\{.*\}', html, re.S).group()
    data = json.loads(jsondata)
    return data
4.獲取需要的信息、下載圖片、寫入excel文件
def download(logo_name, item_name,itemId,sale_num,data,url,z,f,sheet1):
    """Persist one page of review data for a single product.

    Downloads every review photo (and follow-up-review photo) from page
    z+1 of the rate feed into E:\\TMALL\\<brand>\\<item>\\ and writes one
    excel row per review via write_to_excel.

    Parameters:
        logo_name: brand name; first directory level on disk.
        item_name: product title; '/' removed so it is a valid dir name.
        itemId: Tmall item id (passed through to the excel row).
        sale_num: sales count scraped from the listing page.
        data: parsed JSON of one list_detail_rate.htm page (see getdata).
        url: the rate URL, recorded in the excel row.
        z: zero-based page number; used in photo file names and row index.
        f, sheet1: xlwt workbook and worksheet to write into.

    NOTE(review): relies on auto_down() and write_to_excel(), which are
    defined elsewhere in the project and not visible in this file.
    """
    path = 'E:\TMALL\%s\%s\\' % (str(logo_name), str(item_name).replace('/', ''))
    if not os.path.exists(path):
        os.makedirs(path)
    for i in range(0, len(data['rateDetail']['rateList'])):
        # Review text
        content = data['rateDetail']['rateList'][i]['rateContent']
        # Review timestamp
        creationtime = data['rateDetail']['rateList'][i]['rateDate']
        if ('pics' in data['rateDetail']['rateList'][i].keys()):
            if (data['rateDetail']['rateList'][i]['pics'] != ''):
                for k in range(0, len(data['rateDetail']['rateList'][i]['pics'])):
                    pics = data['rateDetail']['rateList'][i]['pics'][k]
                    # Pic URLs are protocol-relative; prepend the scheme.
                    a = 'http:' + pics
                    # Buyer photo. File name: <page>_<review#>_pic_<photo index>.jpg
                    auto_down(a, path + str(z) + '_' +str(i + 1) + '_' + 'pic' + '_' + str(k) + '.jpg')
        if ( data['rateDetail']['rateList'][i]['appendComment'] == None):
            # No follow-up review: record empty strings for its fields.
            appendcontent = ''
            appendtime = ''
        else:
            # Text and timestamp of the follow-up (appended) review.
            appendcontent = data['rateDetail']['rateList'][i]['appendComment']['content']
            appendtime = data['rateDetail']['rateList'][i]['appendComment']['commentTime']
            if (data['rateDetail']['rateList'][i]['appendComment']['pics'] != ''):
                for l in range(0, len(data['rateDetail']['rateList'][i]['appendComment']['pics'])):
                    appendpics = data['rateDetail']['rateList'][i]['appendComment']['pics'][l]
                    b = 'http:' + appendpics
                    # Probe first; skip photos whose URL does not answer 200.
                    html2 = requests.get(b)
                    respon2 = html2.status_code
                    if (respon2 != 200):
                        continue
                    else:
                        # Follow-up photo. File name: <page>_<review#>_appendpic_<photo index>.jpg
                        auto_down(b, path + str(z) + '_' +str(i + 1) + '_' + 'appendpic' + '_' + str(l) + '.jpg')
        # One excel row per review; row index z*10+i assumes 10 reviews
        # per page — TODO confirm against the feed's page size.
        write_to_excel(logo_name, item_name,itemId,sale_num,content, creationtime, appendcontent, appendtime, url, z * 10 + i,f,sheet1)
        print(str(i) + "頁數據已經保存")
5.主函數
def main():
    """Crawl Tmall phone listings brand-by-brand and save reviews.

    For every brand page, every product, and every review page (1..99):
    fetch the JSONP rate feed through a random proxy, download review
    photos and write review rows into one .xls workbook per product.

    NOTE(review): relies on get_html_proxy_gbk(), defined elsewhere in
    the project (not visible in this file).
    """
    # Proxy pool: one "host:port" per line. Read as text and strip the
    # newlines so entries can go straight into urllib's ProxyHandler
    # (the original opened 'rb' and fed bytes with trailing b'\n').
    with open('valid_ip.txt', 'r') as ip_file:
        lines = [ln.strip() for ln in ip_file if ln.strip()]

    url1 = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000724.4.38322a68rQnDFe&' \
           'q=%CA%D6%BB%FA&sort=d&style=g&search_condition=2&sarea_code=310100&from=sn_1_brand-qp&shopType=any#J_Filter'
    try:
        pagecode = get_html_proxy_gbk(url1, lines)
    except Exception:
        # One retry — a fresh random proxy is picked on the second call.
        pagecode = get_html_proxy_gbk(url1, lines)
    html = etree.HTML(pagecode)
    logo_url = html.xpath('//a[@data-f="spu-brand-qp"]/@href')
    logo_name = html.xpath('//a[@data-f="spu-brand-qp"]/img/@alt')

    # NOTE(review): starts at brand index 7 — presumably resuming an
    # interrupted crawl; confirm before rerunning from scratch.
    for k in range(7, len(logo_url)):
        url = 'https://list.tmall.com/search_product.htm' + logo_url[k]
        try:
            pagecode = get_html_proxy_gbk(url, lines)
        except Exception:
            pagecode = get_html_proxy_gbk(url, lines)
        html = etree.HTML(pagecode)
        url_list = html.xpath('//div[@class="productTitle productTitle-spu"]/a[1]/@href')
        item_name = html.xpath('//div[@class="productTitle productTitle-spu"]/a[1]/text()')
        sale_num = html.xpath('//p[@class="productStatus"]/span/em/text()')

        for i in range(0, len(url_list)):
            # The product link carries both ids: id=<itemId>&...id=<sellerId>&
            itemId = re.findall(r'id=(.*?)&', url_list[i])[0]
            sellerId = re.findall(r'id=(.*?)&', url_list[i])[1]
            xls_name = '%s.xls' % str(item_name[i]).replace('/', '')
            if os.path.exists(xls_name):
                continue  # already crawled this product
            # BUG FIX: the workbook used to be bound to `f`, shadowing the
            # proxy-file handle, and was "closed" with f.close() — a method
            # xlwt.Workbook does not have; save() was never called, so no
            # .xls file was ever written (which also defeated the
            # os.path.exists skip above). Use a distinct name and save it.
            workbook = xlwt.Workbook()
            sheet1 = workbook.add_sheet('%s' % (str(itemId)), cell_overwrite_ok=True)
            row0 = ["品牌", "商品名稱", "商品編號", "銷量", "評論文本", '評論時間', '追評文本', '追評時間', 'URL']
            # Header row.
            for q in range(0, len(row0)):
                sheet1.write(0, q, row0[q])
            print('開始爬取第' + str(i) + '個商品_' + str(item_name[i]))
            for j in range(1, 100):
                # BUG FIX: '&currentPage' was mojibake'd to '¤tPage'
                # (the '&curren' fragment rendered as the ¤ HTML entity),
                # so the page parameter was never actually sent.
                url = ('https://rate.tmall.com/list_detail_rate.htm?itemId=' + str(itemId)
                       + '&sellerId=' + str(sellerId)
                       + '&order=3&currentPage=' + str(j)
                       + '&append=0&content=1&picture=1')
                # Up to 3 attempts; each retry picks a new random proxy.
                for attempt in range(3):
                    try:
                        data = getdata(get_html_proxy_utf(url, lines))
                        break
                    except Exception:
                        if attempt == 2:
                            raise
                download(logo_name[k], item_name[i], itemId, sale_num[k], data, url, j - 1, workbook, sheet1)
                print('第' + str(j) + '頁爬取完成')
                time.sleep(1)  # be polite: throttle one request per second
            workbook.save(xls_name)
OVER
全部的代碼都在上面了,希望對大家的學習有幫助!