1 分析
首先對網頁 url 進行分析。將第二頁、第三頁鏈接末尾的 s=44、s=88
改爲 s=0
時,剛好獲取到的是第一頁的數據,由此可以總結出:第 i 頁商品對應鏈接末尾的參數爲 s=44*i,
其中 i 爲頁數索引,取值爲 [0,1,2,3…]
#第一頁鏈接
#https://s.taobao.com/search?q=%E8%B6%85%E7%9F%AD%E8%A3%99&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180202&ie=utf8
#第二頁鏈接
#https://s.taobao.com/search?q=%E8%B6%85%E7%9F%AD%E8%A3%99&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180202&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2C48&s=44
#第三頁鏈接
#https://s.taobao.com/search?q=%E8%B6%85%E7%9F%AD%E8%A3%99&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180202&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2C48&s=88
其次對圖片地址進行分析,使用chrome檢查,我們得到元素位置如下,將其copy出來,加上http:瀏覽器打開即是我們想要的圖片數據,但是圖片並不是大圖,我們注意到_360x360Q90.jpg_.webp
,可能爲將圖片壓縮,我們將其刪除,同時加上http:,打開發現是大圖。
#圖片地址
#//g-search1.alicdn.com/img/bao/uploaded/i4/i1/2110184062/TB2ETSrXvnW1eJjSZFqXXa8sVXa_!!2110184062.jpg_360x360Q90.jpg_.webp
#//g-search3.alicdn.com/img/bao/uploaded/i4/i3/88504238/TB22FQtaPgy_uJjSZKPXXaGlFXa_!!88504238.jpg_360x360Q90.jpg_.webp
即我們想要的圖片地址爲:
#http://g-search1.alicdn.com/img/bao/uploaded/i4/i1/2110184062/TB2ETSrXvnW1eJjSZFqXXa8sVXa_!!2110184062.jpg
那麼我們的解題思路已經出現:
獲取每頁的鏈接—對每頁的源碼正則提取圖片地址—將圖片地址加上http:—最後下載這個鏈接保存爲jpg格式。
2 代碼
在獲取某一個產品的圖時,會出現報錯 <urlopen error [Errno 61] Connection refused>
,沒找到原因,但不影響程序的整體效果,可能是此產品本身的問題,使用 try/except 使程序繼續運行即可。其中代理服務器爲可選。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# auth: M10
"""Download Taobao search-result thumbnails for a fixed keyword.

Walks the paginated search results (page i uses query parameter
s=44*i), regex-extracts every "pic_url" entry from the page payload,
prepends the "http:" scheme, and saves each image under
/Users/wangxingfan/Desktop/data1/ as <page><index>.jpg.
"""
import re
import urllib.request
import urllib.error
import time

keyword = "超短裙"
# Percent-encode the keyword so it can be embedded in the search URL.
real_word = urllib.request.quote(keyword)


def get_pics():
    """Fetch up to 10 result pages and download every product thumbnail.

    Network errors on a page are logged and that page is skipped;
    failures on individual images are silently ignored (best effort).
    """
    # Loop-invariant: same headers for every request, so build them once.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'referer': 'https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.2017.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=%E8%B6%85%E7%9F%AD%E8%A3%99&suggest=history_1&_input_charset=utf-8&wq=chaoduanq&suggest_query=chaoduanq&source=suggest'
    }
    # Compile once, outside the loop; the dot before "jpg" must be
    # escaped or it would match any character (original bug).
    pat = re.compile(r'"pic_url":"(.*?)\.jpg"')
    for i in range(0, 10):
        # Each page differs only in the trailing s=44*i offset.
        url = ('http://s.taobao.com/search?q=' + real_word +
               '&imgfile=&js=1&stats_click=search_radio_all%3A1'
               '&initiative_id=staobaoz_20180202&ie=utf8&bcoffset=4'
               '&ntoffset=4&p4ppushleft=1%2C48&s=' + str(i * 44))
        time.sleep(2)  # throttle to avoid hammering the server
        # Optional proxy support:
        # proxy = urllib.request.ProxyHandler({'http': '60.23.46.24:80'})
        # opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # urllib.request.install_opener(opener)
        request = urllib.request.Request(url, headers=header)
        try:
            data = urllib.request.urlopen(request, timeout=5).read().decode('utf-8', 'ignore')
        except urllib.error.URLError as e:
            # Only HTTPError carries .code; a plain URLError does not
            # (the original unconditionally printed e.code and crashed).
            print(e.reason)
            if hasattr(e, 'code'):
                print(e.code)
            # Bug fix: without this, 'data' is undefined below.
            continue
        re_link = pat.findall(data)
        for j, piece in enumerate(re_link):
            time.sleep(2)
            # Pages serve scheme-relative URLs; restore the scheme and
            # the ".jpg" suffix the regex capture stripped.
            link = 'http:' + piece + '.jpg'
            path = '/Users/wangxingfan/Desktop/data1/' + str(i) + str(j) + '.jpg'
            try:
                urllib.request.urlretrieve(link, path)
            except OSError:
                # Some products refuse the connection ([Errno 61]);
                # skip the image and keep going.
                pass


if __name__ == "__main__":
    get_pics()
3 運行結果
4 另一個例子
爬千圖網,步驟基本一樣。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# auth: M10
"""Download images from 58pic.com (千圖網) search-result pages.

Iterates result pages 1-6, regex-extracts the CDN image URLs (dropping
the "!/fw..." resize suffix so the full-size image is fetched), and
saves them under /Users/wangxingfan/Desktop/data2/ as <page><index>.jpg.
"""
import re
import urllib.request
import urllib.error
import time


def get_pics():
    """Fetch result pages 1-6 and download every matched image.

    Network errors on a page are logged and that page is skipped;
    failures on individual images are silently ignored (best effort).
    """
    # Loop-invariant headers: build once instead of per page.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'http://www.58pic.com/tupian/meixi-0-0-1.html'
    }
    # Compile once; dots in the host name are escaped so they match
    # literally instead of any character.
    pat = re.compile(r'"(http://pic\.qiantucdn\.com/58pic.*?)!/fw')
    for i in range(1, 7):
        # NOTE(review): this yields meixi-0-0-01.html .. meixi-0-0-06.html
        # while the Referer uses meixi-0-0-1.html — confirm the extra "0"
        # is intentional before changing it.
        url = 'http://www.58pic.com/tupian/meixi-0-0-0' + str(i) + '.html'
        request = urllib.request.Request(url, headers=header)
        try:
            data = urllib.request.urlopen(request, timeout=5).read().decode('utf-8', 'ignore')
        except urllib.error.URLError as e:
            # Only HTTPError carries .code; a plain URLError does not
            # (the original unconditionally printed e.code and crashed).
            print(e.reason)
            if hasattr(e, 'code'):
                print(e.code)
            # Bug fix: without this, 'data' is undefined below.
            continue
        links = pat.findall(data)
        time.sleep(2)  # throttle between pages
        for j, link in enumerate(links):
            path = '/Users/wangxingfan/Desktop/data2/' + str(i) + str(j) + '.jpg'
            try:
                urllib.request.urlretrieve(link, path)
            except OSError:
                # Skip images that fail to download; keep going.
                pass


if __name__ == "__main__":
    get_pics()