# -*- coding: utf-8 -*-
"""
Created on Sat May 4 20:24:04 2019
@author: navy
"""
import os
import re
import urllib
import urllib.request
# 拿到頁面數據
def getHtml(url):
    """Fetch the raw contents of a URL.

    Args:
        url: Address to open; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request fails.
    """
    # Reading inside ``with`` closes the response as soon as the body has
    # been consumed instead of leaving the connection open.
    with urllib.request.urlopen(url) as page:
        return page.read()
# 通過正則表達式獲取圖片路徑
def getImage(html, save_dir=r"E:\python_workspaces\images", reporthook=None):
    """Download every ``.jpg`` image referenced by a ``src="..."`` attribute.

    Each matched URL is printed and saved as ``<n>.jpg`` inside
    ``save_dir``, where ``n`` counts from 1 in page order. A summary line
    with the number of images is printed at the end.

    Args:
        html: Raw page content as GBK-encoded ``bytes``.
        save_dir: Directory that receives the downloaded files; created
            if it does not exist. The default is the location this script
            has always written to.
        reporthook: Progress callback forwarded to
            ``urllib.request.urlretrieve``. ``None`` selects this module's
            ``callbackfunc``.

    Raises:
        ValueError, urllib.error.URLError: Propagated from
            ``urlretrieve`` when a matched URL cannot be fetched.
    """
    if reporthook is None:
        reporthook = callbackfunc

    # ``[^"]*`` keeps each match inside a single attribute value. The
    # earlier ``.*?`` could start at a non-jpg ``src`` and run on to the
    # closing quote of a later ``.jpg`` one. re.I also accepts ``.JPG``.
    pattern = re.compile(r'src="([^"]*\.jpg)"', re.I)

    # Undecodable bytes are replaced rather than aborting the whole run.
    text = html.decode('gbk', errors='replace')
    image_urls = pattern.findall(text)

    os.makedirs(save_dir, exist_ok=True)
    for count, img_url in enumerate(image_urls, start=1):
        print(img_url)
        target = os.path.join(save_dir, "%s.jpg" % count)
        urllib.request.urlretrieve(img_url, target, reporthook)  # download the file
    print("下載完成,總共有:", len(image_urls), "張圖片")
def callbackfunc(blocknum, blocksize, totalsize):
    """Print download progress as a whole-number percentage.

    Used as the ``reporthook`` callback of ``urllib.request.urlretrieve``.

    Args:
        blocknum: Number of data blocks downloaded so far.
        blocksize: Size of one data block.
        totalsize: Size of the remote file. ``urlretrieve`` passes -1
            when the size is not known.
    """
    if totalsize <= 0:
        # Unknown (-1) or empty (0) size: no meaningful percentage exists,
        # and dividing would give a negative value or ZeroDivisionError.
        return
    # The last block usually overshoots the file size, so cap at 100.
    percent = min(100.0, 100.0 * blocknum * blocksize / totalsize)
    print("%.f%%" % percent)
if __name__ == '__main__':
    # Script entry point: fetch the start page, then download every .jpg
    # image it references.
    start_url = "http://www.netbian.com/"
    html = getHtml(start_url)
    getImage(html)