import urllib.request
import re
import os
import time
# Interactive crawler: asks for a page range, fetches each picture-listing
# page from qiushibaike.com, extracts every thumbnail's image URL and alt
# text with a regex, and saves the images into the local "qiutu" directory.
start_page = int(input('請輸入起始頁碼-'))
end_page = int(input('請輸入結束頁碼-'))
url = 'https://www.qiushibaike.com/pic/page/{}/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Loop-invariant, so compiled once. re.S lets '.' match newlines because the
# thumbnail markup spans several lines. Group 1 is the image src, group 2
# the alt text.
pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S)

# Characters that must not appear in a file name: path separators, the
# characters Windows rejects, and control characters such as newlines.
unsafe_chars = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
# The alt text is free-form and can be long; capping it helps keep the file
# name under common filesystem name-length limits.
max_name_length = 60

dirname = 'qiutu'
# urlretrieve does not create missing directories, so make sure the target
# directory exists before the first download.
os.makedirs(dirname, exist_ok=True)

for page in range(start_page, end_page + 1):
    print('正在爬取第--%s--頁......' % page)
    urlt = url.format(page)
    request = urllib.request.Request(url=urlt, headers=headers)
    # Close the connection as soon as the page body has been read.
    with urllib.request.urlopen(request) as response:
        print(response)
        content = response.read().decode('utf8')
    ret = pattern.findall(content)
    for info in ret:
        # The page uses protocol-relative URLs ("//host/path"), so add a scheme.
        image_url = 'https:' + info[0]
        image_name = info[1]
        # Take the extension from the URL without any query string.
        extension = image_url.split('?', 1)[0].split('.')[-1]
        # The alt text comes from the remote page: neutralise anything that
        # could break or escape the target path before using it as a name.
        safe_name = unsafe_chars.sub('_', image_name)[:max_name_length].strip()
        filename = safe_name + '.' + unsafe_chars.sub('_', extension)
        print('正在下載--%s--...' % filename)
        filepath = os.path.join(dirname, filename)
        urllib.request.urlretrieve(image_url, filepath)
        print('結束下載--%s--' % filename)
        # Pause between downloads to avoid hammering the server.
        time.sleep(2)
    print('結束爬取第--%s--頁...' % page)
    time.sleep(2)
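# Single-line (DOTALL) mode: the text being extracted contains newlines, so
# re.S is required; without it findall() may return an empty list.
# An ungrouped .*? is only matched and skipped; a grouped (.*?) is captured
# and kept in the result.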
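# Note: the markup shown by the browser's right-click "Inspect" view may differ
# from the page's real source and cause the pattern to miss; for example, a tag
# may end with "/" in the source while the Inspect view shows no "/".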