# requests: HTTP requests | re: regex filtering | time: delays | os: filesystem access
# Key idea: page analysis — fetch the page text, then filter it with re.
'''
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36
'''
import requests # 請求
import re # 解析(正則表達式方法)
import time # 延時
import os # 打開系統
# 1/x Fetch a web page
def getHTMLText(url, headers):
    """Fetch *url* and return the decoded HTML text.

    Parameters:
        url: address of the page to download.
        headers: dict of HTTP headers (User-Agent etc.) passed to requests.

    Returns the page text on success, or the sentinel string
    "getHTMLText NULL" on any request failure (kept for backward
    compatibility with existing callers).
    """
    try:
        # timeout prevents a hung connection from blocking the script forever
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # use the encoding sniffed from the body, not the (often wrong) header
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # narrowed from a bare `except:` so real bugs (TypeError etc.) surface
        return "getHTMLText NULL"
# 2/x Parse the page
def parsePage(ilt, html):
    """Extract image URLs and the post title from *html*.

    Appends [dir_name, url] pairs to *ilt* in place, where dir_name is the
    post title (later used as the download directory name) and url is one
    image link.

    Parameters:
        ilt: list to append results to (mutated in place).
        html: page source as a string.

    Returns the sentinel string "parseHTML NULL" when the expected markup
    is missing or *html* is not a string (kept for backward compatibility
    with the original error convention); otherwise returns None.
    """
    try:
        # image links look like: <a href="..." alt="..." title="...">
        urls = re.findall(r'<a href="(.*?)" alt=".*?" title=".*?"', html)
        # post title: last <h1 class="post-title h3"> on the page
        dir_name = re.findall(r'<h1 class="post-title h3">(.*?)</h1>', html)[-1]
        for url in urls:
            ilt.append([dir_name, url])
    except (IndexError, TypeError):
        # IndexError: no <h1 class="post-title h3"> found;
        # TypeError: html was not a string (e.g. fetch failed upstream)
        return "parseHTML NULL"
# 3/x Print the list
def printGoodLists(ilts):
    """Print a numbered table of (file name, url) records.

    Parameters:
        ilts: iterable of [dir_name, url] pairs as built by parsePage().
    """
    # Column layout: index, name, url. Bug fix: the third separator was a
    # literal backslash ("\{:16}"); it should be a tab like the first one.
    tplt = "{:4}\t{:16}\t{:16}"
    print(tplt.format("序號", "文件名", "url"))
    for count, ilt in enumerate(ilts, start=1):
        print(tplt.format(count, ilt[0], ilt[1]))
# 4/x Save files (images)
def saveFile(inforList, headers):
    """Download every url in *inforList* into a directory named after the post.

    Parameters:
        inforList: list of [dir_name, url] pairs from parsePage(); entries
            share the same dir_name (the post title), which becomes the
            target directory.
        headers: HTTP headers forwarded to requests.get.

    Prints a success message per file, or "saveFile error!" if anything fails.
    """
    try:
        dir_name = inforList[0][0]
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        # `dir` shadowed the builtin in the original; renamed to _title
        for _title, url in inforList:
            time.sleep(1)  # be polite: at most one request per second
            file_name = url.split('/')[-1]  # last path segment as file name
            response = requests.get(url, headers=headers, timeout=10)
            # don't silently save an HTML error page as an "image"
            response.raise_for_status()
            # the with-block closes the file; the old explicit f.close()
            # inside the with was redundant
            with open(os.path.join(dir_name, file_name), 'wb') as f:
                f.write(response.content)
            print("saveFile success!")  # typo fix: was "sucess"
    except (OSError, IndexError, requests.RequestException):
        # narrowed from a bare `except:`: filesystem, empty-list, or HTTP errors
        print("saveFile error!")
def main():
    """Entry point: fetch one gallery page, list its images, download them."""
    target_url = "https://www.vmgirls.com/12945.html"
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
    # Step 1: download the raw HTML of the gallery page.
    page_html = getHTMLText(target_url, headers)
    # Step 2: collect [title, image-url] records parsed out of the page.
    records = []
    parsePage(records, page_html)
    # Step 3: show an indexed table of what was found, then save each image.
    printGoodLists(records)
    saveFile(records, headers)
if __name__ == "__main__":
    main()