1. 以被爬取網頁的網域名稱(domain)加上原始檔名,為儲存的檔案命名
# Build a file name of the form "<host>_<last path segment>" from `url`.
# Example input:
#   url = 'https://news.x.x.x/c/2018-12-31/doc-ihqfskcn2820495.shtml'
# Example result:
#   filename == 'news.x.x.x_doc-ihqfskcn2820495.shtml'
scheme_sep = url.find('//')
# str.find returns -1 when '//' is absent; in that case the host starts at 0
# instead of silently skipping the first character.
start_pos = scheme_sep + 2 if scheme_sep != -1 else 0
end_pos = url.find('/', start_pos)  # first '/' after the host
if end_pos == -1:
    # No path component: without this guard the slice below would use -1
    # and cut the last character off the host.
    end_pos = len(url)
domain = url[start_pos:end_pos]
# Take the text after the right-most '/' of the path only, so a host-only
# URL yields an empty segment rather than the host again.
filename = url[end_pos:].rpartition('/')[2]
filename = domain + '_' + filename  # final file name
2. 採用MD5進行命名
# Build a file name from the MD5 hex digest of `url`, keeping the original
# extension. The extension is searched for in the last path segment only:
# searching the whole URL would, for a segment without a dot, pick up a dot
# from the host and produce a suffix containing '/'.
last_segment = url.rpartition('/')[2]
dot_pos = last_segment.rfind('.')
suffix = last_segment[dot_pos:] if dot_pos != -1 else ''
# hashlib operates on bytes, so the str must be encoded first; passing the
# str directly raises TypeError.
filename = hashlib.md5(url.encode('utf-8')).hexdigest() + suffix
若未先將字串編碼(encode)就直接傳入 hashlib.md5,會出現以下錯誤訊息:
TypeError: Unicode-objects must be encoded before hashing
3. 完整代碼
import requests
import hashlib
url = 'https://news.x.x.cn/c/2018-12-31/doc-ihqfskcn2820495.shtml'


def _split_url(page_url):
    """Split *page_url* into ``(host, path)`` using plain string searches.

    ``host`` is the text between the scheme's ``//`` (or the start of the
    string when there is no ``//``) and the next ``/``. ``path`` is
    everything from that ``/`` onwards, or ``''`` when there is none.
    """
    scheme_sep = page_url.find('//')
    # str.find returns -1 when the separator is missing; start at 0 then
    # instead of skipping the first character of the host.
    start_pos = scheme_sep + 2 if scheme_sep != -1 else 0
    end_pos = page_url.find('/', start_pos)
    if end_pos == -1:
        # Host-only URL: slicing with -1 would drop the host's last character.
        end_pos = len(page_url)
    return page_url[start_pos:end_pos], page_url[end_pos:]


def domain_filename(page_url):
    """Return ``<host>_<last path segment>`` for *page_url*."""
    host, path = _split_url(page_url)
    return host + '_' + path.rpartition('/')[2]


def md5_filename(page_url):
    """Return the MD5 hex digest of *page_url* plus its file extension.

    The extension is taken from the last path segment only; when that
    segment has no ``.`` the result has no extension.
    """
    _, path = _split_url(page_url)
    last_segment = path.rpartition('/')[2]
    dot_pos = last_segment.rfind('.')
    suffix = last_segment[dot_pos:] if dot_pos != -1 else ''
    # hashlib operates on bytes, so the str has to be encoded first.
    return hashlib.md5(page_url.encode('utf-8')).hexdigest() + suffix


def build_filename(page_url, naming_mode):
    """Pick the naming scheme: ``'domain'`` or, for any other value, MD5."""
    if naming_mode == 'domain':
        return domain_filename(page_url)
    return md5_filename(page_url)


def main():
    """Download ``url`` and save its text under the chosen file name."""
    # A timeout keeps the script from hanging forever on a dead server.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    naming_mode = input('Please enter the encoding method of your choice(domain or md5):')
    filename = build_filename(url, naming_mode)
    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent the page's characters. `with` closes the file on errors.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    main()