1. 采用所爬取网页的域名（domain）进行命名
# Example URL; for it the resulting file name is
# 'news.x.x.x_doc-ihqfskcn2820495.shtml'.
url = 'https://news.x.x.x/c/2018-12-31/doc-ihqfskcn2820495.shtml'
# The domain sits between the '//' that ends the scheme and the next '/'.
start_pos = url.find('//') + 2  # start of the domain
end_pos = url.find('/', start_pos)  # end of the domain
if end_pos == -1:
    # No path after the host: without this guard the slice below would stop
    # at index -1 and silently drop the last character of the domain.
    end_pos = len(url)
domain = url[start_pos:end_pos]
# Page name: whatever follows the right-most '/' of the path part only, so a
# URL without a path yields an empty page name instead of the host name.
filename = url[end_pos:]
filename = filename[filename.rfind('/') + 1:]
filename = domain + '_' + filename  # final file name
2. 采用 URL 的 MD5 值进行命名
# Take the extension from the last path segment only: searching the whole URL
# for '.' would pick up a dot of the host name (and a '/' with it) whenever
# the page itself has no extension.
last_segment = url[url.rfind('/') + 1:]
dot_pos = last_segment.rfind('.')
suffix = last_segment[dot_pos:] if dot_pos != -1 else ''
# hashlib works on bytes, so the str must be encoded first; passing the str
# directly raises a TypeError.
filename = hashlib.md5(url.encode('utf-8')).hexdigest() + suffix
如果不先对字符串进行编码（encode），会出现如下错误信息：
TypeError: Unicode-objects must be encoded before hashing
3. 完整代码
import requests
import hashlib
def _split_url(url):
    """Split *url* into ``(domain, page_name)`` using plain string slicing.

    The domain is the text between the '//' that ends the scheme and the next
    '/'. The page name is whatever follows the last '/' of the remaining path
    and is empty when the URL has no path.
    """
    scheme_end = url.find('//')
    start_pos = scheme_end + 2 if scheme_end != -1 else 0
    end_pos = url.find('/', start_pos)
    if end_pos == -1:
        # No path after the host: slicing up to -1 would silently drop the
        # last character of the domain.
        end_pos = len(url)
    path = url[end_pos:]
    return url[start_pos:end_pos], path[path.rfind('/') + 1:]


def domain_filename(url):
    """Return ``'<domain>_<page name>'`` for *url*."""
    domain, page_name = _split_url(url)
    return domain + '_' + page_name


def md5_filename(url):
    """Return the hex MD5 digest of *url* plus the page's extension, if any."""
    _, page_name = _split_url(url)
    # Only the page name is searched for an extension, so a dot in the host
    # name can never leak into the file name.
    dot_pos = page_name.rfind('.')
    suffix = page_name[dot_pos:] if dot_pos != -1 else ''
    # hashlib works on bytes, so the str must be encoded before hashing.
    return hashlib.md5(url.encode('utf-8')).hexdigest() + suffix


def main():
    """Download the example page and save it under the chosen kind of name."""
    url = 'https://news.x.x.cn/c/2018-12-31/doc-ihqfskcn2820495.shtml'
    response = requests.get(url)
    response.encoding = 'utf-8'
    naming_mode = input('Please enter the encoding method of your choice(domain or md5):')
    # Any answer other than 'domain' selects the MD5 scheme.
    if naming_mode == 'domain':
        filename = domain_filename(url)
    else:
        filename = md5_filename(url)
    # Write with an explicit encoding so the result does not depend on the
    # platform default; the context manager closes the file even on error.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)


if __name__ == '__main__':
    main()