Recently I used selenium + chromedriver.exe to download files silently in headless mode, so I'm writing this post as a record.
Go search for and download chromedriver.exe yourself; its version must match your browser's version.
Once downloaded, put it in: C:\Windows\System32\
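Before wiring it into a script, it's worth a quick smoke test that the driver and browser versions really do line up. A minimal check of my own (not part of the crawler below); if the versions disagree, webdriver.Chrome() typically fails with a "session not created" error:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(chrome_options=options)  # Selenium 3 keyword; on Selenium 4 use options=
# W3C sessions report 'browserVersion'; older drivers report 'version'.
print(driver.capabilities.get("browserVersion") or driver.capabilities.get("version"))
driver.quit()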
Without further ado, here is the full script. Take it and try it, for reference.
# coding:utf-8
import re
import json
import urllib.request
import urllib.parse
import os
from bs4 import BeautifulSoup
from selenium import webdriver
class paChong(object):
    def __init__(self):
        super(paChong, self).__init__()
        self.name = ''
        self.basic_path = "D:\\work\\"
        self.path_01 = ''
        self.path_02 = ''
        # Names of the permit items to crawl; these are search keywords sent
        # to the Hubei government-services site, so they must stay in Chinese.
        self.list_name = [
            '電影放映單位設立、變更審批', '道路貨運經營許可', '單位和個人接收境內電視節目的許可',
            '單位、個人從事出版物零售業務、變更出版物經營許可證登記事項審批', '從事清真食品生產、經營單位的資質審批的初審',
            '從事城市生活垃圾經營性清掃﹑收集﹑運輸﹑處理服務審批', '從事包裝裝潢印刷品和其他印刷品印刷經營活動企業的設立、變更審批',
            '慈善組織認定', '慈善組織公募資格認定', '除劇毒化學品、易制爆化學品外其他危險化學品(不含倉儲經營)經營企業經營許可證核發',
            '初中及以下階段學校(義務教育、學前教育、非學歷文化教育)審批', '初中及以下階段(義務教育、學前教育)教師資格認定',
            '籌備設立宗教活動場所審批(含擴建、異地重建)的初審', '城鎮污水排入排水管網許可', '城市建築垃圾處置覈准',
            '車體廣告設置許可', '“三有”陸生野生動物人工繁育許可', '“三有”陸生野生動物經營利用許可'
        ]
    # Headless mode: build a Chrome instance that downloads into `path`.
    def download_new(self, url, path):
        chrome_options = webdriver.ChromeOptions()
        prefs = {'download.default_directory': path,
                 'download.prompt_for_download': False,
                 'download.directory_upgrade': True,
                 'safebrowsing.enabled': False,
                 'safebrowsing.disable_download_protection': True}
        chrome_options.add_experimental_option('prefs', prefs)
        chrome_options.add_argument("--headless")
        # Selenium 3 keyword; on Selenium 4 use webdriver.Chrome(options=...).
        driverChrome = webdriver.Chrome(chrome_options=chrome_options)
        # Headless Chrome blocks downloads by default. Register the
        # Chromium-only send_command endpoint, then issue Page.setDownloadBehavior
        # to explicitly allow downloads into `path`.
        driverChrome.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': path}}
        driverChrome.execute("send_command", params)
        return driverChrome
    def mkdir(self, path):
        # makedirs (instead of mkdir) also creates any missing parent folders.
        if not os.path.exists(path):
            os.makedirs(path)
    def main(self):
        self.get_html_01()
    def get_html_03(self, url, name):
        # e.g. url = 'http://zwfw.hubei.gov.cn/lawguide/clxxlist/42010500000077138126X00105000001.jspx'
        result = urllib.request.urlopen(url).read().decode("utf-8")
        soup = BeautifulSoup(result, "html.parser")
        # '下載' ("Download") is the literal text of the download links on the page.
        list_d = soup.find_all(name='a', text='下載')
        temp = 0
        for d in list_d:
            # Material name from the row's first cell; truncate to 20 chars so
            # it is safe to use as a folder name.
            tr = d.parent.parent.find_all(name='td')[0].text[2:].strip()
            if len(tr) > 20:
                tr = tr[0:20]
            temp = temp + 1
            # Slice the numeric document id out of the anchor's markup.
            totalCount = str(d)[49:56]
            totalCount = re.sub(r"\D", "", totalCount)
            down_url = 'http://zwfw.hubei.gov.cn/lawguide/ykypt/doc/' + totalCount + '.jspx'
            self.mkdir(self.path_02 + '\\' + tr)
            # Each file gets its own browser instance, which is never closed;
            # long runs will accumulate Chrome processes.
            driverChrome = self.download_new(url, self.path_02 + '\\' + tr)
            print('Downloading: ' + name + '|||' + tr + '|||' + str(temp))
            print(down_url)
            # Append a log line so interrupted runs can be traced by hand.
            f = open('D:\\work\\wuhan\\hangyang\\pachong\\ad.txt', 'a', encoding="utf-8")
            f.writelines(name + '|||' + tr + '|||' + str(temp) + ':' + down_url + "\n")
            f.close()
            try:
                driverChrome.get(down_url)
            except Exception:
                print("-------- error, retrying ----")
                driverChrome.get(down_url)
            print('Download triggered, ok')
    def get_html_02(self, DXNAME):
        url = 'http://zwfw.hubei.gov.cn/lawguide/smallItem/getSearchSx.jspx?dxName='
        dxName = DXNAME
        # e.g. dxName = urllib.parse.quote('護士執業註冊(變更註冊、延續註冊)')
        dxName = urllib.parse.quote(dxName)
        cxbsh = '1'
        url = url + dxName + '&cxbsh=' + cxbsh
        page = urllib.request.urlopen(url).read().decode("utf-8")
        json_data_02 = json.loads(page)
        name_02_code_list = list()
        for name in json_data_02[0]['list']:
            name_02 = name['REGION_CODE']
            if name_02:
                # 420105000000 is the region code this crawl is limited to.
                if int(name_02) == 420105000000:
                    print(name['NAME'])
                    print('http://zwfw.hubei.gov.cn/lawguide/clxxlist/' + name['CODE'] + '.jspx')
                    self.path_02 = self.path_01 + '\\' + name['NAME']
                    self.mkdir(self.path_02)
                    self.get_html_03('http://zwfw.hubei.gov.cn/lawguide/clxxlist/' + name['CODE'] + '.jspx', name['NAME'])
                # else: not in the target region (Wuhan), skip
        return name_02_code_list
    def get_html_01(self):
        temp = 0
        for n in self.list_name:
            cxbsh = '1'
            url = 'http://zwfw.hubei.gov.cn/lawguide/smallItem/getSearchDx.jspx?siteName=' + urllib.parse.quote(n) + '&cxbsh=' + cxbsh
            self.path_01 = self.basic_path + n
            self.mkdir(self.path_01)
            self.name = n
            temp += 1
            # Visual separator between top-level items in the console output.
            print('------------------------------------' + str(temp) + '------------------------------------')
            page = urllib.request.urlopen(url).read().decode("utf-8")
            json_data_01 = json.loads(page)
            names = json_data_01[0]['list']
            name_list = list()
            for name in names:
                name_list.append(name['NAME'])
                print(name['NAME'])
                self.get_html_02(name['NAME'])
            print(n + ' has ' + str(json_data_01[0]['pageSize']) + ' sub-items')
        return "OK"
def kaishiba():
    pc = paChong()
    pc.main()


if __name__ == '__main__':
    kaishiba()
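The only non-obvious piece above is getting headless Chrome to download at all: by default it silently refuses. Stripped of the crawling logic, the trick reduces to the sketch below (my own minimal version under the same Selenium 3 assumptions; save_dir is a placeholder, not a path from the script):

import os
from selenium import webdriver

def headless_downloader(save_dir):
    # Make sure the target folder exists; Chrome will not create it.
    os.makedirs(save_dir, exist_ok=True)
    opts = webdriver.ChromeOptions()
    opts.add_argument("--headless")
    opts.add_experimental_option("prefs", {
        "download.default_directory": save_dir,
        "download.prompt_for_download": False,
    })
    driver = webdriver.Chrome(chrome_options=opts)  # options= on Selenium 4
    # Register the Chromium-specific command endpoint, then tell the browser
    # to allow downloads into save_dir.
    driver.command_executor._commands["send_command"] = (
        "POST", "/session/$sessionId/chromium/send_command")
    driver.execute("send_command", {
        "cmd": "Page.setDownloadBehavior",
        "params": {"behavior": "allow", "downloadPath": save_dir},
    })
    return driver

# Usage: point it at any direct file URL and the file lands in save_dir.
# driver = headless_downloader("D:\\work\\test")
# driver.get("http://example.com/some/file.pdf")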