最近用了selenium+chromedriver.exe隱藏式下載文件


寫篇博文留作紀念

chromedriver.exe這個東西請自己搜索下載,版本要和自己的瀏覽器版本對應上

下載完了放在這裏:C:\Windows\System32\

不廢話直接上代碼,拿去試試,以供參考。

# coding:utf-8
import io
import json
import os
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver

class paChong(object):
    """Crawler for administrative-service items on zwfw.hubei.gov.cn.

    For every item name in ``list_name`` it queries the site's search API,
    keeps only the Wuhan-district entries (region code 420105000000), and
    downloads each linked attachment through a headless Chrome driver.
    """

    def __init__(self):
        super(paChong, self).__init__()
        self.name = ''                    # top-level item currently being processed
        self.basic_path = "D:\\work\\"    # root directory for all downloads
        self.path_01 = ''                 # <basic_path>\<top-level item>
        self.path_02 = ''                 # <path_01>\<sub-item>
        # Fixed list of top-level administrative items to crawl.
        self.list_name = [
        '電影放映單位設立、變更審批', '道路貨運經營許可', '單位和個人接收境內電視節目的許可',
        '單位、個人從事出版物零售業務、變更出版物經營許可證登記事項審批', '從事清真食品生產、經營單位的資質審批的初審',
        '從事城市生活垃圾經營性清掃﹑收集﹑運輸﹑處理服務審批', '從事包裝裝潢印刷品和其他印刷品印刷經營活動企業的設立、變更審批',
        '慈善組織認定', '慈善組織公募資格認定', '除劇毒化學品、易制爆化學品外其他危險化學品(不含倉儲經營)經營企業經營許可證核發',
        '初中及以下階段學校(義務教育、學前教育、非學歷文化教育)審批', '初中及以下階段(義務教育、學前教育)教師資格認定',
        '籌備設立宗教活動場所審批(含擴建、異地重建)的初審', '城鎮污水排入排水管網許可', '城市建築垃圾處置覈准',
        '車體廣告設置許可', '“三有”陸生野生動物人工繁育許可', '“三有”陸生野生動物經營利用許可'
                         ]

    # Headless mode
    def download_new(self, url, path):
        """Create a headless Chrome driver that downloads into *path*.

        *url* is unused; the parameter is kept for backward compatibility
        with existing callers.  Returns the ready-to-use driver.
        """
        chrome_options = webdriver.ChromeOptions()
        prefs = {'download.default_directory': path,
                 'download.prompt_for_download': False,
                 'download.directory_upgrade': True,
                 'safebrowsing.enabled': False,
                 'safebrowsing.disable_download_protection': True}

        chrome_options.add_experimental_option('prefs', prefs)
        chrome_options.add_argument("--headless")
        # 'options=' replaces the deprecated 'chrome_options=' keyword.
        driverChrome = webdriver.Chrome(options=chrome_options)
        # Headless Chrome refuses downloads unless explicitly allowed via
        # the DevTools Page.setDownloadBehavior command; register the raw
        # send_command endpoint so we can issue it.
        driverChrome.command_executor._commands["send_command"] = (
            "POST", '/session/$sessionId/chromium/send_command')
        self._allow_download(driverChrome, path)
        return driverChrome

    @staticmethod
    def _allow_download(driver, path):
        """Point an existing driver's download directory at *path* via CDP."""
        params = {'cmd': 'Page.setDownloadBehavior',
                  'params': {'behavior': 'allow', 'downloadPath': path}}
        driver.execute("send_command", params)

    def mkdir(self, path):
        """Create *path* (including parents) if it does not exist yet."""
        # makedirs with exist_ok avoids the check/create race of the old
        # os.path.exists + os.mkdir pair and also handles nested paths.
        os.makedirs(path, exist_ok=True)

    def main(self):
        """Entry point: crawl every configured item."""
        self.get_html_01()

    def get_html_03(self, url, name):
        """Scrape one detail page and download every attachment it links.

        *url* is the clxxlist detail page, *name* the sub-item label used
        for logging and directory naming.
        """
        result = urllib.request.urlopen(url).read().decode("utf-8")
        soup = BeautifulSoup(result, "html.parser")
        list_d = soup.find_all(name='a', text='下載')

        driverChrome = None   # one driver is reused for the whole page
        temp = 0
        for d in list_d:
            # Attachment title lives in the first <td> of the link's row;
            # the leading two characters are a serial prefix we drop.
            tr = d.parent.parent.find_all(name='td')[0].text[2:].strip()
            if len(tr) > 20:
                tr = tr[0:20]   # keep directory names short
            temp += 1
            # The numeric document id sits at a fixed offset of the tag's
            # string form — presumably stable for this site; TODO confirm.
            totalCount = re.sub(r"\D", "", str(d)[49:56])
            dowon = 'http://zwfw.hubei.gov.cn/lawguide/ykypt/doc/' + totalCount + '.jspx'
            target_dir = self.path_02 + '\\' + tr
            self.mkdir(target_dir)
            if driverChrome is None:
                driverChrome = self.download_new(url, target_dir)
            else:
                # Reuse the driver, just redirect its download directory.
                self._allow_download(driverChrome, target_dir)
            print('正在下載:' + name + '|||' + tr + '|||' + str(temp))
            print(dowon)
            with open('D:\\work\\wuhan\\hangyang\\pachong\\ad.txt', 'a', encoding="utf-8") as f:
                f.write(name + '|||' + tr + '|||' + str(temp) + ':' + dowon + "\n")
            try:
                driverChrome.get(dowon)
            except Exception:
                # Best-effort retry once on a flaky download request.
                print("--------出錯繼續----")
                driverChrome.get(dowon)
            print('正在下載ok')
        # NOTE: the driver is deliberately NOT quit here — downloads are
        # asynchronous and quitting immediately would truncate them.

    def get_html_02(self, DXNAME):
        """Look up sub-items of *DXNAME* and process the Wuhan-district ones.

        Returns an (always empty) list, kept for interface compatibility.
        """
        url = 'http://zwfw.hubei.gov.cn/lawguide/smallItem/getSearchSx.jspx?dxName='
        url = url + urllib.parse.quote(DXNAME) + '&cxbsh=' + '1'
        page = urllib.request.urlopen(url).read().decode("utf-8")
        json_data_02 = json.loads(page)
        name_02_code_list = list()
        for name in json_data_02[0]['list']:
            name_02 = name['REGION_CODE']
            # Only the Wuhan district (region code 420105000000) is wanted.
            if name_02 and int(name_02) == 420105000000:
                print(name['NAME'])
                detail = 'http://zwfw.hubei.gov.cn/lawguide/clxxlist/' + name['CODE'] + '.jspx'
                print(detail)
                self.path_02 = self.path_01 + '\\' + name['NAME']
                self.mkdir(self.path_02)
                self.get_html_03(detail, name['NAME'])
        return name_02_code_list

    def get_html_01(self):
        """Iterate the fixed item list, crawl each one, return "OK"."""
        temp = 0
        for n in self.list_name:
            url = ('http://zwfw.hubei.gov.cn/lawguide/smallItem/getSearchDx.jspx?siteName='
                   + urllib.parse.quote(n) + '&cxbsh=' + '1')
            self.path_01 = self.basic_path + n
            self.mkdir(self.path_01)
            self.name = n
            temp += 1
            print('------------------------------------' + str(temp) + '------------------------------------')
            page = urllib.request.urlopen(url).read().decode("utf-8")
            json_data_01 = json.loads(page)
            for name in json_data_01[0]['list']:
                print(name['NAME'])
                self.get_html_02(name['NAME'])
            print(n + '有:' + json_data_01[0]['pageSize'] + '個子項')
        return "OK"

def kaishiba():
    """Build a crawler instance and run the full crawl."""
    crawler = paChong()
    crawler.main()

# Kick off the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    kaishiba()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章