多線程下載 CVPR、ICCV、IJCAI 論文

前提:

  1. 科學上網
  2. 谷歌瀏覽器
  3. python3,並安裝 selenium 與 lxml 兩個套件(以及供 selenium 驅動谷歌瀏覽器用的 chromedriver)

特點:

  1. 自定義多線程下載數目threads_num
  2. 自定義下載路徑
  3. 下載的文件均以論文的title命名
  4. 原始title中的非法字符(win10不允許作爲文件名的字符)已被處理爲下劃線或空格

代碼如下:

# coding:utf-8
import re
import urllib.request
import os
import threading
import os
from lxml import etree
from selenium import webdriver
import time
import re

# Regex character class of the characters that may not appear in a file
# name; the download functions below replace every match in a paper title
# or file name via re.sub before saving the file.
rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'


def getIJCAIPapers(ctype, year, paper_wrapper, localdir):
    """Download a batch of IJCAI papers into ``localdir``.

    Each paper is saved as ``<title>.pdf`` where every character matched by
    the module-level ``rstr`` pattern in the title is replaced with an
    underscore. A paper whose target file already exists is skipped.

    Args:
        ctype: Conference type. Not used by this function; present so the
            signature mirrors ``get_CVPR_ICCV_Papers``.
        year: Proceedings year as a string; appended to the IJCAI
            proceedings base URL.
        paper_wrapper: Sequence of parsed elements, each exposing ``xpath``
            and containing a ``div.title`` text node and a ``div.details``
            block whose first link is the PDF href.
        localdir: Directory the PDFs are written to.

    Raises:
        IndexError: If an element has no title text or no details link.
        Exception: Whatever ``urllib.request.urlretrieve`` raises on a
            failed download is propagated unchanged.
    """
    url = 'https://www.ijcai.org/proceedings/' + year
    for paper in paper_wrapper:
        title_temp = paper.xpath('./div[@class="title"]/text()')[0]
        new_title = re.sub(rstr, "_", title_temp)
        id_temp = paper.xpath('./div[@class="details"]/a[1]/@href')[0]
        url_temp = url + '/' + id_temp
        # Build the path from the ``localdir`` argument, not from any
        # module-level global, so the function works for every caller.
        path_temp = os.path.join(localdir, new_title + '.pdf')
        if os.path.exists(path_temp):
            print(title_temp + ' has been downloaded before.')
        else:
            urllib.request.urlretrieve(url_temp, path_temp)
            print(title_temp + ' has been downloaded now.')


def get_CVPR_ICCV_Papers(ctype, year, paper_wrapper, localdir):
    """Download a batch of CVPR/ICCV papers into ``localdir``.

    For every link element the last path component of its ``href`` is
    taken, the text up to and including the first underscore is dropped,
    characters matched by the module-level ``rstr`` pattern are replaced,
    and the remaining underscores become spaces; the result is the local
    file name. Links whose last path component is not of the form
    ``<prefix>_<rest>.pdf`` are skipped silently, as are files that already
    exist in ``localdir``.

    Args:
        ctype: Conference type. Not used by this function.
        year: Publication year. Not used by this function.
        paper_wrapper: Sequence of parsed ``<a>`` elements exposing
            ``xpath``; ``./@href`` must yield the link target.
        localdir: Directory the PDFs are written to.

    Raises:
        IndexError: If an element has no ``href`` attribute.
        Exception: Whatever ``urllib.request.urlretrieve`` raises on a
            failed download is propagated unchanged.
    """
    download_url = 'http://openaccess.thecvf.com/'
    for link in paper_wrapper:
        url_suffix = link.xpath('./@href')[0]
        full_file_name = url_suffix.split('/')[-1]
        # The dot is escaped and the whole name must match, so only real
        # ``*.pdf`` names qualify (not e.g. ``x_ypdf`` or ``x_y.pdf.html``).
        matched = re.fullmatch(r'.*?_(.*\.pdf)', full_file_name)
        if matched is None:
            continue
        file_name = re.sub(rstr, "_", matched.group(1)).replace('_', ' ')
        file_path_temp = os.path.join(localdir, file_name)
        if os.path.exists(file_path_temp):
            print(file_name + ' has been downloaded before.')
        else:
            urllib.request.urlretrieve(download_url + url_suffix,
                                       file_path_temp)
            print(file_name + ' has been downloaded now.')


def _chunk_ranges(total, parts):
    """Split ``range(total)`` into at most ``parts`` contiguous slices.

    Returns a list of ``(start, end)`` index pairs that together cover every
    index exactly once. Slice sizes differ by at most one, and no empty
    slice is produced, so fewer than ``parts`` pairs are returned when
    ``total < parts``. A non-positive ``total`` or ``parts`` yields ``[]``.
    """
    if total <= 0 or parts <= 0:
        return []
    size, extra = divmod(total, parts)
    ranges = []
    start = 0
    for index in range(parts):
        # The first ``extra`` slices take one additional item each so the
        # remainder of the division is not dropped.
        end = start + size + (1 if index < extra else 0)
        if end == start:
            break
        ranges.append((start, end))
        start = end
    return ranges


if __name__ == '__main__':
    # Conference type: ICCV, CVPR or IJCAI (other venues need their own
    # site URL and XPath).
    ctype = 'CVPR'
    year = '2019'  # publication year of the papers
    # Placeholder download directory; replace with a real path.
    localDir = os.path.join('自定義下載目錄', ctype + year)
    os.makedirs(localDir, exist_ok=True)
    threads_num = 20  # maximum number of download threads

    if ctype == 'IJCAI':
        url = 'https://www.ijcai.org/proceedings/' + year
        paper_xpath = '//*[@id="subsection0"]/div[@class="paper_wrapper"]'
        worker = getIJCAIPapers
    else:
        url = 'http://openaccess.thecvf.com/' + ctype + year + '.py'
        paper_xpath = '//*[@id="content"]/dl/dd/a'
        worker = get_CVPR_ICCV_Papers

    # Chrome is only needed to fetch the rendered listing page; quit it as
    # soon as the page source is captured, even if loading fails.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(5)  # wait for the browser to finish loading the page
        pageSource = driver.page_source
    finally:
        driver.quit()

    html = etree.HTML(pageSource)
    paper_wrapper = html.xpath(paper_xpath)
    length = len(paper_wrapper)

    # One thread per slice; the slices cover every paper, including the
    # remainder when ``length`` is not a multiple of ``threads_num``.
    threads = [
        threading.Thread(
            target=worker,
            args=(ctype, year, paper_wrapper[start:end], localDir))
        for start, end in _chunk_ranges(length, threads_num)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('all downloaded finished.')

效果圖(下載ing):

多快樂啊2333

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章