前提:
- 科學上網
- 谷歌瀏覽器
- python3
特點:
- 自定義多線程下載的線程數目(threads_num)
- 自定義下載路徑
- 下載的文件均以論文的title命名
- 原始title中的非法字符(win10不允許作爲文件名的字符)已被處理爲下劃線或空格
代碼如下:
# coding:utf-8
import re
import urllib.request
import os
import threading
import os
from lxml import etree
from selenium import webdriver
import time
import re
# Characters that Windows does not allow in file names: / \ : * ? " < > |
rstr = r"[\/\\\:\*\?\"\<\>\|]"


def getIJCAIPapers(ctype, year, paper_wrapper, localdir):
    """Download the IJCAI papers described by ``paper_wrapper`` into ``localdir``.

    Args:
        ctype: Conference type. Unused here; kept so both downloader
            functions share the same signature.
        year: Proceedings year as a string, appended to the proceedings URL.
        paper_wrapper: Iterable of elements exposing ``xpath``; each one must
            contain a ``div[@class="title"]`` text node and a
            ``div[@class="details"]`` whose first ``<a>`` links to the PDF.
        localdir: Directory the PDFs are written to.

    Each file is named after the paper title with characters matching
    ``rstr`` replaced by underscores. Papers whose target file already exists
    are skipped. A failed download is reported and skipped so the remaining
    papers in the batch are still processed.
    """
    url = 'https://www.ijcai.org/proceedings/' + year
    for paper in paper_wrapper:
        title_temp = paper.xpath('./div[@class="title"]/text()')[0]
        new_title = re.sub(rstr, "_", title_temp)
        id_temp = paper.xpath('./div[@class="details"]/a[1]/@href')[0]
        url_temp = url + '/' + id_temp
        # Use the ``localdir`` argument, not a module-level global, so the
        # function works no matter where it is called from.
        path_temp = os.path.join(localdir, new_title + '.pdf')
        if os.path.exists(path_temp):
            print(title_temp + ' has been downloaded before.')
            continue
        try:
            urllib.request.urlretrieve(url_temp, path_temp)
        except OSError as err:
            # Drop any partial file; otherwise the next run would report the
            # paper as already downloaded.
            if os.path.exists(path_temp):
                os.remove(path_temp)
            print(title_temp + ' failed to download: ' + str(err))
        else:
            print(title_temp + ' has been downloaded now.')
def get_CVPR_ICCV_Papers(ctype, year, paper_wrapper, localdir):
    """Download the CVPR/ICCV papers linked by ``paper_wrapper`` into ``localdir``.

    Args:
        ctype: Conference type. Unused here; kept so both downloader
            functions share the same signature.
        year: Publication year. Unused here; kept for the same reason.
        paper_wrapper: Iterable of link elements exposing ``xpath``; the
            ``@href`` of each is a path relative to the openaccess site root.
        localdir: Directory the PDFs are written to.

    The local file name is the last URL segment with everything up to and
    including the first underscore removed, characters matching ``rstr``
    sanitized, and the remaining underscores turned into spaces. If the
    segment does not follow that pattern, the whole segment is used instead.
    Papers whose target file already exists are skipped. A failed download
    is reported and skipped so the rest of the batch is still processed.
    """
    download_url = 'http://openaccess.thecvf.com/'
    for link in paper_wrapper:
        url_suffix = link.xpath('./@href')[0]
        download_url_temp = download_url + url_suffix
        full_file_name = url_suffix.split('/')[-1]
        # Drop the leading "<prefix>_" part of the file name when present.
        matched = re.match(r'^.*?_(.*\.pdf)', full_file_name)
        # Fall back to the full name so an unexpected file name can never
        # leave ``file_name`` unbound or stale from the previous iteration.
        file_name_ = matched.group(1) if matched else full_file_name
        file_name_ = re.sub(rstr, "_", file_name_)
        file_name = file_name_.replace('_', ' ')
        file_path_temp = os.path.join(localdir, file_name)
        if os.path.exists(file_path_temp):
            print(file_name + ' has been downloaded before.')
            continue
        try:
            urllib.request.urlretrieve(download_url_temp, file_path_temp)
        except OSError as err:
            # Drop any partial file; otherwise the next run would report the
            # paper as already downloaded.
            if os.path.exists(file_path_temp):
                os.remove(file_path_temp)
            print(file_name + ' failed to download: ' + str(err))
        else:
            print(file_name + ' has been downloaded now.')
if __name__ == '__main__':
    # Conference type. Supported: ICCV, CVPR, IJCAI; any other conference
    # needs its own site URL and XPath.
    ctype = 'CVPR'
    year = '2019'  # Publication year of the papers.
    localDir = os.path.join('自定義下載目錄', ctype + year)
    os.makedirs(localDir, exist_ok=True)
    threads_num = 20  # Maximum number of download threads.

    # Choose the listing page, the XPath that selects one node per paper,
    # and the matching downloader for the conference.
    if ctype == 'IJCAI':
        url = 'https://www.ijcai.org/proceedings/' + year
        papers_xpath = '//*[@id="subsection0"]/div[@class="paper_wrapper"]'
        downloader = getIJCAIPapers
    else:
        url = 'http://openaccess.thecvf.com/' + ctype + year + '.py'
        papers_xpath = '//*[@id="content"]/dl/dd/a'
        downloader = get_CVPR_ICCV_Papers

    # Launch Chrome, load the listing page and always close the browser
    # again, even if loading the page fails.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(5)  # Wait for the browser to finish loading the page.
        pageSource = driver.page_source
    finally:
        driver.quit()

    html = etree.HTML(pageSource)
    paper_wrapper = html.xpath(papers_xpath)
    length = len(paper_wrapper)

    # Ceiling division so the trailing ``length % threads_num`` papers are
    # not dropped; at least 1 so a short list still gets downloaded and the
    # range step below is never zero.
    one_thread_papers = max(1, (length + threads_num - 1) // threads_num)
    threads = []
    for start in range(0, length, one_thread_papers):
        end = start + one_thread_papers
        t = threading.Thread(
            target=downloader,
            args=(ctype, year, paper_wrapper[start:end], localDir))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('all downloaded finished.')
效果圖(下載ing):
多快樂啊2333