前提:
- 科学上网
- 谷歌浏览器
- python3(需安装第三方库 selenium 和 lxml)
特点:
- 自定义多线程下载数目threads_num
- 自定义下载路径
- 下载的文件均以论文的title命名
- 原始title中的非法字符(win10不允许作为文件名的字符)已被处理为下划线或空格
代码如下:
# coding:utf-8
import re
import urllib.request
import os
import threading
import os
from lxml import etree
from selenium import webdriver
import time
import re
# Characters that are not allowed in Windows file names.
rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'


def getIJCAIPapers(ctype, year, paper_wrapper, localdir):
    """Download every IJCAI paper described by ``paper_wrapper``.

    Each paper is saved as ``<title>.pdf`` inside ``localdir``, with the
    characters matched by ``rstr`` replaced by underscores.  Papers whose
    target file already exists are not downloaded again.

    Args:
        ctype: Conference type.  Unused here; kept so both download workers
            share the same signature.
        year: Proceedings year as a string; appended to the base URL.
        paper_wrapper: Iterable of parsed HTML elements (objects with an
            ``xpath`` method), one per paper.
        localdir: Existing directory that receives the PDF files.
    """
    url = 'https://www.ijcai.org/proceedings/' + year
    for paper in paper_wrapper:
        titles = paper.xpath('./div[@class="title"]/text()')
        hrefs = paper.xpath('./div[@class="details"]/a[1]/@href')
        if not titles or not hrefs:
            # Entry without a title or a link: skip it instead of letting an
            # IndexError end this worker and abandon the rest of its slice.
            continue
        title_temp = titles[0]
        new_title = re.sub(rstr, "_", title_temp)
        url_temp = url + '/' + hrefs[0]
        # Use the `localdir` argument, not a module-level global.
        path_temp = os.path.join(localdir, new_title + '.pdf')
        if os.path.exists(path_temp):
            print(title_temp + ' has been downloaded before.')
            continue
        # Download under a temporary name and rename afterwards, so an
        # interrupted transfer is never mistaken for a finished paper.
        partial_path = path_temp + '.part'
        try:
            urllib.request.urlretrieve(url_temp, partial_path)
            os.replace(partial_path, path_temp)
        except OSError as err:  # includes urllib.error.URLError
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(title_temp + ' failed to download: ' + str(err))
        else:
            print(title_temp + ' has been downloaded now.')
def get_CVPR_ICCV_Papers(ctype, year, paper_wrapper, localdir):
    """Download every CVPR/ICCV paper linked by ``paper_wrapper``.

    The local file name is taken from the last path segment of each link:
    everything up to and including the first underscore is dropped, the
    characters matched by ``rstr`` are replaced, and underscores become
    spaces.  Links whose file name does not have that ``<prefix>_<rest>.pdf``
    shape are skipped, as are files that already exist in ``localdir``.

    Args:
        ctype: Conference type.  Unused here; kept so both download workers
            share the same signature.
        year: Conference year.  Unused here for the same reason.
        paper_wrapper: Iterable of parsed ``<a>`` elements (objects with an
            ``xpath`` method).
        localdir: Existing directory that receives the PDF files.
    """
    download_url = 'http://openaccess.thecvf.com/'
    for link in paper_wrapper:
        hrefs = link.xpath('./@href')
        if not hrefs:
            # Anchor without an href: nothing to download.
            continue
        url_suffix = hrefs[0]
        full_file_name = url_suffix.split('/')[-1]
        # Raw string with an escaped dot so only a literal ".pdf" matches.
        file_name_re = re.findall(r'^.*?_(.*\.pdf)', full_file_name)
        if len(file_name_re) != 1:
            # Not a paper PDF in the expected naming scheme; skipping avoids
            # using an undefined (or the previous paper's) file name.
            continue
        file_name_ = re.sub(rstr, "_", file_name_re[0])
        file_name = file_name_.replace('_', ' ')
        file_path_temp = os.path.join(localdir, file_name)
        if os.path.exists(file_path_temp):
            print(file_name + ' has been downloaded before.')
            continue
        download_url_temp = download_url + url_suffix
        # Download under a temporary name and rename afterwards, so an
        # interrupted transfer is never mistaken for a finished paper.
        partial_path = file_path_temp + '.part'
        try:
            urllib.request.urlretrieve(download_url_temp, partial_path)
            os.replace(partial_path, file_path_temp)
        except OSError as err:  # includes urllib.error.URLError
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(file_name + ' failed to download: ' + str(err))
        else:
            print(file_name + ' has been downloaded now.')
def split_into_chunks(items, parts):
    """Split ``items`` into at most ``parts`` contiguous, non-empty slices.

    Every element lands in exactly one slice and slice sizes differ by at
    most one, so no element is dropped when ``len(items)`` is not a multiple
    of ``parts`` or is smaller than ``parts``.

    Args:
        items: Sliceable sequence to divide.
        parts: Desired number of slices; values below 1 are treated as 1.

    Returns:
        A list of slices of ``items``; empty when ``items`` is empty.
    """
    total = len(items)
    if total == 0:
        return []
    parts = max(1, min(parts, total))
    base, extra = divmod(total, parts)
    chunks = []
    start = 0
    for index in range(parts):
        # The first `extra` slices take one additional element each.
        end = start + base + (1 if index < extra else 0)
        chunks.append(items[start:end])
        start = end
    return chunks


if __name__ == '__main__':
    # Conference type: ICCV, CVPR or IJCAI only; other conferences require
    # changing the site URLs yourself.
    ctype = 'CVPR'
    year = '2019'  # Publication year of the papers.
    # Kept as a module-level name spelled `localDir` because code elsewhere
    # in this file may read it as a global.
    localDir = os.path.join('自定义下载目录', ctype + year)
    os.makedirs(localDir, exist_ok=True)
    threads_num = 20  # Number of download threads.

    if ctype == 'IJCAI':
        url = 'https://www.ijcai.org/proceedings/' + year
        paper_xpath = '//*[@id="subsection0"]/div[@class="paper_wrapper"]'
        worker = getIJCAIPapers
    else:
        url = 'http://openaccess.thecvf.com/' + ctype + year + '.py'
        paper_xpath = '//*[@id="content"]/dl/dd/a'
        worker = get_CVPR_ICCV_Papers

    # Start Chrome, grab the rendered page, and always close the browser.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(5)  # Wait for the browser to load the page.
        pageSource = driver.page_source
    finally:
        driver.quit()

    html = etree.HTML(pageSource)
    paper_wrapper = html.xpath(paper_xpath)

    # One thread per slice; the slices together cover every paper.
    threads = [
        threading.Thread(target=worker, args=(ctype, year, chunk, localDir))
        for chunk in split_into_chunks(paper_wrapper, threads_num)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('all downloaded finished.')
效果图(下载ing):
多快乐啊2333