利用asyncio并发下载pdf并同步到七牛云网盘

原創

2020-06-16 11:50

#pdf下载并上传到七牛云

# -*- coding: utf-8 -*-

import aiohttp
import asyncio
import redis
import re
import time
import os
import pymysql
from qiniu import Auth
from qiniu import put_file

import logging

# Root logger: append ERROR-and-above records to a log file in the current
# working directory.
logging.basicConfig(
    level=logging.ERROR,
    filename='log_today_pdf.log',
    datefmt='%Y-%m-%d %H:%M:%S %p',
    format='%(asctime)s -%(name)s-%(levelname)s-%(module)s:%(message)s',
)

# Qiniu Cloud storage settings (the values are redacted placeholders in this
# listing).
bucket = "xxxx"
access_key = "xxxx"
secret_key = "xxxx"

# Current local date as YYYY-MM-DD; strftime() falls back to the local time
# when no time tuple is passed.  A fixed literal such as "2019-04-08" can be
# assigned here instead to process a specific day.
today = time.strftime('%Y-%m-%d')

# MySQL connection settings (redacted placeholders).
ip = "xxx"
user = "xxx"
password = "xxx"

# Open the database connection.  Keyword arguments are required because
# PyMySQL 1.0+ no longer accepts positional connection parameters.
db = pymysql.connect(host=ip, user=user, password=password,
                     database="xxxx", charset='utf8')

# SQL statement to run (redacted placeholder in this listing).
sql = "xxxxx"

try:
    # The cursor is a context manager and is closed when the block exits.
    with db.cursor() as cursor:
        cursor.execute(sql)
        # All rows are pulled into memory here, so the connection is not
        # needed afterwards.
        results = cursor.fetchall()
finally:
    # Release the connection even if the query failed.
    db.close()

# Redis client built on an explicit connection pool.
rdp = redis.ConnectionPool(host='xxxx', port=6379, password='')
r = redis.StrictRedis(connection_pool=rdp)

# First column of every fetched row: the PDF URLs to consider for download.
all_pdf_url = [row[0] for row in results]

#利用aiohttp包，异步下载pdf文件
async def fetch(session, url, dst, chunk_size=64 * 1024):
    """Stream the response body of ``url`` into the local file ``dst``.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response that offers ``raise_for_status()`` and a
            ``content.read(n)`` coroutine (an ``aiohttp.ClientSession`` in
            this script).
        url: Address to download.
        dst: Path of the file to write; an existing file is replaced.
        chunk_size: Maximum number of bytes requested per read.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error status;
        ``dst`` is not created or modified in that case.
    """
    async with session.get(url) as req:
        # Fail before touching the disk so that an HTTP error page is never
        # stored under a .pdf name.
        req.raise_for_status()
        # 'wb' rather than 'ab': appending to a file left behind by an
        # earlier or interrupted run would produce a corrupt document.
        with open(dst, 'wb') as f:
            while True:
                chunk = await req.content.read(chunk_size)
                if not chunk:
                    # An empty read marks the end of the body.
                    break
                f.write(chunk)

async def async_download_from_url(url, dst, semaphore):
    """Download ``url`` to ``dst`` while holding one slot of ``semaphore``.

    The semaphore caps how many downloads run at the same time.  Each call
    builds its own TCP connector and client session, hands the session to
    ``fetch`` and tears both down again when the transfer ends.
    """
    async with semaphore:
        connector = aiohttp.connector.TCPConnector(
            limit=300, force_close=True, enable_cleanup_closed=True)
        async with connector as tc, aiohttp.ClientSession(connector=tc) as session:
            await fetch(session, url, dst)

# Local directory that receives the downloaded PDFs.
file_path = "/data/in9/"


async def coro():
    """Download every new PDF listed in ``all_pdf_url`` into ``file_path``.

    A URL is skipped when it is already a member of the redis set
    ``url_notice_history`` or was already queued during this run.  The file
    name is the part of the URL between ``/<today>/`` and the ``.pdf``
    extension (matched case-insensitively).  URLs without such a part are
    recorded in the history straight away; URLs that are downloaded are
    recorded only after the download succeeded, so a failed download is
    attempted again on the next run.  At most 50 downloads run concurrently.
    """
    # One case-insensitive pattern with an escaped dot replaces the separate
    # ".pdf"/".PDF" patterns, which were chosen by looking for "pdf" anywhere
    # in the URL and therefore missed e.g. ".PDF" files on a "pdf" host.
    name_pattern = re.compile(
        ".*/" + re.escape(today) + r"/(.*)\.pdf", re.IGNORECASE)
    semaphore = asyncio.Semaphore(50)
    queued = set()   # URLs already handled in this run
    pending = []     # (url, dst) pairs, index-aligned with `downloads`
    downloads = []

    with r.pipeline(transaction=False) as p:
        for download_url in all_pdf_url:
            if download_url in queued:
                continue
            queued.add(download_url)
            if r.sismember("url_notice_history", download_url):
                continue
            names = name_pattern.findall(download_url)
            if not names:
                # Nothing to download for this URL; remember it so that it
                # is skipped on later runs.
                p.sadd("url_notice_history", download_url)
                continue
            dst = file_path + names[0] + ".pdf"
            pending.append((download_url, dst))
            downloads.append(
                async_download_from_url(download_url, dst, semaphore))

        # gather() accepts an empty argument list (asyncio.wait() raises on
        # one) and, with return_exceptions=True, reports each failure as a
        # result instead of dropping it.
        outcomes = await asyncio.gather(*downloads, return_exceptions=True)
        for (download_url, dst), outcome in zip(pending, outcomes):
            if isinstance(outcome, BaseException):
                logging.error("error_message===%s: %r", download_url, outcome)
                # Drop a partially written file so it is neither uploaded
                # nor appended to by a retry.
                try:
                    os.remove(dst)
                except FileNotFoundError:
                    pass
            else:
                p.sadd("url_notice_history", download_url)
        p.execute()

# Run coro() to completion on a dedicated event loop.  The loop is created
# explicitly because asyncio.get_event_loop() is deprecated when no loop is
# running.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    loop.run_until_complete(coro())
finally:
    # Close the loop even when coro() raises so its resources are released.
    loop.close()
print("下载结束===")

#将下载的pdf上传到七牛云
# Key prefix under which the files are stored in the Qiniu bucket.
qiniuyunDir = "notice/file/"


def upload_qiniu(file_dir):
    """Upload every file found under ``file_dir`` to the Qiniu bucket.

    The directory tree is walked recursively.  Each file is stored under the
    key ``qiniuyunDir + <file name>``, so files with the same name in
    different sub-directories map to the same key.  A response status other
    than 200 is logged; the remaining files are still uploaded.

    Args:
        file_dir: Root directory to scan, with or without a trailing slash.
    """
    q = Auth(access_key, secret_key)
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            key = qiniuyunDir + name
            # Upload token restricted to this key, expiring after 3600 s.
            token = q.upload_token(bucket, key, 3600)
            # os.path.join() inserts the separator that plain concatenation
            # omits for sub-directories or a root without a trailing slash.
            local_path = os.path.join(root, name)
            _ret, info = put_file(token, key, local_path)
            if info.status_code != 200:
                # ERROR level: the root logger is configured at ERROR, so an
                # INFO record would be discarded.
                logging.error("file upload qiniuyun fail %s", name)

# Upload everything found in the download directory to Qiniu Cloud.
upload_qiniu(file_path)
# Follow-up step (currently disabled below): run the PDF parsing script.
print("上传完成完成")
#logging.info("开始运行pdf解析程序===")
#os.system("docker exec -i 7702bb49cc46 bash /pdf_exchange/in9_start.sh")

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

利用asyncio并发下载pdf并同步到七牛云网盘

Wireshark 安装+使用（一）

博客园商业化之路-众包平台：继续召集早期合作开发者

java stream對象根據屬性值排序正序倒序

利用docker安裝mysql鏡像及遠程連接mysql

利用scrapy抓取數據，批量插入mysql

python3 讀取配置文件

利用asyncio併發下載pdf並同步到七牛雲網盤

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結