利用asyncio并发下载pdf并同步到七牛云网盘

#pdf下载并上传到七牛云

# -*- coding: utf-8 -*-

import aiohttp
import asyncio
import redis
import re
import time
import os
import  pymysql
from qiniu import Auth
from qiniu import put_file

import logging

# Send log records to a local file; only ERROR and above are recorded.
logging.basicConfig(
    level=logging.ERROR,
    filename="log_today_pdf.log",
    datefmt="%Y-%m-%d %H:%M:%S %p",
    format="%(asctime)s -%(name)s-%(levelname)s-%(module)s:%(message)s",
)

# Qiniu cloud storage settings (placeholder values).
bucket = "xxxx"
access_key = "xxxx"
secret_key = "xxxx"


# Current local date as YYYY-MM-DD; it is used to match dated URL paths.
# To re-run for a specific day, pin it instead, e.g. today = "2019-04-08".
today = time.strftime("%Y-%m-%d")

# MySQL connection settings (placeholder values).
ip = "xxx"
user = "xxx"
password = "xxx"

# Open the MySQL connection. Keyword arguments are used because PyMySQL 1.0+
# no longer accepts positional arguments in connect().
db = pymysql.connect(host=ip, user=user, password=password,
                     database="xxxx", charset='utf8')

# Query to run (placeholder); the first column of every returned row is
# later treated as a PDF URL.
sql="xxxxx"

# Fetch all rows, then release the cursor and the connection even if the
# query fails: nothing later in the script uses the database again.
try:
    with db.cursor() as cursor:
        cursor.execute(sql)
        results = cursor.fetchall()
finally:
    db.close()

# Redis client backed by a connection pool.
rdp = redis.ConnectionPool(host='xxxx', port=6379, password='')
r = redis.StrictRedis(connection_pool=rdp)

# Collect the first column of every query row (the PDF address) into a
# plain list; nothing is written to Redis at this point.
all_pdf_url = [record[0] for record in results]
    


# Stream a single PDF to disk asynchronously through an aiohttp session.
async def fetch(session, url, dst, chunk_size=64 * 1024):
    """Download ``url`` through ``session`` and write the body to ``dst``.

    Args:
        session: An aiohttp-style client session; ``session.get(url)`` must
            be usable as an async context manager yielding the response.
        url: Address of the file to download.
        dst: Local path the response body is written to.
        chunk_size: Maximum number of bytes read from the response per
            iteration.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error response,
        plus any network or file-system error from reading or writing.
    """
    async with session.get(url) as req:
        # Fail on HTTP errors instead of saving an error page as a ".pdf".
        req.raise_for_status()
        # "wb" rather than "ab": no Range request is made, so appending to
        # a file left over from an earlier run would corrupt it.
        with open(dst, 'wb') as f:
            while True:
                chunk = await req.content.read(chunk_size)
                if not chunk:
                    break
                f.write(chunk)


async def async_download_from_url(url, dst, semaphore):
    """Download ``url`` into ``dst`` once ``semaphore`` grants a slot.

    A dedicated TCP connector and client session are created for this one
    transfer and are closed again when it finishes.
    """
    connector_options = dict(limit=300, force_close=True, enable_cleanup_closed=True)
    async with semaphore:  # caps how many downloads run at the same time
        async with aiohttp.connector.TCPConnector(**connector_options) as connector:
            async with aiohttp.ClientSession(connector=connector) as http:
                await fetch(http, url, dst)

# Local directory the PDFs are downloaded into and later uploaded from.
file_path = "/data/in9/"

# Download every not-yet-seen PDF URL concurrently; Redis holds the history
# of URLs that were already handled.
async def coro():
    """Queue and run downloads for all new URLs in ``all_pdf_url``.

    A URL is skipped when it is already a member of the Redis set
    ``url_notice_history`` or was already queued earlier in this run.
    Every other URL is added to that set, and, when it matches
    ``.../<today>/<name>.pdf`` (extension compared case-insensitively), it
    is downloaded to ``file_path + <name> + ".pdf"``. At most 50 downloads
    run at the same time.

    Download failures are logged and do not raise.
    """
    # Escape the literal parts so "." only matches a real dot, and ignore
    # case so ".pdf" and ".PDF" are handled by one pattern.
    name_pattern = re.compile(".*/" + re.escape(today) + r"/(.*)\.pdf", re.IGNORECASE)
    semaphore = asyncio.Semaphore(50)
    tasks = []
    # The pipeline only sends its SADDs on execute(), so sismember() cannot
    # see URLs queued earlier in this loop; track them locally to avoid
    # downloading a duplicated URL twice into the same file.
    queued = set()
    with r.pipeline(transaction=False) as p:
        for download_url in all_pdf_url:
            if download_url in queued or r.sismember("url_notice_history", download_url):
                continue
            queued.add(download_url)
            # NOTE(review): the URL is recorded before its download finishes,
            # so a failed download is not retried on the next run - confirm
            # this is intended.
            p.sadd("url_notice_history", download_url)
            result = name_pattern.findall(download_url)
            if result:
                dst = file_path + result[0] + ".pdf"
                tasks.append(async_download_from_url(download_url, dst, semaphore))
        p.execute()

    # asyncio.wait() raises ValueError on an empty collection and rejects
    # bare coroutines on Python 3.11+; gather() handles both cases.
    if not tasks:
        return
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            logging.error("error_message=== %s", outcome)

# Run the download coroutine on a dedicated event loop. Creating the loop
# explicitly avoids asyncio.get_event_loop(), which is deprecated when no
# loop is running, and the finally clause closes the loop even if coro() fails.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    loop.run_until_complete(coro())
finally:
    loop.close()
print("下载结束===")

# Upload the downloaded PDFs to Qiniu cloud storage.
# Key prefix under which the files are stored in the bucket.
qiniuyunDir="notice/file/"
def upload_qiniu(file_dir):
    """Upload every file found under ``file_dir`` to the Qiniu bucket.

    Each file is stored under the key ``qiniuyunDir + <file name>``. A
    response status other than 200 is logged as an error; the function
    keeps going with the remaining files and returns nothing.

    Args:
        file_dir: Local directory to walk (sub-directories included).
    """
    q = Auth(access_key, secret_key)
    for root, _dirs, files in os.walk(file_dir):
        for file in files:
            key = qiniuyunDir + file
            # Upload token for this key; 3600 is the expiry passed to the SDK.
            token = q.upload_token(bucket, key, 3600)
            # os.path.join instead of root + file: os.walk yields
            # sub-directory roots without a trailing separator.
            _ret, info = put_file(token, key, os.path.join(root, file))
            if info.status_code != 200:
                # Logged at ERROR because the logger is configured with
                # level=ERROR; an INFO record would be dropped.
                logging.error("file upload qiniuyun fail %s", file)

# Upload everything in the download directory to Qiniu.
upload_qiniu(file_path)
# Follow-up step: run the PDF parsing script (currently commented out below).
print("上传完成完成")
#logging.info("开始运行pdf解析程序===")
#os.system("docker exec -i 7702bb49cc46  bash /pdf_exchange/in9_start.sh")


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章