# 利用asyncio併發下載pdf並同步到七牛雲網盤

#pdf下載並上傳到七牛雲

# -*- coding: utf-8 -*-

import aiohttp
import asyncio
import redis
import re
import time
import os
import  pymysql
from qiniu import Auth
from qiniu import put_file

import logging

# Root logger configuration: records go to a file in the current working
# directory, and only ERROR and above are written.
_LOG_OPTIONS = dict(
    level=logging.ERROR,
    filename='log_today_pdf.log',
    datefmt='%Y-%m-%d %H:%M:%S %p',
    format='%(asctime)s -%(name)s-%(levelname)s-%(module)s:%(message)s',
)
logging.basicConfig(**_LOG_OPTIONS)

# Qiniu Cloud settings (placeholder values).
# NOTE(review): credentials are hard-coded in this file; consider loading them
# from the environment or a config file kept out of version control.
bucket = "xxxx"
access_key = "xxxx"
secret_key = "xxxx"

# Current local date as YYYY-MM-DD. It is matched against the date path
# segment of each PDF URL. strftime() uses the current local time when no
# time tuple is given.
today = time.strftime('%Y-%m-%d')
# To process a fixed date instead, override it here:
# today = "2019-04-08"

# MySQL connection parameters (placeholder values).
ip = "xxx"
user = "xxx"
password = "xxx"

# Fetch the candidate rows from MySQL.
# Connection arguments are passed by keyword: PyMySQL 1.0+ only accepts
# keyword arguments for connect(), and keywords also work on older releases.
db = pymysql.connect(host=ip, user=user, password=password,
                     database="xxxx", charset='utf8')
try:
    # The cursor is a context manager and is closed when the block exits.
    with db.cursor() as cursor:
        sql = "xxxxx"
        cursor.execute(sql)
        # All rows are loaded into memory; later code reads row[0] of each.
        results = cursor.fetchall()
finally:
    # Nothing after this point in the script queries MySQL, so release the
    # connection even if the query failed.
    db.close()

# Redis client backed by a connection pool. It is used further down to read
# and update the "url_notice_history" set of already-handled URLs.
rdp = redis.ConnectionPool(host='xxxx', port=6379, password='')
r = redis.StrictRedis(connection_pool=rdp)

# In-memory list of PDF URLs: the first column of every fetched row.
# (Nothing is written to Redis at this point.)
all_pdf_url = [row[0] for row in results]
    


# Stream one file over HTTP to disk using an aiohttp-style session.
async def fetch(session, url, dst, chunk_size=64 * 1024):
    """Download *url* through *session* and write the response body to *dst*.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``raise_for_status()`` and a
            ``content.read(n)`` coroutine (e.g. ``aiohttp.ClientSession``).
        url: Address to download.
        dst: Local path of the file to write. An existing file is replaced.
        chunk_size: Maximum number of bytes requested per read.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error status, and
        any network or filesystem error hit while streaming. In the latter
        case the partially written *dst* is removed before re-raising.
    """
    async with session.get(url) as resp:
        # Fail before touching the disk so that an HTTP error page is never
        # stored as if it were the requested document.
        resp.raise_for_status()
        try:
            # 'wb' rather than 'ab': appending to a file left over from an
            # earlier attempt would produce a corrupt document.
            with open(dst, 'wb') as out:
                while True:
                    chunk = await resp.content.read(chunk_size)
                    if not chunk:
                        break
                    out.write(chunk)
        except BaseException:
            # Interrupted mid-stream (network error, cancellation, ...):
            # do not leave a truncated file behind.
            try:
                os.remove(dst)
            except OSError:
                pass
            raise


async def async_download_from_url(url, dst, semaphore):
    """Download *url* to *dst* while holding *semaphore*.

    The semaphore is acquired first, so the number of simultaneous downloads
    is bounded by whatever limit the caller gave it. Each call then opens its
    own TCP connector and client session, delegates the transfer to
    ``fetch`` and closes both again on exit. Exceptions from ``fetch``
    propagate to the caller.

    NOTE(review): a fresh connector and session per download defeats
    connection reuse; sharing one session across downloads would need an
    interface change.
    """
    connector_options = dict(limit=300, force_close=True, enable_cleanup_closed=True)
    async with semaphore:
        async with aiohttp.connector.TCPConnector(**connector_options) as tc, \
                aiohttp.ClientSession(connector=tc) as session:
            await fetch(session, url, dst)

# Local directory that downloaded PDFs are written to and that is later
# passed to upload_qiniu(). Destination paths are built as file_path + name,
# so the trailing slash is required.
file_path="/data/in9/"

# Download every not-yet-handled PDF listed in all_pdf_url concurrently. The
# Redis set "url_notice_history" records which URLs have been handled.
def _pdf_stem(url, day):
    """Return the part of *url* between ``/<day>/`` and ``.pdf``, or None.

    The extension is matched case-insensitively with a literal dot, so
    ``X.pdf`` and ``X.PDF`` are treated alike regardless of whether "pdf"
    also appears elsewhere in the URL.
    """
    match = re.search(".*/" + re.escape(day) + r"/(.*)\.pdf", url, re.IGNORECASE)
    return match.group(1) if match else None


def _remove_quietly(path):
    """Delete *path* if it exists, ignoring filesystem errors."""
    try:
        os.remove(path)
    except OSError:
        pass


async def coro():
    """Download all new PDFs for ``today`` and record them in Redis.

    For each URL in ``all_pdf_url`` that is not already a member of the
    Redis set ``url_notice_history``:

    * if it does not look like ``.../<today>/<name>.pdf`` it is simply marked
      as seen;
    * otherwise it is downloaded to ``file_path + <name> + ".pdf"`` with at
      most 50 downloads in flight, and marked as seen only when the download
      succeeded.

    Failed downloads are logged, their partial file is removed, and the URL
    is left out of the history so that a later run retries it. Download
    errors are not raised to the caller.

    NOTE(review): the Redis calls are synchronous and block the event loop;
    they run only before and after the download phase.
    """
    pending = {}       # url -> local destination; dict keys de-duplicate URLs
    unmatched = set()  # new URLs that are not a PDF under today's date
    for download_url in all_pdf_url:
        if download_url in pending or download_url in unmatched:
            # Same URL returned twice by the query: two concurrent writers
            # on one destination file would corrupt it.
            continue
        if r.sismember("url_notice_history", download_url):
            continue
        stem = _pdf_stem(download_url, today)
        if stem:
            pending[download_url] = file_path + stem + ".pdf"
        else:
            unmatched.add(download_url)

    semaphore = asyncio.Semaphore(50)  # cap on simultaneous downloads
    urls = list(pending)
    outcomes = []
    if urls:
        # gather() accepts bare coroutines on every supported Python version
        # and, with return_exceptions=True, hands each failure back as a
        # value instead of discarding it.
        outcomes = await asyncio.gather(
            *(async_download_from_url(u, pending[u], semaphore) for u in urls),
            return_exceptions=True
        )

    # One pipelined round trip for all history updates.
    with r.pipeline(transaction=False) as p:
        for seen_url in unmatched:
            p.sadd("url_notice_history", seen_url)
        for u, outcome in zip(urls, outcomes):
            if isinstance(outcome, BaseException):
                logging.error("error_message===%r url=%s", outcome, u)
                # A truncated file must not be uploaded, nor appended to by
                # a retry.
                _remove_quietly(pending[u])
            else:
                p.sadd("url_notice_history", u)
        p.execute()

# Run the download phase to completion on a dedicated event loop.
# The loop is created explicitly because asyncio.get_event_loop() is
# deprecated when no loop is running; try/finally closes the loop even if
# coro() raises.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    loop.run_until_complete(coro())
finally:
    loop.close()
print("下載結束===")

# Upload the downloaded PDFs to Qiniu Cloud.
# Key prefix under which files are stored in the bucket.
qiniuyunDir="notice/file/"


def upload_qiniu(file_dir):
    """Upload every file found under *file_dir* to the Qiniu bucket.

    The directory tree is walked recursively. Each file is stored under the
    key ``qiniuyunDir + <file name>``; the key uses the base name only, so
    equally named files in different subdirectories share one key. A
    response status other than 200 is logged as an error and the walk
    continues with the next file.

    Args:
        file_dir: Root directory to walk, with or without a trailing slash.

    Raises:
        Any exception raised by the Qiniu SDK calls propagates unchanged.
    """
    q = Auth(access_key, secret_key)
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            key = qiniuyunDir + name
            # Upload token scoped to this key, valid for 3600 seconds.
            token = q.upload_token(bucket, key, 3600)
            # os.path.join, not root + name: os.walk yields subdirectory
            # roots without a trailing separator.
            _ret, info = put_file(token, key, os.path.join(root, name))
            if info.status_code != 200:
                # ERROR level so the record passes the configured log
                # threshold; INFO records are dropped.
                logging.error("file upload qiniuyun fail %s", name)

# Upload everything under file_path to Qiniu.
upload_qiniu(file_path)
# Follow-up step, currently disabled below: run the PDF parsing script.
print("上傳完成完成")
#logging.info("開始運行pdf解析程序===")
#os.system("docker exec -i 7702bb49cc46  bash /pdf_exchange/in9_start.sh")


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章