#pdf下載並上傳到七牛雲
# -*- coding: utf-8 -*-
import aiohttp
import asyncio
import redis
import re
import time
import os
import pymysql
from qiniu import Auth
from qiniu import put_file
import logging
# Root-logger setup: write to a local log file and record only ERROR and above.
_LOG_SETTINGS = {
    'filename': 'log_today_pdf.log',
    'format': '%(asctime)s -%(name)s-%(levelname)s-%(module)s:%(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S %p',
    'level': logging.ERROR,
}
logging.basicConfig(**_LOG_SETTINGS)
# Qiniu Cloud configuration
bucket = "xxxx"
access_key = "xxxx"
secret_key = "xxxx"
# Current local date (YYYY-MM-DD); used later to match the date segment of PDF URLs
today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
# today = "2019-04-08"  # uncomment to re-run for a fixed date
ip = "xxx"
user = "xxx"
password = "xxx"
sql = "xxxxx"
# Open the database connection. Keyword arguments are used because
# PyMySQL 1.0+ no longer accepts positional connection parameters.
db = pymysql.connect(
    host=ip,
    user=user,
    password=password,
    database="xxxx",
    charset='utf8',
)
try:
    # Run the query and read every row; the cursor is closed on exit.
    with db.cursor() as cursor:
        cursor.execute(sql)
        results = cursor.fetchall()
finally:
    # The rows are fully fetched, so release the connection before the
    # download/upload phase instead of holding it open until exit.
    db.close()
# Redis connection pool and client
rdp = redis.ConnectionPool(host='xxxx', port=6379, password='')
r = redis.StrictRedis(connection_pool=rdp)
# Collect the PDF URL (first column of each row) into an in-memory list
all_pdf_url = [row[0] for row in results]
# Asynchronously download a PDF file through an aiohttp-style session
async def fetch(session, url, dst):
    """Stream the response body of ``url`` into the local file ``dst``.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response that exposes ``content.read(n)`` as a
            coroutine (an ``aiohttp.ClientSession`` in this file).
        url: Address to download.
        dst: Local path the body is written to.

    The file is opened in ``'wb'`` mode so that a pre-existing file at
    ``dst`` is replaced. Appending would concatenate a second full body
    onto the old content and produce a corrupt PDF.
    """
    async with session.get(url) as req:
        with open(dst, 'wb') as f:
            while True:
                # Read in 1 KiB chunks; an empty chunk marks end of body.
                chunk = await req.content.read(1024)
                if not chunk:
                    break
                f.write(chunk)
async def async_download_from_url(url, dst, semaphore):
    """Download ``url`` to ``dst`` while holding ``semaphore``.

    The semaphore caps how many downloads run at once. Each call builds
    its own TCP connector and client session, then delegates the actual
    transfer to ``fetch``.
    """
    async with semaphore:
        async with aiohttp.connector.TCPConnector(
            limit=300,
            force_close=True,
            enable_cleanup_closed=True,
        ) as tc, aiohttp.ClientSession(connector=tc) as session:
            await fetch(session, url, dst)
file_path = "/data/in9/"


# Download every PDF whose URL is not yet recorded in the Redis history set
async def coro():
    """Schedule and await the downloads for all not-yet-seen PDF URLs.

    For each URL in ``all_pdf_url`` that is not a member of the Redis set
    ``url_notice_history``, the URL is queued for addition to that set and,
    when a file id can be extracted from it, a download to
    ``file_path + <id> + ".pdf"`` is scheduled. At most 50 downloads run
    concurrently. Download failures are logged, not raised.

    NOTE: a URL is recorded in the history set before its download runs,
    so a failed download is not retried by a later run.
    """
    # Text between "/<today>/" and a ".pdf" suffix, matched in any letter
    # case. One case-insensitive pattern replaces the former "pdf"/"PDF"
    # branch, which picked the wrong pattern when "pdf" occurred elsewhere
    # in the URL; the date and the dot are now matched literally.
    pattern = re.compile(
        ".*/" + re.escape(today) + r"/(.*)\.pdf", re.IGNORECASE
    )
    semaphore = asyncio.Semaphore(50)
    task = []
    # The queued SADDs only reach Redis at p.execute(), so duplicates inside
    # this batch must be tracked locally to avoid downloading one URL twice.
    seen = set()
    with r.pipeline(transaction=False) as p:
        for download_url in all_pdf_url:
            if download_url in seen:
                continue
            if r.sismember("url_notice_history", download_url):
                continue
            seen.add(download_url)
            result = pattern.findall(download_url)
            try:
                p.sadd("url_notice_history", download_url)
                if result:
                    dst = file_path + result[0] + ".pdf"
                    task.append(
                        async_download_from_url(download_url, dst, semaphore)
                    )
            except Exception as e:
                logging.error("error_message===%s", e)
        p.execute()
    if not task:
        # Nothing new to download.
        return
    # gather() accepts bare coroutines and, with return_exceptions=True,
    # waits for every download even if some of them fail.
    outcomes = await asyncio.gather(*task, return_exceptions=True)
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            logging.error("error_message===%s", outcome)
# Drive the download coroutine to completion on the event loop.
loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(coro())
finally:
    # Close the loop even when coro() raises, so it is never left open.
    loop.close()
print("下載結束===")
# Upload the downloaded PDFs to Qiniu Cloud
qiniuyunDir = "notice/file/"


def upload_qiniu(file_dir):
    """Upload every file found under ``file_dir`` to the Qiniu bucket.

    Each file is stored under the key ``qiniuyunDir + <file name>``.
    A response status other than 200 is logged as an error; the walk
    continues with the remaining files.

    Args:
        file_dir: Local directory to walk (recursively) for files.
    """
    q = Auth(access_key, secret_key)
    for root, _dirs, files in os.walk(file_dir):
        for file_name in files:
            key = qiniuyunDir + file_name
            # Upload token scoped to this key, valid for 3600 seconds.
            token = q.upload_token(bucket, key, 3600)
            # os.walk roots carry no trailing separator, so join the path
            # instead of concatenating; this keeps subdirectories working.
            local_path = os.path.join(root, file_name)
            ret, info = put_file(token, key, local_path)
            if info.status_code != 200:
                # Logged at ERROR: the root logger is configured with
                # level=ERROR, so an INFO record would be discarded.
                logging.error("file upload qiniuyun fail %s", file_name)
# Upload everything under the download directory to Qiniu Cloud.
upload_qiniu(file_path)
# Follow-up step: run the PDF parsing script (currently disabled below).
print("上傳完成完成")
#logging.info("開始運行pdf解析程序===")
#os.system("docker exec -i 7702bb49cc46 bash /pdf_exchange/in9_start.sh")