# 不知道如何優化,寫入 Excel 文件依然很慢。
# -*- coding: utf-8 -*-#
# python 3.6.7
# Description:
# Author: zzq
# Date: 2020/4/27
import math
import threadpool
import asyncio
import pandas as pd
def getdata(rows: int, cols: int) -> pd.DataFrame:
    """
    Build a rows x cols matrix of placeholder strings.

    Each cell holds the text "Row <r>Col <c>", where <r> and <c> are the
    zero-based row and column positions of that cell.

    :param rows: number of rows to generate
    :param cols: number of columns to generate
    :return: DataFrame wrapping the generated matrix
    """
    # Distinct loop names keep the `rows` parameter from being shadowed.
    cells = [
        ["Row %sCol %s" % (row_no, col_no) for col_no in range(cols)]
        for row_no in range(rows)
    ]
    return pd.DataFrame(cells)
async def do_work_one(name: str, data: pd.DataFrame, start_row: int, writer: pd.ExcelWriter) -> None:
    """
    Coroutine that writes one slice of the data into the shared writer.

    The slice that starts at row 0 is written together with the column
    header. Every other slice is written without a header, one row lower,
    so that it lands below the header row emitted by the first slice.

    :param name: label of the calling worker, used only in the progress line
    :param data: slice of the data matrix to write
    :param start_row: row offset of this slice within the full matrix
    :param writer: Excel writer that receives the slice
    :return: None
    """
    # Nothing in this body is awaited, so the coroutine runs straight
    # through without ever yielding to the event loop.
    if start_row == 0:
        data.to_excel(writer, startrow=start_row)
    else:
        # +1 skips the header row that the first slice occupies.
        data.to_excel(writer, startrow=start_row + 1, header=False)
    print('\r %s do_work_one' % name, end="")
def task_do_work(name: str, data: pd.DataFrame, start_row: int, data_size: int, writer: pd.ExcelWriter) -> pd.ExcelWriter:
    """
    Write one slice of ``data`` to ``writer`` from a worker thread.

    The call takes rows ``start_row`` .. ``start_row + data_size`` of
    ``data``, creates a fresh event loop and runs a single ``do_work_one``
    coroutine on it. A new loop is created on every call because this
    function is meant to run outside the main thread.

    Any exception raised while writing propagates to the caller instead of
    being left unretrieved inside a finished task.

    :param name: worker label passed through to the coroutine
    :param data: full data matrix
    :param start_row: first row of the slice to write
    :param data_size: number of rows in the slice
    :param writer: Excel writer shared by all workers
    :return: the same ``writer`` that was passed in
    """
    curr_data = data.iloc[start_row:start_row + data_size]
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Run the coroutine directly: asyncio.wait() no longer accepts bare
        # coroutines on current Python versions, and it would also hide
        # any exception raised by the write.
        loop.run_until_complete(do_work_one(name, curr_data, start_row, writer))
    finally:
        # Detach and close the loop so it is not leaked on every call.
        asyncio.set_event_loop(None)
        loop.close()
    return writer
def call_back(param, result):
    """
    Placeholder callback that intentionally does nothing.

    :param param: first callback argument; ignored
    :param result: second callback argument; ignored
    :return: None
    """
    return None
if __name__ == '__main__':
    # Benchmark script: generate a 12800 x 50 matrix, write it to text.xlsx
    # in `num` slices through a thread pool, and print the time spent in
    # each phase.

    # pd.datetime was removed in pandas 2.0; the standard-library class is
    # what that alias pointed to.
    from datetime import datetime

    t0 = datetime.now()
    print("start: %s" % t0)
    num = 10
    pool = threadpool.ThreadPool(num)
    data = getdata(rows=12800, cols=50)
    t1 = datetime.now()
    print("數據生成: %s" % t1)
    print("生成數據總計耗時 %s s " % (t1 - t0))
    # NOTE(review): every worker writes into this one writer object;
    # whether concurrent to_excel calls on it are safe is not shown here
    # and should be confirmed.
    writer = pd.ExcelWriter("text.xlsx")
    # Rows per slice, rounded up so the last slice absorbs any remainder.
    data_size = math.ceil(data.shape[0] / num)
    work_requests = [
        threadpool.WorkRequest(
            task_do_work,
            args=('線程-{0}'.format(i), data, i * data_size, data_size, writer),
        )
        for i in range(num)
    ]
    for req in work_requests:
        pool.putRequest(req)
    pool.wait()
    print("")
    end = datetime.now()
    print("線程運行完成: %s" % end)
    print("線程運行耗時: %s" % (end - t1))
    # close() flushes the workbook to disk; ExcelWriter.save() no longer
    # exists in pandas 2.x.
    writer.close()
    save_end = datetime.now()
    print("保存文件: %s" % save_end)
    print("保存文件耗時: %s" % (save_end - end))
    print('總計耗時 %s s' % (datetime.now() - t0))