python pandas 多線程(協程)寫入excel

不知道如何優化,寫入excel文件依然很慢. 

# -*- coding: utf-8 -*-#
# python 3.6.7
# Description:  
# Author:       zzq
# Date:         2020/4/27

import asyncio
import math
from datetime import datetime

import pandas as pd
import threadpool

def getdata(rows:int, cols:int) -> pd.DataFrame:
    """
    Build a ``rows`` x ``cols`` DataFrame filled with placeholder strings.

    Every cell holds the text ``"Row <r>Col <c>"`` where ``r`` and ``c`` are
    the zero-based row and column positions of that cell.

    :param rows: number of rows to generate
    :param cols: number of columns to generate
    :return: DataFrame built from the generated list of row lists
    """
    # Dedicated loop names keep the ``rows`` parameter from being shadowed
    # (the original reused ``rows`` as the outer loop variable).
    array = [
        ["Row %sCol %s" % (row_idx, col_idx) for col_idx in range(cols)]
        for row_idx in range(rows)
    ]
    return pd.DataFrame(array)


async def do_work_one(name: str, data: pd.DataFrame, start_row: int, writer: pd.ExcelWriter) -> None:
    """
    Coroutine that writes one slice of the data into an Excel writer.

    The slice whose ``start_row`` is 0 is written at row 0 with the default
    ``to_excel`` settings; every other slice is written with ``header=False``
    and placed one row lower (``start_row + 1``).

    :param name: label of the calling worker, used only in the progress message
    :param data: slice of the data matrix to write
    :param start_row: row offset of this slice within the full data matrix
    :param writer: Excel writer object to write into
    :return: None
    """
    if start_row != 0:
        # Later slices: no header, shifted down by one row.
        data.to_excel(writer, startrow=start_row + 1, header=False)
    else:
        # First slice: plain to_excel call starting at the top of the sheet.
        data.to_excel(writer, startrow=start_row)
    # '\r' returns to the line start so successive messages overwrite each other.
    print('\r %s do_work_one' % name, end="")


def task_do_work(name:str, data:pd.DataFrame, start_row:int, data_size:int, writer:pd.ExcelWriter) -> pd.ExcelWriter:
    """
    Thread worker: write one slice of ``data`` into ``writer`` via a coroutine.

    Notes kept from the original author:
    1. Each thread may host several coroutine objects.
    2. Coroutines run in order; they only avoid waiting for I/O to complete.
    3. Using coroutines inside a worker thread requires creating a new loop.

    NOTE(review): ``do_work_one`` contains no ``await``, so the coroutine runs
    synchronously from start to finish; the event loop adds no concurrency
    here.

    :param name: worker label, forwarded to ``do_work_one``
    :param data: full data matrix
    :param start_row: positional index of the first row of the slice
    :param data_size: number of rows in the slice
    :param writer: Excel writer the slice is written into
    :return: the same ``writer`` object that was passed in
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        curr_data = data.iloc[start_row:start_row + data_size]
        # Run the coroutine directly: asyncio.wait() rejects bare coroutine
        # objects on Python 3.11+, and it also hid any exception raised by
        # the write inside an unretrieved task. Here errors propagate.
        loop.run_until_complete(do_work_one(name, curr_data, start_row, writer))
    finally:
        # Release the loop's resources and do not leave a closed loop
        # registered as this thread's current loop.
        asyncio.set_event_loop(None)
        loop.close()

    return writer


def call_back(param, result):
    """
    Placeholder callback: ignores both arguments and returns ``None``.

    NOTE(review): presumably meant as a threadpool ``callback`` /
    ``exc_callback`` hook that would save the writer returned by the worker
    -- confirm before wiring it in.

    :param param: unused
    :param result: unused
    :return: None
    """
    return None


if __name__ == '__main__':
    # Benchmark script: generate a 12800 x 50 frame, write it to text.xlsx in
    # ``num`` row slices through a thread pool, and print timings per phase.
    #
    # NOTE(review): all workers write into the same ExcelWriter, and building
    # the sheet is presumably CPU-bound Python work, so threads are unlikely
    # to make this faster -- compare against a single ``data.to_excel(writer)``
    # call before keeping the pool.
    #
    # ``datetime.now()`` replaces ``pd.datetime.now()``: the ``pd.datetime``
    # alias was removed from pandas.
    t0 = datetime.now()
    print("start: %s" % t0)
    num = 10  # number of worker threads and of row slices
    pool = threadpool.ThreadPool(num)
    data = getdata(rows=12800, cols=50)
    t1 = datetime.now()
    print("數據生成: %s" % t1)
    print("生成數據總計耗時 %s s " % (t1 - t0))
    writer = pd.ExcelWriter("text.xlsx")
    # Rows per slice, rounded up so the slices cover every row.
    data_size = math.ceil(data.shape[0] / num)

    # Slice i starts at row i * data_size.
    work_requests = [
        threadpool.WorkRequest(task_do_work, args=(
            '線程-{0}'.format(i), data, i * data_size, data_size, writer))
        for i in range(num)
    ]

    for req in work_requests:
        pool.putRequest(req)
    pool.wait()
    print("")
    end = datetime.now()
    print("線程運行完成: %s" % end)
    print("線程運行耗時: %s" % (end - t1))
    # close() saves the workbook; ExcelWriter.save() was removed in pandas 2.0.
    writer.close()
    save_end = datetime.now()
    print("保存文件: %s" % save_end)
    print("保存文件耗時: %s" % (save_end - end))
    print('總計耗時 %s s' % (datetime.now() - t0))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章