Python pandas 多线程(协程)写入 Excel

不知道该如何进一步优化,写入 Excel 文件依然很慢。

# -*- coding: utf-8 -*-#
# python 3.6.7
# Description:  
# Author:       zzq
# Date:         2020/4/27

import math
import threadpool
import asyncio
import pandas as pd

def getdata(rows: int, cols: int):
    """
    Build a ``rows`` x ``cols`` DataFrame filled with placeholder strings.

    Each cell holds the text ``"Row <r>Col <c>"``, where ``<r>`` and ``<c>``
    are the zero-based row and column positions of that cell.

    :param rows: number of rows to generate
    :param cols: number of columns to generate
    :return: a ``pandas.DataFrame`` built from the generated list of lists
    """
    # The loop variables have their own names so that the ``rows`` parameter
    # is never rebound while the matrix is being built.
    matrix = [
        ["Row %sCol %s" % (row_idx, col_idx) for col_idx in range(cols)]
        for row_idx in range(rows)
    ]
    return pd.DataFrame(matrix)


async def do_work_one(name: str, data: pd.DataFrame, start_row: int, writer: pd.ExcelWriter) -> None:
    """
    Coroutine that writes one slice of the data through ``data.to_excel``.

    The slice whose ``start_row`` is 0 is written at row 0 with ``to_excel``'s
    default header handling. Every other slice is written at
    ``start_row + 1`` with ``header=False`` (presumably the extra row accounts
    for the header line emitted with the first slice).

    The body contains no ``await``, so once scheduled it runs straight
    through without yielding.

    :param name: label of the calling worker; only used in the progress output
    :param data: the slice of rows to write
    :param start_row: offset of this slice within the full data set
    :param writer: target object handed unchanged to ``data.to_excel``
    :return: None
    """
    is_first_slice = start_row == 0
    if is_first_slice:
        excel_kwargs = {"startrow": start_row}
    else:
        excel_kwargs = {"startrow": start_row + 1, "header": False}
    data.to_excel(writer, **excel_kwargs)

    # "\r" rewinds the cursor so successive progress messages overwrite each other.
    print('\r %s do_work_one' % name, end="")


def task_do_work(name: str, data: pd.DataFrame, start_row: int, data_size: int, writer: pd.ExcelWriter) -> pd.ExcelWriter:
    """
    Thread entry point: write one slice of ``data`` via ``do_work_one``.

    A fresh event loop is created on every call and installed as the calling
    thread's current loop. The ``do_work_one`` coroutine is run to completion
    on it, and the loop is then uninstalled and closed so that repeated calls
    do not leak event loops.

    The coroutine is passed straight to ``run_until_complete`` rather than
    being wrapped in ``asyncio.wait``: handing bare coroutines to
    ``asyncio.wait`` is rejected by Python 3.11+. As a consequence, an
    exception raised while writing now propagates to the caller instead of
    being left unretrieved on an internal task.

    :param name: worker label forwarded to ``do_work_one``
    :param data: the full data matrix
    :param start_row: index of the first row of the slice to write
    :param data_size: maximum number of rows in the slice
    :param writer: Excel writer forwarded to ``do_work_one``
    :return: the same ``writer`` object that was passed in
    """
    # Positional slice; iloc clips the end bound, so the last slice may be shorter.
    curr_data = data.iloc[start_row:start_row + data_size]

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(do_work_one(name, curr_data, start_row, writer))
    finally:
        # Uninstall before closing so the thread is not left pointing at a closed loop.
        asyncio.set_event_loop(None)
        loop.close()

    return writer


def call_back(param, result):
    """
    Placeholder callback that performs no action.

    :param param: first callback argument (unused)
    :param result: second callback argument (unused)
    :return: None
    """
    # A ``result.save()`` call used to be commented out here; nothing is saved.
    return None


if __name__ == '__main__':
    # Benchmark script: generate a data matrix, split it into ``num`` slices,
    # write each slice from a thread-pool worker into one shared ExcelWriter,
    # then save the workbook, printing the time taken by each stage.

    # ``pd.datetime`` was removed in pandas 2.0, so timestamps come from the
    # standard library instead.
    from datetime import datetime

    t0 = datetime.now()
    print("start: %s" % t0)
    num = 10  # used both as the thread-pool size and as the number of slices
    pool = threadpool.ThreadPool(num)
    data = getdata(rows=12800, cols=50)
    t1 = datetime.now()
    print("数据生成: %s" % t1)
    print("生成数据总计耗时 %s s " % (t1 - t0))
    writer = pd.ExcelWriter("text.xlsx")
    # Rows per slice, rounded up so that ``num`` slices cover every row.
    data_size = math.ceil(data.shape[0] / num)

    # NOTE(review): every request shares the same ``writer``; whether the
    # underlying Excel engine tolerates concurrent ``to_excel`` calls is not
    # established by this script — confirm before relying on the output.
    # ``threadpool.WorkRequest`` also accepts ``callback=`` / ``exc_callback=``
    # (e.g. ``call_back``) if per-request result handling is wanted.
    work_requests = [
        threadpool.WorkRequest(task_do_work, args=(
            '线程-{0}'.format(i), data, i * data_size, data_size, writer))
        for i in range(num)
    ]

    for req in work_requests:
        pool.putRequest(req)
    pool.wait()
    print("")
    end = datetime.now()
    print("线程运行完成: %s" % end)
    print("线程运行耗时: %s" % (end - t1))
    # ``ExcelWriter.save()`` was removed in pandas 2.0; ``close()`` saves the
    # workbook and is also available on older pandas versions.
    writer.close()
    save_end = datetime.now()
    print("保存文件: %s" % save_end)
    print("保存文件耗时: %s" % (save_end - end))
    print('总计耗时 %s s' % (datetime.now() - t0))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章