pandas合併excel

工作中有個需求:需要把100多個excel文件合併成一個,並按照指定的列排序。查了一下,用pandas處理最好。
備註:111個excel文件,總共有200多萬條數據,101M數據
代碼:

import os
import sys
import pandas as pd
import datetime

# Directory two levels above this file; appended to the import search path.
BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)
sys.path.append(BASE_DIR)

# Directories that contain the xlsx files to merge.
EXECL_DIRS = ["xiaoying", "xiaoshu"]

# Directory key -> base name used for the merged output workbook.
EXECL_FIELS_MAP = {
    "xueqian": "學前包含圖片的題目",
    "xiaoyu": "小語包含圖片的題目",
    "xiaoying": "小英包含圖片的題目",
    "xiaoshu": "小數包含圖片的題目",
}


def combine_execl_files():
    """
    Merge the Excel files of every configured directory into sorted workbooks.

    For each name in ``EXECL_DIRS`` the files found in
    ``BASE_DIR/tools/<name>/`` are read with pandas, every value of the
    ``圖片url`` column is wrapped in double quotes (unless it already starts
    with one), the frames are concatenated and sorted by that column, and the
    result is written to ``BASE_DIR/tools/<mapped name>_<i>.xlsx`` in chunks
    of 500000 rows, ``i`` starting at 1.

    A file that cannot be read or processed is reported on stdout and
    skipped. A directory that yields no data at all is reported and skipped
    instead of aborting the whole run.

    :return: None
    """
    def deal_url(val):
        # Wrap the value in double quotes unless it already starts with one.
        # Non-string cells raise here; the caller then skips that file.
        if not val.startswith('"'):
            val = '"'+val+'"'
        return val

    # Rows per output workbook; the merged data is written in batches because
    # it is too large for a single sheet (original note: over 2 million rows).
    step = 500000

    for item in EXECL_DIRS:
        dfs = []
        print("begin to deal %s" % item)
        start_time = datetime.datetime.now()
        path_dir = os.path.join(BASE_DIR, 'tools/%s/' % item)
        for file_name in os.listdir(path_dir):
            file_path = os.path.join(path_dir, file_name)
            # Best effort per file: anything that is not a readable workbook
            # with the expected column is logged and left out of the merge.
            try:
                df = pd.read_excel(file_path)
                if not df.empty:
                    df['圖片url'] = df['圖片url'].apply(deal_url)
                    dfs.append(df)
            except Exception as exc:
                print("error file_path: %s     error:%s" % (file_path, str(exc)))

        print("total df number: %s" % len(dfs))
        if not dfs:
            # pd.concat raises ValueError on an empty list; move on to the
            # next directory rather than crashing.
            print("no data found in %s, skip" % path_dir)
            continue

        new_df = pd.concat(dfs, ignore_index=True)
        new_df = new_df.sort_values(by=['圖片url'])      # sort by the URL column
        print("total rows: %s   total cost time:%s " % (len(new_df), datetime.datetime.now()-start_time))

        df_cnt = len(new_df)
        # Ceiling division: an exact multiple of `step` must not produce an
        # extra, empty trailing workbook.
        numbers = (df_cnt + step - 1) // step
        for i in range(1, numbers+1):
            start = (i-1) * step
            end = i * step
            print("start:%s   end:%s " % (start, end))
            # Positional slice: after sorting, the index labels are no longer
            # in row order.
            sub_df = new_df.iloc[start:end]
            a_file_name = os.path.join(BASE_DIR, 'tools/%s_%s.xlsx' % (EXECL_FIELS_MAP.get(item), i))
            # The `encoding` argument was removed from to_excel in pandas 2.0.
            sub_df.to_excel(a_file_name, index=False)
            print("write data to excel success !!!")


# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    combine_execl_files()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章