def reduce_mem_usage(df):
start_mem = df.memory_usage().sum() / (1024 ** 3)
print('Memory usage of dataframe is {:.2f} GB'.format(start_mem))
for col in df.columns:
col_type = df[col].dtype
if col_type != object:
min_val = df[col].min()
max_val = df[col].max()
if str(col_type).startswith('int'):
type_list = [np.int8, np.int16, np.int32, np.int64]
for i in type_list:
if min_val >= np.iinfo(i).min and max_val <= np.iinfo(i).max:
df[col] = df[col].astype(i)
break
else:
type_list = [np.float16, np.float32, np.float64]
for i in type_list:
if min_val >= np.iinfo(i).min and max_val <= np.iinfo(i).max:
df[col] = df[col].astype(i)
break
end_mem = df.memory_usage().sum() / (1024 ** 3)
print('Memory usage of dataframe is {:.2f} GB'.format(end_mem))
return df
使用 pandas 的 read_csv 或 read_excel 讀取大文件時,讀取過程中可能出現 OOM(Out of Memory,內存溢出)。結合 `watch -n 0.1 free -hm` 與已讀取行數的佔比觀察,所需內存約超出實際可用內存 10%。此時可通過設置 chunksize 參數進行分塊讀取(例如每塊為總行數的 1/10),逐塊處理以避免一次性載入全部數據。