準備了一個csv文件,大約59萬行,14列,大小約61M,格式如下:
https://blog.csdn.net/wowotuo/article/details/109828399
上代碼。
一、寫入arrow文件
1、相關庫
using DataFrames
using CSV
using Arrow;
2、csv => dataframe
# Path of the input CSV file. A regular string literal with escaped backslashes
# is used so that `csv_path` is a plain `String`; the `s"..."` prefix would
# build a `SubstitutionString` (intended for `replace` patterns) instead.
csv_path = "C:\\Users\\songroom\\Desktop\\test.csv"
println("csv => DataFrame: ")
# Parse the CSV and materialise it as a DataFrame; `@time` reports the elapsed
# time and allocations of this step.
@time df = CSV.File(csv_path) |> DataFrame;
3、dataframe => arrow文件
println("df => arrow file ")
arrow_path = "C:\\Users\\songroom\\Desktop\\test.arrow"
# Open with mode "w" so the file is created when it does not exist and
# truncated when it does. Mode "r+" requires an existing file and does not
# truncate, so a shorter table would leave stale bytes from the previous
# contents at the end of the file.
# NOTE(review): the original comment said "w"/"w+" must not be used because of
# a memory leak — could not be confirmed from this file; verify on the target
# platform.
# The `do` form closes the stream even if `Arrow.write` throws.
open(arrow_path, "w") do io
    @time Arrow.write(io, df)
end
上面,完成arrow文件的生成。
二、讀出arrow文件
# Location of the Arrow file to load back.
arrow_path = "C:\\Users\\songroom\\Desktop\\test.arrow"
println("read arrow file ")
# Build an Arrow table from the file, then wrap it in a DataFrame.
@time df = Arrow.Table(arrow_path) |> DataFrame
三、整體代碼和輸出
using DataFrames
using CSV
using Arrow;
# Locations of the input CSV file and of the Arrow file generated from it.
csv_path = "C:\\Users\\songroom\\Desktop\\test.csv"
arrow_path = "C:\\Users\\songroom\\Desktop\\test.arrow"

# The helper functions are defined before the driver statements below so the
# script also runs in a fresh session (calling them first raises
# `UndefVarError`).

"""
    get_dataframe_from_csv(csv_path::String)

Parse the CSV file at `csv_path` with `CSV.File` and return it as a `DataFrame`.
"""
function get_dataframe_from_csv(csv_path::String)
    return CSV.File(csv_path) |> DataFrame
end

"""
    write_arrow_file(df::DataFrame, arrow_path::String)

Write `df` to the file at `arrow_path` with `Arrow.write`, creating the file
if it does not exist and replacing its contents if it does.
"""
function write_arrow_file(df::DataFrame, arrow_path::String)
    println("df => arrow file ")
    # Mode "w" creates/truncates the file. The previous mode "r+" failed when
    # the file was missing and left stale trailing bytes when the new data was
    # shorter than the old contents.
    # NOTE(review): the original comment said "w"/"w+" must not be used because
    # of a memory leak — could not be confirmed from this file; verify on the
    # target platform.
    return open(arrow_path, "w") do io
        Arrow.write(io, df)
    end
end

"""
    read_arrow_file(arrow_path::String)

Load the Arrow file at `arrow_path` via `Arrow.Table` and return it as a
`DataFrame`.
"""
function read_arrow_file(arrow_path::String)
    println("read arrow file ")
    return DataFrame(Arrow.Table(arrow_path))
end

# Driver: time each stage of the CSV -> DataFrame -> Arrow -> DataFrame round
# trip. The first call of each function also includes compilation time.
@time df = get_dataframe_from_csv(csv_path);
@time write_arrow_file(df, arrow_path)
@time df = read_arrow_file(arrow_path)
輸出:
julia> @run test
0.313795 seconds (590.24 k allocations: 163.234 MiB, 12.18% gc time)
df => arrow file
0.132979 seconds (594.55 k allocations: 35.252 MiB)
read arrow file
0.000664 seconds (900 allocations: 57.250 KiB)
read_arrow_file (generic function with 1 method)
四、其它
arrow文件和CSV文件比較:
arrow文件的讀取速度明顯更快,但arrow文件比較佔用硬盤空間。