Julia: arrow,一種革命性的數據格式

準備了一個csv文件,大約59萬行,14列,大小約61MB,格式如下:

https://blog.csdn.net/wowotuo/article/details/109828399

上代碼。

一、寫入arrow文件

1、相關庫

using DataFrames
using CSV
using Arrow;

2、csv => dataframe

# Path to the source CSV, written as an ordinary escaped String literal. The
# `s"..."` prefix builds a SubstitutionString (meant for regex replacement
# patterns) rather than a plain String.
csv_path = "C:\\Users\\songroom\\Desktop\\test.csv"
println("csv => DataFrame: ")
# Parse the CSV and materialize it as a DataFrame; @time reports elapsed time
# and allocations (the first call in a session also includes compilation).
@time df = CSV.File(csv_path) |> DataFrame;

3、dataframe => arrow文件

println("df => arrow file ")
arrow_path = "C:\\Users\\songroom\\Desktop\\test.arrow"
# "r+" fails when the file does not exist yet, so fall back to "w" to create it.
# An existing file is still opened with "r+": the original author reports that
# "w"/"w+" leads to a memory leak in that case.
# NOTE(review): "r+" does not truncate, so bytes of a longer previous file stay
# after the newly written data — confirm this is acceptable.
open(arrow_path, isfile(arrow_path) ? "r+" : "w") do io
    # The do-block closes `io` even if the write throws.
    @time Arrow.write(io, df)
end

上面,完成arrow文件的生成。

二、讀出arrow文件

println("read arrow file ")
arrow_path = "C:\\Users\\songroom\\Desktop\\test.arrow"
# Open the Arrow file as a table, convert it to a DataFrame, and time the call.
@time df = Arrow.Table(arrow_path) |> DataFrame

三、整體代碼和輸出

using DataFrames
using CSV
using Arrow;

"""
    get_dataframe_from_csv(csv_path::AbstractString)

Read the CSV file at `csv_path` and return its contents as a `DataFrame`.
"""
function get_dataframe_from_csv(csv_path::AbstractString)
    return DataFrame(CSV.File(csv_path))
end

"""
    write_arrow_file(df::DataFrame, arrow_path::AbstractString)

Write `df` in Arrow format to the file at `arrow_path`, creating the file when
it does not exist yet. Returns whatever `Arrow.write` returns.
"""
function write_arrow_file(df::DataFrame, arrow_path::AbstractString)
    println("df => arrow file ")
    # "r+" fails on a missing file, so use "w" to create it. An existing file is
    # still opened with "r+": the original author reports that "w"/"w+" leads
    # to a memory leak in that case.
    # NOTE(review): "r+" does not truncate, so bytes of a longer previous file
    # stay after the newly written data — confirm this is acceptable.
    mode = isfile(arrow_path) ? "r+" : "w"
    return open(arrow_path, mode) do io
        Arrow.write(io, df)
    end
end

"""
    read_arrow_file(arrow_path::AbstractString)

Load the Arrow file at `arrow_path` and return it as a `DataFrame`.
"""
function read_arrow_file(arrow_path::AbstractString)
    println("read arrow file ")
    return DataFrame(Arrow.Table(arrow_path))
end

# The driver runs after the definitions above: calling the functions before
# they are defined raises UndefVarError in a fresh session.
csv_path = "C:\\Users\\songroom\\Desktop\\test.csv"
arrow_path = "C:\\Users\\songroom\\Desktop\\test.arrow"
@time df = get_dataframe_from_csv(csv_path);
@time write_arrow_file(df, arrow_path)
@time df = read_arrow_file(arrow_path)

輸出:

julia> @run test
  0.313795 seconds (590.24 k allocations: 163.234 MiB, 12.18% gc time)
df => arrow file
  0.132979 seconds (594.55 k allocations: 35.252 MiB)
read arrow file
  0.000664 seconds (900 allocations: 57.250 KiB)
read_arrow_file (generic function with 1 method)

[圖片]

四、其它

arrow文件和CSV文件比較:
[圖片]

arrow的讀寫速度絕對快,但arrow文件比較喫硬盤(佔用的磁盤空間較大)。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章