BSON.jl icon indicating copy to clipboard operation
BSON.jl copied to clipboard

Can't save large file due to conversion to Int32

Open under-Peter opened this issue 6 years ago • 2 comments

Hi! I'm using BSON to save a DataFrame with some custom types. For a large DataFrame (about 3.2 GB) this fails at the line indicated because, I think, 3.2 GB is larger than typemax(Int32) bytes (2147483647, i.e. about 2.1 GB).

Is BSON unable to save larger files or is the cast to Int32 unnecessary and could also be changed to Int64?

My exact trace is:

 ERROR: LoadError: InexactError: trunc(Int32, 2397876411)
  [1] throw_inexacterror(::Symbol, ::Any, ::Int64) at ./boot.jl:567
  [2] checked_trunc_sint at ./boot.jl:589 [inlined]
  [3] toInt32 at ./boot.jl:626 [inlined]
  [4] Type at ./boot.jl:716 [inlined]
  [5] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Pair{String,_1} where _1,1}) at [...]/.julia/            packages/BSON/kxdIr/src/write.jl:32
  [6] bson_primitive(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/       src/write.jl:37
  [7] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::String, ::Array{Any,1}) at [...]/.julia/packages/BSON/        kxdIr/src/write.jl:22
  [8] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Pair{String,_1} where _1,1}) at [...]/.julia/            packages/BSON/kxdIr/src/write.jl:28
  [9] bson_primitive(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/       src/write.jl:37
  [10] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Symbol, ::Array{Any,1}) at [...]/.julia/packages/BSON/       kxdIr/src/write.jl:22
  [11] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/    write.jl:28
  [12] bson_primitive at [...]/.julia/packages/BSON/kxdIr/src/write.jl:36 [inlined]
  [13] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Symbol, ::Dict{Symbol,Any}) at [...]/.julia/packages/        BSON/kxdIr/src/write.jl:22
  [14] bson_doc(::IOStream, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:28
  [15] bson_primitive(::IOStream, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:36
  [16] bson(::IOStream, ::Dict{Symbol,DataFrame}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:81
  [17] #14 at [...]/.julia/packages/BSON/kxdIr/src/write.jl:83 [inlined]
  [18] #open#294(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::getfield(BSON,                  Symbol("##14#15")){Dict{Symbol,DataFrame}}, ::String, ::Vararg{String,N} where N) at ./iostream.jl:369
  [19] open at ./iostream.jl:367 [inlined]
  [20] bson(::String, ::Dict{Symbol,DataFrame}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:83
  [21] top-level scope at none:0
  [22] include at ./boot.jl:317 [inlined]
  [23] include_relative(::Module, ::String) at ./loading.jl:1044
  [24] include(::Module, ::String) at ./sysimg.jl:29
  [25] exec_options(::Base.JLOptions) at ./client.jl:231
  [26] _start() at ./client.jl:425
 in expression starting [...]/runheadless.jl:67

Edit: It seems that, in general, all of the write and read routines cast the length of the object being written or read to an Int32. Maybe that's reasonable and I should just split up my DataFrame instead? Cheers

under-Peter avatar Nov 22 '18 13:11 under-Peter

This is an unfortunate limitation of BSON's array implementation. But we could work around it by splitting arrays into typemax(Int32) chunks.

MikeInnes avatar Nov 28 '18 00:11 MikeInnes

This is an unfortunate limitation of BSON's array implementation. But we could work around it by splitting arrays into typemax(Int32) chunks.

for example

"""
    partition_matrix(M; chunk_size=1024)

Split the matrix `M` into a grid of blocks of at most `chunk_size × chunk_size`
entries each.

Return a `Dict` mapping the 1-based block coordinates `(i, j)` to a copy of the
corresponding sub-matrix. Blocks in the last block-row / block-column are
smaller when the dimensions of `M` are not multiples of `chunk_size`.

# Throws
- `ArgumentError` if `chunk_size` is not positive.
"""
function partition_matrix(M; chunk_size=1024)
    chunk_size > 0 ||
        throw(ArgumentError("chunk_size must be positive, got $chunk_size"))
    dim1, dim2 = size(M)
    # `cld` rounds up, so a trailing partial block gets its own grid index.
    Nc1 = cld(dim1, chunk_size)
    Nc2 = cld(dim2, chunk_size)
    @debug "partition_matrix block grid" Nc1 Nc2
    # Index ranges covered by block-row `i` / block-column `j`, clipped to `M`.
    rows(i) = ((i - 1) * chunk_size + 1):min(dim1, i * chunk_size)
    cols(j) = ((j - 1) * chunk_size + 1):min(dim2, j * chunk_size)
    return Dict((i, j) => M[rows(i), cols(j)] for i in 1:Nc1 for j in 1:Nc2)
end

"""
    merge_matrix(M::Dict)

Reassemble a full matrix from a dictionary of blocks keyed by 1-based block
coordinates `(i, j)`, i.e. the inverse of a regular grid partition.

The block at `(1, 1)` defines the nominal block size; the block with the
largest coordinates defines the size of the trailing (possibly smaller)
block-row and block-column. Positions not covered by any block stay zero.

# Throws
- `ArgumentError` if `M` is empty.
"""
function merge_matrix(M::Dict)
    isempty(M) && throw(ArgumentError("cannot merge an empty dictionary of blocks"))
    # Grid extent: the largest block-row and block-column index present.
    Nc1 = maximum(first, keys(M))
    Nc2 = maximum(last, keys(M))
    r1, r2 = size(M[(Nc1, Nc2)])   # size of the trailing block
    c1, c2 = size(M[(1, 1)])       # nominal block size
    # All block-rows/columns but the last are full-sized.
    dim1 = c1 * (Nc1 - 1) + r1
    dim2 = c2 * (Nc2 - 1) + r2
    @debug "merge_matrix output size" dim1 dim2
    MAT = zeros(eltype(M[(1, 1)]), dim1, dim2)
    for ((i, j), block) in M
        rows = ((i - 1) * c1 + 1):min(i * c1, dim1)
        cols = ((j - 1) * c2 + 1):min(j * c2, dim2)
        MAT[rows, cols] .= block
    end
    return MAT
end

"""
    save_big_matrix(fn_common, M; Nparts=20)

Save the matrix `M` as `Nparts` BSON files named `fn_common * "_part_<i>.bson"`.

`M` is first split into blocks with `partition_matrix`; the blocks are then
distributed over the part files in sorted key order. Every part file stores
its share of blocks under `:matrix` and the complete sorted key list under
`:allkeys`. All `Nparts` files are always written, even if some hold no blocks.

# Throws
- `ArgumentError` if `Nparts` is not positive.
"""
function save_big_matrix(fn_common, M; Nparts=20)
    Nparts > 0 || throw(ArgumentError("Nparts must be positive, got $Nparts"))
    MD = partition_matrix(M)
    kMD = sort(collect(keys(MD)))
    # Blocks per part file, rounded up so that Nparts files hold every block.
    L = cld(length(kMD), Nparts)
    for i in 1:Nparts
        # An empty range (start > stop) simply yields a part with no blocks.
        part_keys = kMD[((i - 1) * L + 1):min(i * L, length(kMD))]
        m = Dict(k => MD[k] for k in part_keys)
        BSON.bson(fn_common * "_part_$(i).bson", matrix=m, allkeys=kMD)
    end
    return nothing
end

"""
    load_big_matrix(fn_common; Nparts=20)

Load a matrix stored as `Nparts` BSON files named
`fn_common * "_part_<i>.bson"` and reassemble it with `merge_matrix`.

Each part file is expected to contain a block dictionary under `:matrix`; the
first part's `:allkeys` entry is used to verify that every block was recovered.

# Throws
- `ArgumentError` if `Nparts` is not positive.
- `AssertionError` if a part file is missing or the recovered block keys do
  not match the stored key list.
"""
function load_big_matrix(fn_common; Nparts=20)
    Nparts > 0 || throw(ArgumentError("Nparts must be positive, got $Nparts"))
    part_file(i) = fn_common * "_part_$(i).bson"
    @assert all(isfile(part_file(i)) for i in 1:Nparts) "missing part file(s) for $(fn_common)"
    MD = Dict()
    kMD = nothing
    for i in 1:Nparts
        part = BSON.load(part_file(i))
        # The key list is identical in every part; take it from the first one
        # instead of reading that file from disk a second time.
        i == 1 && (kMD = part[:allkeys])
        merge!(MD, part[:matrix])
    end
    @assert sort(kMD) == sort(collect(keys(MD))) "loaded block keys do not match stored key list"
    return merge_matrix(MD)
end

algorithmx avatar Jul 22 '21 10:07 algorithmx