BSON.jl icon indicating copy to clipboard operation
BSON.jl copied to clipboard

Can't save large file due to conversion to Int32

Open under-Peter opened this issue 6 years ago • 2 comments

https://github.com/MikeInnes/BSON.jl/blob/7ed00f318f39cfa90a929927132ec252338745a6/src/write.jl#L32

Hi! I'm using BSON to save a DataFrame with some custom types. For a large DataFrame (about 3.2 GB) this fails at the line indicated because, I think, the size in bytes is larger than typemax(Int32) (2147483647, i.e. about 2.1 GB).

Is BSON unable to save larger files or is the cast to Int32 unnecessary and could also be changed to Int64?

My exact trace is:

 ERROR: LoadError: InexactError: trunc(Int32, 2397876411)
 Stacktrace:
  [1] throw_inexacterror(::Symbol, ::Any, ::Int64) at ./boot.jl:567
  [2] checked_trunc_sint at ./boot.jl:589 [inlined]
  [3] toInt32 at ./boot.jl:626 [inlined]
  [4] Type at ./boot.jl:716 [inlined]
  [5] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Pair{String,_1} where _1,1}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:32
  [6] bson_primitive(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:37
  [7] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::String, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:22
  [8] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Pair{String,_1} where _1,1}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:28
  [9] bson_primitive(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:37
  [10] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Symbol, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:22
  [11] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:28
  [12] bson_primitive at [...]/.julia/packages/BSON/kxdIr/src/write.jl:36 [inlined]
  [13] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Symbol, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:22
  [14] bson_doc(::IOStream, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:28
  [15] bson_primitive(::IOStream, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:36
  [16] bson(::IOStream, ::Dict{Symbol,DataFrame}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:81
  [17] #14 at [...]/.julia/packages/BSON/kxdIr/src/write.jl:83 [inlined]
  [18] #open#294(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::getfield(BSON, Symbol("##14#15")){Dict{Symbol,DataFrame}}, ::String, ::Vararg{String,N} where N) at ./iostream.jl:369
  [19] open at ./iostream.jl:367 [inlined]
  [20] bson(::String, ::Dict{Symbol,DataFrame}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:83
  [21] top-level scope at none:0
  [22] include at ./boot.jl:317 [inlined]
  [23] include_relative(::Module, ::String) at ./loading.jl:1044
  [24] include(::Module, ::String) at ./sysimg.jl:29
  [25] exec_options(::Base.JLOptions) at ./client.jl:231
  [26] _start() at ./client.jl:425
 in expression starting [...]/runheadless.jl:67

Edit: It seems that, in general, all of the write and read routines cast the length of the object being written or read to an Int32. Maybe that's reasonable and I should just split up my DataFrame instead? Cheers

under-Peter avatar Nov 22 '18 13:11 under-Peter

This is an unfortunate limitation of BSON's array implementation. But we could work around it by splitting arrays into typemax(Int32) chunks.

MikeInnes avatar Nov 28 '18 00:11 MikeInnes

> This is an unfortunate limitation of BSON's array implementation. But we could work around it by splitting arrays into typemax(Int32) chunks.

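For example, the following helpers split a large matrix into tiles, save the tiles across several BSON files, and reassemble the matrix on load: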


"""
    partition_matrix(M; chunk_size=1024)

Split the matrix `M` into rectangular tiles of at most `chunk_size × chunk_size`
elements and return them in a `Dict` keyed by the 1-based tile coordinates `(i, j)`.

Tiles in the last tile-row / tile-column are smaller when a dimension of `M` is not a
multiple of `chunk_size`. Every tile is a copy of the corresponding slice of `M`.
A matrix with a zero-length dimension yields an empty `Dict`.

# Throws
- `ArgumentError` if `chunk_size` is not positive.
"""
function partition_matrix(M; chunk_size=1024)
    # A zero chunk size would raise a bare DivideError and a negative one would silently
    # produce a single empty tile, so reject both up front.
    chunk_size > 0 ||
        throw(ArgumentError("chunk_size must be positive, got $chunk_size"))
    dim1, dim2 = size(M)
    # Ceiling division: a trailing partial tile still needs its own slot in the grid.
    Nc1 = cld(dim1, chunk_size)
    Nc2 = cld(dim2, chunk_size)
    @debug "partition_matrix tile grid" Nc1 Nc2
    return Dict(
        (i, j) => M[
            ((i - 1) * chunk_size + 1):min(dim1, i * chunk_size),
            ((j - 1) * chunk_size + 1):min(dim2, j * chunk_size),
        ] for i in 1:Nc1 for j in 1:Nc2
    )
end


"""
    merge_matrix(M::Dict)

Reassemble a dense matrix from a `Dict` of tiles keyed by 1-based tile coordinates
`(i, j)`.

The tile stored under `(1, 1)` defines the regular tile size, and the tile stored under
the largest row and column coordinates defines the size of the trailing (possibly
smaller) tiles. The result has the element type of the `(1, 1)` tile. Positions whose
tile is absent from `M` are left at zero.

# Throws
- `ArgumentError` if `M` is empty.
- `KeyError` if the `(1, 1)` tile or the bottom-right tile is missing.
"""
function merge_matrix(M::Dict)
    isempty(M) && throw(ArgumentError("cannot merge an empty dictionary of tiles"))
    # Only the extreme tile coordinates are needed, so no sorted key list is built.
    Nc1 = maximum(first, keys(M))
    Nc2 = maximum(last, keys(M))
    r1, r2 = size(M[(Nc1, Nc2)])  # size of the trailing tile
    c1, c2 = size(M[(1, 1)])      # size of a regular tile
    # With a single tile along a dimension this reduces to the trailing-tile size.
    dim1 = c1 * (Nc1 - 1) + r1
    dim2 = c2 * (Nc2 - 1) + r2
    @debug "merge_matrix output size" dim1 dim2
    MAT = zeros(eltype(M[(1, 1)]), dim1, dim2)
    for ((i, j), tile) in M
        rows = ((i - 1) * c1 + 1):min(i * c1, dim1)
        cols = ((j - 1) * c2 + 1):min(j * c2, dim2)
        MAT[rows, cols] .= tile
    end
    return MAT
end


"""
    save_big_matrix(fn_common, M; Nparts=20, chunk_size=1024)

Split `M` into tiles with `partition_matrix` and write them to `Nparts` BSON files named
`"<fn_common>_part_<i>.bson"` for `i = 1, …, Nparts`.

Each file stores two entries: `matrix`, a `Dict` holding that part's share of the tiles,
and `allkeys`, the sorted list of every tile key. Files beyond the last tile are still
written, with an empty `matrix`, so exactly `Nparts` files always exist.

# Keywords
- `Nparts`: number of files to write; must be at least 1.
- `chunk_size`: tile edge length forwarded to `partition_matrix`.

# Throws
- `ArgumentError` if `Nparts` is smaller than 1.
"""
function save_big_matrix(fn_common, M; Nparts=20, chunk_size=1024)
    # Nparts == 0 would divide by zero and a negative value would silently write nothing.
    Nparts >= 1 || throw(ArgumentError("Nparts must be at least 1, got $Nparts"))
    MD = partition_matrix(M; chunk_size=chunk_size)
    kMD = sort(collect(keys(MD)))
    # Ceiling division spreads the tiles evenly; `÷ Nparts + 1` over-sized the slices
    # whenever the tile count was a multiple of Nparts, leaving trailing files empty.
    L = cld(length(kMD), Nparts)
    for i in 1:Nparts
        # An out-of-range start yields an empty range, hence an empty part.
        part_keys = kMD[((i - 1) * L + 1):min(i * L, length(kMD))]
        m = Dict(k => MD[k] for k in part_keys)
        BSON.bson(fn_common * "_part_$(i).bson", matrix=m, allkeys=kMD)
    end
    return nothing
end


"""
    load_big_matrix(fn_common; Nparts=20)

Read the `Nparts` BSON files `"<fn_common>_part_<i>.bson"` (`i = 1, …, Nparts`), combine
the tiles stored under each file's `matrix` entry, and return the matrix reassembled by
`merge_matrix`.

The `allkeys` entry of the first part is compared against the keys of the combined tiles
as a completeness check before merging.

# Throws
- `ArgumentError` if `Nparts` is smaller than 1.
- `AssertionError` if any part file is missing, or if the combined tile keys differ from
  the recorded `allkeys`.
"""
function load_big_matrix(fn_common; Nparts=20)
    Nparts >= 1 || throw(ArgumentError("Nparts must be at least 1, got $Nparts"))
    part_files = [fn_common * "_part_$(i).bson" for i in 1:Nparts]
    # Raised explicitly (not via @assert) so the check always runs and names the missing
    # files; the exception type stays AssertionError for existing callers.
    missing_files = filter(!isfile, part_files)
    isempty(missing_files) ||
        throw(AssertionError("missing part files: " * join(missing_files, ", ")))
    # Read the first part once and take both its tiles and the recorded key list from it.
    first_part = BSON.load(part_files[1])
    kMD = first_part[:allkeys]
    MD = Dict()
    merge!(MD, first_part[:matrix])
    for fn_i in Iterators.drop(part_files, 1)
        # In-place merge avoids re-copying the accumulated tiles for every part.
        merge!(MD, BSON.load(fn_i)[:matrix])
    end
    sort(kMD) == sort(collect(keys(MD))) ||
        throw(AssertionError("tile keys in the part files do not match the stored allkeys"))
    return merge_matrix(MD)
end




algorithmx avatar Jul 22 '21 10:07 algorithmx