BSON.jl
Can't save large file due to conversion to Int32
https://github.com/MikeInnes/BSON.jl/blob/7ed00f318f39cfa90a929927132ec252338745a6/src/write.jl#L32
Hi!
I'm using BSON to save a DataFrame with some custom types. For a large DataFrame (about 3.2 GB) this fails at the line indicated because, I think, 3.2 GB is larger than `typemax(Int32)` (about 2.1 GB).
Is BSON unable to save larger files, or is the cast to `Int32` unnecessary, so that it could be changed to `Int64`?
My exact trace is:
ERROR: LoadError: InexactError: trunc(Int32, 2397876411)
Stacktrace:
[1] throw_inexacterror(::Symbol, ::Any, ::Int64) at ./boot.jl:567
[2] checked_trunc_sint at ./boot.jl:589 [inlined]
[3] toInt32 at ./boot.jl:626 [inlined]
[4] Type at ./boot.jl:716 [inlined]
[5] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Pair{String,_1} where _1,1}) at [...]/.julia/ packages/BSON/kxdIr/src/write.jl:32
[6] bson_primitive(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/ src/write.jl:37
[7] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::String, ::Array{Any,1}) at [...]/.julia/packages/BSON/ kxdIr/src/write.jl:22
[8] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Pair{String,_1} where _1,1}) at [...]/.julia/ packages/BSON/kxdIr/src/write.jl:28
[9] bson_primitive(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Array{Any,1}) at [...]/.julia/packages/BSON/kxdIr/ src/write.jl:37
[10] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Symbol, ::Array{Any,1}) at [...]/.julia/packages/BSON/ kxdIr/src/write.jl:22
[11] bson_doc(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/ write.jl:28
[12] bson_primitive at [...]/.julia/packages/BSON/kxdIr/src/write.jl:36 [inlined]
[13] bson_pair(::Base.GenericIOBuffer{Array{UInt8,1}}, ::Symbol, ::Dict{Symbol,Any}) at [...]/.julia/packages/ BSON/kxdIr/src/write.jl:22
[14] bson_doc(::IOStream, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:28
[15] bson_primitive(::IOStream, ::Dict{Symbol,Any}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:36
[16] bson(::IOStream, ::Dict{Symbol,DataFrame}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:81
[17] #14 at [...]/.julia/packages/BSON/kxdIr/src/write.jl:83 [inlined]
[18] #open#294(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::getfield(BSON, Symbol("##14#15")){Dict{Symbol,DataFrame}}, ::String, ::Vararg{String,N} where N) at ./iostream.jl:369
[19] open at ./iostream.jl:367 [inlined]
[20] bson(::String, ::Dict{Symbol,DataFrame}) at [...]/.julia/packages/BSON/kxdIr/src/write.jl:83
[21] top-level scope at none:0
[22] include at ./boot.jl:317 [inlined]
[23] include_relative(::Module, ::String) at ./loading.jl:1044
[24] include(::Module, ::String) at ./sysimg.jl:29
[25] exec_options(::Base.JLOptions) at ./client.jl:231
[26] _start() at ./client.jl:425
in expression starting [...]/runheadless.jl:67
edit:
So it seems like, in general, all `write` and `read` calls cast the length of the object being written or read to an `Int32`.
Maybe that's reasonable and I should just split up my DataFrame instead?
Cheers
This is an unfortunate limitation of BSON's array implementation. But we could work around it by splitting arrays into chunks of at most `typemax(Int32)` bytes.
> This is an unfortunate limitation of BSON's array implementation. But we could work around it by splitting arrays into chunks of at most `typemax(Int32)` bytes.
for example
"""
    partition_matrix(M; chunk_size=1024)

Split matrix `M` into rectangular chunks of at most `chunk_size` rows by
`chunk_size` columns. Return a `Dict` mapping 1-based chunk coordinates
`(i, j)` to the corresponding sub-matrix; chunks on the bottom/right edge
may be smaller than `chunk_size`.
"""
function partition_matrix(M; chunk_size=1024)
    dim1, dim2 = size(M)
    # Number of chunks along each dimension: ceiling division.
    Nc1 = cld(dim1, chunk_size)
    Nc2 = cld(dim2, chunk_size)
    @debug "partition_matrix chunk grid" Nc1 Nc2
    return Dict(
        (i, j) => M[(i-1)*chunk_size+1:min(dim1, i*chunk_size),
                    (j-1)*chunk_size+1:min(dim2, j*chunk_size)]
        for i = 1:Nc1 for j = 1:Nc2
    )
end
"""
    merge_matrix(M::Dict)

Reassemble a matrix from the chunk dictionary produced by `partition_matrix`.
Keys are `(i, j)` chunk coordinates; the overall size is inferred from the
size of the `(1, 1)` chunk (interior chunk size) and the last chunk, which
may be smaller.
"""
function merge_matrix(M::Dict)
    # Chunk-grid extent; no need to sort the keys just to take a maximum.
    Nc1 = maximum(first, keys(M))
    Nc2 = maximum(last, keys(M))
    # Last chunk may be ragged; interior chunks share the size of (1, 1).
    r1, r2 = size(M[(Nc1, Nc2)])
    c1, c2 = size(M[(1, 1)])
    dim1 = Nc1 == 1 ? r1 : c1 * (Nc1 - 1) + r1
    dim2 = Nc2 == 1 ? r2 : c2 * (Nc2 - 1) + r2
    @debug "merge_matrix output size" dim1 dim2
    MAT = zeros(eltype(M[(1, 1)]), dim1, dim2)
    for (k, v) in M
        MAT[(k[1]-1)*c1+1:min(k[1]*c1, dim1), (k[2]-1)*c2+1:min(k[2]*c2, dim2)] .= v
    end
    return MAT
end
"""
    save_big_matrix(fn_common, M; Nparts=20, chunk_size=1024)

Work around BSON's Int32 document-size limit by partitioning `M` into chunks
(see `partition_matrix`) and spreading them over `Nparts` files named
`<fn_common>_part_<i>.bson`. Every file also stores the full chunk-key list
under `:allkeys` so `load_big_matrix` can verify completeness.
"""
function save_big_matrix(fn_common, M; Nparts=20, chunk_size=1024)
    MD = partition_matrix(M; chunk_size=chunk_size)
    kMD = sort(collect(keys(MD)))
    # Chunks per file, rounded up so every key is covered; the old
    # `length ÷ Nparts + 1` over-sized this and left trailing files empty
    # whenever the count divided evenly.
    L = cld(length(kMD), Nparts)
    for i = 1:Nparts
        m = Dict(k => MD[k] for k ∈ kMD[((i-1)*L+1):min(i*L, length(kMD))])
        BSON.bson(fn_common * "_part_$(i).bson", matrix=m, allkeys=kMD)
    end
    return
end
"""
    load_big_matrix(fn_common; Nparts=20)

Load a matrix previously written by `save_big_matrix` from the `Nparts`
files `<fn_common>_part_<i>.bson`, verify that every chunk key recorded in
the first part is present, and reassemble the matrix with `merge_matrix`.

Throws `ArgumentError` if any part file is missing, and errors if the loaded
chunk keys disagree with the stored key list.
"""
function load_big_matrix(fn_common; Nparts=20)
    # Validate inputs explicitly: @assert may be compiled out, so it must
    # not be used to check user-supplied paths.
    absent = [fn_common * "_part_$(i).bson" for i = 1:Nparts if !isfile(fn_common * "_part_$(i).bson")]
    isempty(absent) ||
        throw(ArgumentError("missing part files: " * join(absent, ", ")))
    MD = Dict()
    for i = 1:Nparts
        fn_i = fn_common * "_part_$(i).bson"
        # merge! mutates MD in place instead of rebuilding the Dict each pass.
        merge!(MD, BSON.load(fn_i)[:matrix])
    end
    # Every part stores the full key list; use part 1's copy as the reference.
    kMD = BSON.load(fn_common * "_part_1.bson")[:allkeys]
    sort(kMD) == sort(collect(keys(MD))) ||
        error("loaded chunk keys do not match the stored key list")
    return merge_matrix(MD)
end