StrideArraysCore.jl
StrideArraysCore.jl copied to clipboard
Implement `SmallStrideArray`
This is a note to myself or anyone else who wants to take a stab at implementing an array type which stores up to N values as fixed size bits and then if the array gets too big it'll instead store them on the heap. From https://discourse.julialang.org/t/small-vectors/97121/5 a basic implementation looks like:
using StrideArrays
# Backing storage for a small-size-optimized array: up to `SpillSize` bytes
# are held inline in a fixed-size tuple, and anything larger "spills" to a
# heap-allocated Vector{UInt8}. `mutable` so the object has a stable address
# that `pointer_from_objref` can take.
mutable struct SmallStore{SpillSize}
# Inline byte buffer; contents are uninitialized until written through a pointer.
inline::NTuple{SpillSize, UInt8}
# `nothing` when the data fits inline, otherwise the heap allocation.
outline::Union{Vector{UInt8}, Nothing}
end;
# Build a StrideArray of element type `T` with the given `size`, backed by a
# SmallStore: inline when the total byte count fits in `SpillSize`, heap
# Vector otherwise. The store is carried by the StrideArray so the GC keeps
# the memory alive.
function smallarray(::Type{T}, ::StaticInt{SpillSize}, size::Integer...) where {T, SpillSize}
# Raw byte storage is only valid for bits types. NOTE(review): @assert may be
# compiled out at higher optimization levels; not suitable for input validation.
@assert isbitstype(T)
# Total size in *bytes* — in this first version SpillSize is a byte count.
N = prod(size) * sizeof(T)
# Ref trick to obtain an uninitialized NTuple of bytes.
inline = Ref{NTuple{SpillSize, UInt8}}()[]
if N <= SpillSize
outline = nothing
store = SmallStore{SpillSize}(inline, outline)
# Point into the mutable store itself — assumes `inline` sits at offset 0
# of the object; TODO confirm layout.
ptr = pointer_from_objref(store)
else
outline = Vector{UInt8}(undef, N)
store = SmallStore{SpillSize}(inline, outline)
ptr = pointer(outline)
end
# NOTE(review): the two branches yield Ptr{Nothing} vs Ptr{UInt8}; union
# splitting handles the small union (see discussion later in the thread).
ptrA = PtrArray(Ptr{T}(ptr), size)
strA = StrideArray(ptrA, store)
end;
Now here's a benchmark function comparing this smallarray to an array without a static lower size bound:
# Benchmark driver: allocate a small-size-optimized array whose length is
# usually 10 (fits inline) and occasionally 10000 (spills to the heap), fill
# it with 1:L, and sum it.
function foo_sa()
    spill = static(10 * 8)
    # 95% of calls stay within the inline capacity; 5% force a heap spill.
    len = rand() < 0.95 ? 10 : 10000
    A = smallarray(Float64, spill, len)
    A .= (1:len)
    return sum(A)
end;
# Baseline benchmark: identical workload, but with a plain dynamically sized
# StrideArray (no static lower size bound).
function foo()
    len = rand() < 0.95 ? 10 : 10000
    A = StrideArray{Float64}(undef, len)
    A .= (1:len)
    return sum(A)
end;
julia> @benchmark foo_sa()
BenchmarkTools.Trial: 10000 samples with 997 evaluations.
Range (min … max): 138.866 ns … 3.697 μs ┊ GC (min … max): 0.00% … 78.22%
Time (median): 273.836 ns ┊ GC (median): 0.00%
Time (mean ± σ): 321.531 ns ± 176.710 ns ┊ GC (mean ± σ): 10.40% ± 15.50%
▃▆██▇▄▂
▁▁▂▃▅█████████▇▅▄▃▃▂▂▂▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
139 ns Histogram: frequency by time 932 ns <
Memory estimate: 2.05 KiB, allocs estimate: 1.
julia> @benchmark foo()
BenchmarkTools.Trial: 9078 samples with 994 evaluations.
Range (min … max): 269.175 ns … 2.844 μs ┊ GC (min … max): 0.00% … 80.24%
Time (median): 475.920 ns ┊ GC (median): 0.00%
Time (mean ± σ): 551.678 ns ± 220.005 ns ┊ GC (mean ± σ): 11.35% ± 15.79%
▁▆█▇▅
▁▁▁▁▃▅██████▆▅▃▂▂▂▂▂▂▂▃▃▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
269 ns Histogram: frequency by time 1.53 μs <
Memory estimate: 4.77 KiB, allocs estimate: 1.
I don't think this works. I made a few tweaks:
using StrideArrays
# Second take on the backing store: instead of a `Union` with `Nothing`, the
# `outline` field is simply left *undefined* when the data fits inline — the
# incomplete inner constructor creates the object with both fields unset.
mutable struct SmallStore{SpillSize}
inline::NTuple{SpillSize, UInt8}
outline::Vector{UInt8}
SmallStore{SpillSize}() where {SpillSize} = new{SpillSize}()
end;
# in case you want to check
# A store is "small" (inline) precisely when `outline` was never assigned —
# the spill path is the only place that sets it. The original
# `isdefined(x, :outline)` had the test inverted: it returned `true` for the
# spilled (heap) case.
is_small(x::SmallStore) = !isdefined(x, :outline)
# Allocate a StrideArray of `T` with `N = prod(size)` elements backed by a
# SmallStore. Small case: point straight into the mutable store's inline
# tuple. Large case: allocate a byte Vector into `outline`. The StrideArray
# carries `store` so the GC keeps either backing alive.
@inline function smallarray(::Type{T}, ::StaticInt{SpillSize}, size::Integer...) where {T, SpillSize}
# Raw byte storage requires a bits type.
@assert isbitstype(T)
N = prod(size)
# Here SpillSize is an *element* count; the store itself is sized in bytes.
store = SmallStore{SpillSize*sizeof(T)}()
# Explicit Ptr{T} annotation unifies the Ptr{Nothing}/Ptr{UInt8} branch types
# (see the note below about being explicit rather than relying on splitting).
ptr::Ptr{T} = if N <= SpillSize
# assumes `inline` is the first field at offset 0 — TODO confirm layout
Ptr{T}(pointer_from_objref(store))
else
outline = store.outline = Vector{UInt8}(undef, N*sizeof(T))
Ptr{T}(pointer(outline))
end
ptrA = PtrArray(ptr, size)
strA = StrideArray(ptrA, store)
end
The performance benefit is only because allocating arrays in Julia is slower than heap allocating other kinds of mutable structs. It is still heap allocated.
Testing:
# Baseline: dynamically sized StrideArray. Length is 10 with probability `p`,
# otherwise 10000; fill with 1:L and sum.
function foo(p::Float64=0.95)
    len = rand() < p ? 10 : 10000
    A = StrideArray{Float64}(undef, len)
    A .= (1:len)
    return sum(A)
end;
# Small-size-optimized version: inline capacity of 10 elements, so the common
# (length-10) case stays inline and the rare (length-10000) case spills.
function foo_sa(p::Float64=0.95)
    capacity = static(10)
    # Inline with probability p; spill with probability 1 - p.
    len = rand() < p ? 10 : 10000
    A = smallarray(Float64, capacity, len)
    A .= (1:len)
    return sum(A)
end;
Note also that pointer_from_objref returns Ptr{Nothing} while pointer(::Vector{UInt8}) returns Ptr{UInt8}. Union splitting handles it, but I prefer to be explicit.
Results:
julia> @benchmark foo_sa(1.0)
BenchmarkTools.Trial: 10000 samples with 995 evaluations.
Range (min … max): 25.801 ns … 3.244 μs ┊ GC (min … max): 0.00% … 98.27%
Time (median): 27.859 ns ┊ GC (median): 0.00%
Time (mean ± σ): 32.512 ns ± 72.335 ns ┊ GC (mean ± σ): 7.66% ± 3.51%
▁▄▅▆▇▇█▆▆▄▂ ▃▆▇▇▇▆▄▂ ▁▁ ▃
████████████▇▆▆▆▇▇▇▇▇██▇▇█▇█▇▇██████████████▇▇█▆▆▇▇▇▇▇▆▅▇▇▅ █
25.8 ns Histogram: log(frequency) by time 38.3 ns <
Memory estimate: 96 bytes, allocs estimate: 1.
julia> @benchmark foo(1.0)
BenchmarkTools.Trial: 10000 samples with 990 evaluations.
Range (min … max): 44.371 ns … 5.864 μs ┊ GC (min … max): 0.00% … 98.66%
Time (median): 54.521 ns ┊ GC (median): 0.00%
Time (mean ± σ): 60.849 ns ± 149.926 ns ┊ GC (mean ± σ): 10.62% ± 4.36%
▁▄▄ ▄▇██▆▄▂▂▂▂▁▂▁▁▁▁▁▁▁ ▂
▅█████▇▅▅▅▄▅▆▅▅▄▄▅▅▅▁▅▄▅▄▄▅██████████████████████████▇█▇▇▇██ █
44.4 ns Histogram: log(frequency) by time 64.2 ns <
Memory estimate: 144 bytes, allocs estimate: 1.
Which is the same difference (roughly 20ns) as
julia> @benchmark Ref{NTuple{80,UInt8}}()
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
Range (min … max): 5.278 ns … 4.844 μs ┊ GC (min … max): 0.00% … 99.41%
Time (median): 6.266 ns ┊ GC (median): 0.00%
Time (mean ± σ): 10.707 ns ± 86.219 ns ┊ GC (mean ± σ): 24.44% ± 3.42%
▁█▆
▂▂▂▂▂▅███▇▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▂▁▂▂▂▂▂▃▃▄▅▆▆▆▅▅▄▃▃▃▂▂ ▃
5.28 ns Histogram: frequency by time 11.6 ns <
Memory estimate: 96 bytes, allocs estimate: 1.
julia> @benchmark Vector{UInt8}(undef, 80)
BenchmarkTools.Trial: 10000 samples with 996 evaluations.
Range (min … max): 24.496 ns … 5.879 μs ┊ GC (min … max): 0.00% … 99.21%
Time (median): 33.962 ns ┊ GC (median): 0.00%
Time (mean ± σ): 40.066 ns ± 147.741 ns ┊ GC (mean ± σ): 15.86% ± 4.41%
▂▄▆█▆▃
▁▁▁▁▁▁▁▂▂▂▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▄████████▆▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
24.5 ns Histogram: frequency by time 40.6 ns <
Memory estimate: 144 bytes, allocs estimate: 1.
This:
# Manual-memory variant: malloc a raw buffer, wrap it in a PtrArray, fill and
# sum it, then free. NOTE: the original rebound the name `p` — the Float64
# probability argument — to the malloc'd pointer, a type-unstable shadowing
# bug; a distinct local `buf` avoids it.
function foo_malloc(p::Float64=0.95)
    if rand() < p
        L = 10
    else
        L = 10000
    end
    buf = Libc.malloc(sizeof(Float64) * L)
    A = PtrArray(Ptr{Float64}(buf), (L,))
    A .= (1:L)
    ret = sum(A)
    # Must free before returning; the GC knows nothing about this buffer.
    Libc.free(buf)
    return ret
end;
Which is basically what slow automatic memory management in C++ is equivalent to -- except we need to do it manually in Julia, and is what SmallVector in C++ tries to avoid -- is still at least as fast as either foo or foo_sa.
They roughly match with p = 1.0, while the situation gets quite bad with smaller values:
julia> @benchmark foo_malloc(0.95)
BenchmarkTools.Trial: 8443 samples with 994 evaluations.
Range (min … max): 77.147 ns … 204.091 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 117.320 ns ┊ GC (median): 0.00%
Time (mean ± σ): 117.497 ns ± 12.172 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▂▂▂▃▄▄▅▆▆▇▇██▅███▇▅▄▄▅▂▁
▂▂▂▁▂▂▂▂▂▃▃▃▃▄▄▄▅▅▇▆▇██████████████████████████▇▇▇▆▆▅▄▄▄▃▃▃▃▃ ▆
77.1 ns Histogram: frequency by time 148 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark foo_sa(0.95)
BenchmarkTools.Trial: 2660 samples with 995 evaluations.
Range (min … max): 192.502 ns … 3.925 μs ┊ GC (min … max): 0.00% … 90.53%
Time (median): 319.323 ns ┊ GC (median): 0.00%
Time (mean ± σ): 373.978 ns ± 233.384 ns ┊ GC (mean ± σ): 13.24% ± 16.25%
▁▆▇█▅▁
▂▃▄▆██████▆▄▃▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▂▂▂▂▂▃▃▃▃▃ ▃
193 ns Histogram: frequency by time 1.27 μs <
Memory estimate: 2.45 KiB, allocs estimate: 1.
Malloc of course also scales much better with multiple threads than Julia's GC.
Manual memory management is both much more important, and much more difficult, in Julia than it is in C++.
All that said, Julia seems to crush C++ here, at least with my initial implementation. C++:
------------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------------
BM_VectorRandSum/95 227 ns 226 ns 3092616
BM_VectorRandSum/96 198 ns 198 ns 3528310
BM_VectorRandSum/97 171 ns 171 ns 4092251
BM_VectorRandSum/98 142 ns 142 ns 4933283
BM_VectorRandSum/99 114 ns 114 ns 6154526
BM_VectorRandSum/100 81.2 ns 81.2 ns 8605086
BM_VectorRandSumStd/95 288 ns 288 ns 2434628
BM_VectorRandSumStd/96 249 ns 249 ns 2808327
BM_VectorRandSumStd/97 211 ns 211 ns 3322150
BM_VectorRandSumStd/98 173 ns 172 ns 4056368
BM_VectorRandSumStd/99 134 ns 134 ns 5211752
BM_VectorRandSumStd/100 91.0 ns 90.9 ns 7695345
The Std suffix refers to std::vector, while without it is the implementation I wrote for LoopModels.
The number (e.g. /95) is the numerator for p, the denominator is 100. I.e., 95 means p=0.95, and 97 means p=0.97.
Julia:
julia> for p = 0.95:0.01:1.0
@btime foo_sa($p)
end
180.275 ns (1 allocation: 2.45 KiB)
139.010 ns (1 allocation: 1.74 KiB)
97.402 ns (1 allocation: 1.04 KiB)
61.998 ns (1 allocation: 579 bytes)
31.178 ns (1 allocation: 96 bytes)
25.810 ns (1 allocation: 96 bytes)
julia> for p = 0.95:0.01:1.0
@btime foo($p)
end
185.083 ns (1 allocation: 2.35 KiB)
149.837 ns (1 allocation: 1.56 KiB)
118.411 ns (1 allocation: 1.17 KiB)
84.967 ns (1 allocation: 628 bytes)
57.507 ns (1 allocation: 144 bytes)
44.419 ns (1 allocation: 144 bytes)
The above was with Clang, to make for an "apples to apples" comparison (although it's LLVM for Clang vs LoopVectorization.jl for Julia). GCC does better, at least for the predominantly small case:
------------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------------
BM_VectorRandSum/95 253 ns 253 ns 2777074
BM_VectorRandSum/96 205 ns 205 ns 3415392
BM_VectorRandSum/97 157 ns 157 ns 4392675
BM_VectorRandSum/98 110 ns 109 ns 6460245
BM_VectorRandSum/99 61.2 ns 61.2 ns 11350744
BM_VectorRandSum/100 12.4 ns 12.4 ns 56821814
BM_VectorRandSumStd/95 318 ns 317 ns 2216642
BM_VectorRandSumStd/96 258 ns 258 ns 2716610
BM_VectorRandSumStd/97 201 ns 200 ns 3515001
BM_VectorRandSumStd/98 143 ns 143 ns 4879986
BM_VectorRandSumStd/99 84.9 ns 84.9 ns 8274289
BM_VectorRandSumStd/100 25.2 ns 25.1 ns 27825172
But even base Julia arrays do about as well as foo:
julia> function foo_base(p::Float64=0.95)
if rand() < p
L = 10
else
L = 10000
end
A = Vector{Float64}(undef, L)
A .= (1:L)
sum(A)
end;
julia> for p = 0.95:0.01:1.0
@btime foo_base($p)
end
190.326 ns (1 allocation: 1.96 KiB)
173.919 ns (1 allocation: 1.72 KiB)
127.940 ns (1 allocation: 1.09 KiB)
91.293 ns (1 allocation: 629 bytes)
60.696 ns (1 allocation: 224 bytes)
45.951 ns (1 allocation: 144 bytes)
One thing to be aware of is that malloc/free is way faster on Linux than MacOS for whatever reason. I've often found that malloc/free was barely slower than alloc (at least when using the malloc from StaticTools.jl but it should be the same or similar to the LibC one).
I wonder if it would be advantageous here to make the SmallStore.outline just store a malloc'd pointer and then if it spills, register a finalizer that frees the pointer?
We could use a library for better cross platform consistency, e.g. mimalloc:
julia> using mimalloc_jll
julia> function foo_mimalloc(p::Float64=0.95)
if rand() < p
L = 10
else
L = 10000
end
p = @ccall mimalloc_jll.libmimalloc.malloc((sizeof(Float64)*L)::Int)::Ptr{Float64}
A = PtrArray(Ptr{Float64}(p), (L,))
A .= (1:L)
ret = sum(A)
@ccall mimalloc_jll.libmimalloc.free(p::Ptr{Float64})::Nothing
return ret
end;
julia> for p = 0.95:0.01:1.0
@btime foo_mimalloc($p)
end
77.858 ns (0 allocations: 0 bytes)
64.301 ns (0 allocations: 0 bytes)
47.716 ns (0 allocations: 0 bytes)
47.642 ns (0 allocations: 0 bytes)
31.010 ns (0 allocations: 0 bytes)
29.257 ns (0 allocations: 0 bytes)
I wonder if it would be advantageous here to make the SmallStore.outline just store a malloc'd pointer and then if it spills, register a finalizer that frees the pointer?
If we imitated the C++ implementations, we wouldn't use an extra pointer.
We'd set the PtrArray's pointer to this, and then when it comes time to free, we'd check whether that pointer equals the pointer_from_objref. If so, we don't have to do anything, if not, we know we allocated it and thus free it.
So I guess the equivalent here would be to conditionally register a finalizer that unconditionally frees the pointer. I guess the finalizer would have to enclose the pointer, since it would be registered on the mutable struct SmallStorage?
This at least does not seem great:
using mimalloc_jll
# Variant where the spill path allocates with mimalloc's malloc and a
# finalizer on the inline Ref frees the buffer when the store is collected —
# so the caller never frees manually.
@inline function smallarrayv2(::Type{T}, ::StaticInt{SpillSize}, size::Integer...) where {T, SpillSize}
@assert isbitstype(T)
N = prod(size)
# Inline storage: an uninitialized Ref of SpillSize*sizeof(T) bytes; also the
# object the finalizer is registered on in the spill case.
store = Ref{NTuple{SpillSize*sizeof(T),UInt8}}()
ptr::Ptr{T} = if N <= SpillSize
Ptr{T}(pointer_from_objref(store))
else
p = @ccall mimalloc_jll.libmimalloc.malloc((sizeof(T)*N)::Int)::Ptr{T}
# The closure captures `p` so the free happens whenever the GC finalizes
# `store` — presumably the source of the extra overhead seen below.
finalizer(store) do _
@ccall mimalloc_jll.libmimalloc.free(p::Ptr{T})::Nothing
end
Ptr{T}(p)
end
ptrA = PtrArray(ptr, size)
strA = StrideArray(ptrA, store)
end;
# Driver for smallarrayv2 (mimalloc spill path with a finalizer).
function foo_sav2(p::Float64=0.95)
    capacity = static(10)
    # Inline (length 10) with probability p; otherwise spill (length 10000).
    len = rand() < p ? 10 : 10000
    A = smallarrayv2(Float64, capacity, len)
    A .= (1:len)
    return sum(A)
end;
for p = 0.95:0.01:1.0
@btime foo_sav2($p)
end
julia> for p = 0.95:0.01:1.0
@btime foo_sav2($p)
end
217.347 ns (1 allocation: 96 bytes)
165.084 ns (1 allocation: 96 bytes)
113.404 ns (1 allocation: 96 bytes)
72.094 ns (1 allocation: 96 bytes)
42.431 ns (1 allocation: 96 bytes)
26.142 ns (1 allocation: 96 bytes)
Bypassing the GC also puts us at risk of not collecting often enough. CUDA has had problems related to this.
We could also store the pointer in the ref, and then load it in the finalizer, rather than needing to capture it.
Interestingly, a large part of the overhead can be reduced by using union splitting:
# Union-splitting variant: `data` is either a Ref{NTuple{SpillSize,T}}
# (inline) or a Vector{T} (heap), so callers can branch on the type of
# `A.data` and get a specialized path for each.
@inline function smallarray3(::Type{T}, ::StaticInt{SpillSize}, size::Integer...) where {T, SpillSize}
@assert isbitstype(T)
N = prod(size)
if N <= SpillSize
# Here SpillSize counts *elements* of T (an NTuple of T, not of bytes).
data = Ref{NTuple{SpillSize, T}}()
p = Ptr{T}(pointer_from_objref(data))
else
data = Vector{T}(undef, N)
p = Ptr{T}(pointer(data))
end
# StrideArray roots `data`, whichever branch produced it.
StrideArray(PtrArray(p, size), data)
end
function foo_sa3(p=0.95)
SpillSize = static(10*8)
if rand() < p
# 95% of the time our vector fits inline
L = 10
else
# 5% of the time it'll be too big
L = 10000
end
A = @inline smallarray3(Float64, SpillSize, L)
# NOTE(review): this first branch can never be taken — in the inline case
# `A.data` is a RefValue{NTuple{80, Float64}}, not RefValue{Float64}, so
# foo_sa3(1) falls through both branches and returns `nothing`. The
# corrected version appears later in the thread.
if A.data isa RefValue{Float64}
A .= 1:L
sum(A)
elseif A.data isa Vector{Float64}
A .= 1:L
sum(A)
end
end;
@benchmark foo_sa3(0.95)
#+RESULTS:
: BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
: Range (min … max): 103.960 ns … 1.210 μs ┊ GC (min … max): 0.00% … 59.47%
: Time (median): 254.570 ns ┊ GC (median): 0.00%
: Time (mean ± σ): 301.828 ns ± 166.943 ns ┊ GC (mean ± σ): 15.67% ± 18.98%
:
: ▂▄▆▇▇████▇▇▆▆▅▄▃▂▂▁ ▁▁▁▁▁▁▁▁▁▁▂▂▂▂▁ ▁▁▁ ▃
: ▄▆▇███████████████████▇▇▆▅▃▄▅▁▁▃▃▁▃▅▅▇███████████████████████ █
: 104 ns Histogram: log(frequency) by time 947 ns <
:
: Memory estimate: 1.95 KiB, allocs estimate: 0.
@benchmark foo_sa3(1)
#+RESULTS:
: BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
: Range (min … max): 4.740 ns … 17.890 ns ┊ GC (min … max): 0.00% … 0.00%
: Time (median): 4.770 ns ┊ GC (median): 0.00%
: Time (mean ± σ): 4.784 ns ± 0.226 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
:
: █ ▅
: ▂▁▁▁▁▂█▁▁▁▁▁▆▁▁▁▁▁▂▅▁▁▁▁▁█▁▁▁▁▁▂▂▁▁▁▁▁▅▁▁▁▁▁▂▃▁▁▁▁▁▃▁▁▁▁▁▃ ▂
: 4.74 ns Histogram: frequency by time 4.83 ns <
:
: Memory estimate: 0 bytes, allocs estimate: 0.
versus
@benchmark foo(0.95)
#+RESULTS:
: BenchmarkTools.Trial: 10000 samples with 995 evaluations.
: Range (min … max): 150.392 ns … 1.750 μs ┊ GC (min … max): 0.00% … 64.06%
: Time (median): 311.940 ns ┊ GC (median): 0.00%
: Time (mean ± σ): 374.021 ns ± 210.603 ns ┊ GC (mean ± σ): 15.74% ± 19.04%
:
: ▃▅▆▇████▇▇▆▆▅▄▃▃▂▂▁ ▁▁▁ ▂▁▁▁▁ ▃
: ▄█████████████████████▇▆▅▆▅▄▆▇▇▇████▇██████████████▇▇██▇▇█▇▆▇ █
: 150 ns Histogram: log(frequency) by time 1.24 μs <
:
: Memory estimate: 2.10 KiB, allocs estimate: 1.
@benchmark foo(1.0)
#+RESULTS:
: BenchmarkTools.Trial: 10000 samples with 995 evaluations.
: Range (min … max): 27.839 ns … 922.522 ns ┊ GC (min … max): 0.00% … 95.64%
: Time (median): 29.337 ns ┊ GC (median): 0.00%
: Time (mean ± σ): 31.730 ns ± 34.729 ns ┊ GC (mean ± σ): 5.95% ± 5.24%
:
: ▁▁▃▅▆██▇▆▃▂▁ ▁▁▁ ▂
: ██████████████████▇▇▆▆▄▅▃▄▄▃▃▄▃▄▃▁▃▁▁▃▃▃▁▃▁▃▄▃▃▃▁▃▁▄▇███████ █
: 27.8 ns Histogram: log(frequency) by time 41.9 ns <
:
: Memory estimate: 144 bytes, allocs estimate: 1.
I think you've messed up the type, it's not a RefValue{Float64} but a RefValue{NTuple{Int(SpillSize), Float64}}, you'll notice that foo_sa3(1) doesn't return anything currently.
# Corrected union split: the inline payload is a
# Base.RefValue{NTuple{Int(SpillSize), Float64}} — SpillSize elements, not a
# single Float64 — so test for that exact type before broadcasting.
function foo_sa3(p=0.95)
    SpillSize = static(10 * 8)
    # Length 10 fits inline with probability p; 10000 spills to a Vector.
    L = rand() < p ? 10 : 10000
    A = @inline smallarray3(Float64, SpillSize, L)
    payload = A.data
    if payload isa Base.RefValue{NTuple{Int(SpillSize), Float64}}
        A .= 1:L
        sum(A)
    elseif payload isa Vector{Float64}
        A .= 1:L
        sum(A)
    end
end
Julia can already tell that smallarray3 returns a union of two values, though it doesn't seem to union split automatically here.
Do we need to use StrideArray, is there no way to preserve data whilst returning PtrArray?
EDIT: It might be unsafe to return Base.@_gc_preserve_begin data, see https://github.com/JuliaLang/julia/blob/cd8298400ff12c3a8b77d4eeb9415c9336c7305b/src/codegen.cpp#L5880.
This is what I've managed, apart from having to free the data manually this seems pretty good to me.
# Returns `(PtrArray, token)`: the caller must invoke Base.@_gc_preserve_end
# on the token after the last use of the array. NOTE(review): leaking a
# @_gc_preserve_begin token across a function boundary may be unsafe — see
# the codegen.cpp link cited above.
@inline function smallarray5(::Type{T}, ::StaticInt{SpillSize}, size::Integer...) where {T, SpillSize}
@assert isbitstype(T)
N = prod(size)
if N <= SpillSize
data = Ref{NTuple{SpillSize, T}}()
# Pin `data` before taking its address so the GC cannot collect it while
# the raw pointer is live.
t = Base.@_gc_preserve_begin data
p = Ptr{T}(pointer_from_objref(data))
else
data = Vector{T}(undef, N)
t = Base.@_gc_preserve_begin data
p = Ptr{T}(pointer(data))
end
return PtrArray(p, size), t
end
# Driver for smallarray5: broadcast, sum, then release the GC-preserve token.
function foo_sa5(p=0.95)
    SpillSize = static(10 * 8)
    # Inline (length 10) with probability p; otherwise heap (length 10000).
    L = rand() < p ? 10 : 10000
    A, token = smallarray5(Float64, SpillSize, L)
    A .= 1:L
    result = sum(A)
    # The backing data must stay pinned until after the last array access.
    Base.@_gc_preserve_end token
    return result
end
And using malloc instead
# malloc-based spill: the inline Ref is always allocated; when N exceeds the
# spill size the data instead lives in a Libc.malloc'd buffer, and the
# returned `malloced` flag tells the caller it must free the pointer (after
# ending the preserve token).
@inline function smallarray7(::Type{T}, ::StaticInt{SpillSize}, size::Integer...) where {T, SpillSize}
@assert isbitstype(T)
N = prod(size)
data = Ref{NTuple{SpillSize, T}}()
# Pin the Ref before handing out a raw pointer into it.
t = Base.@_gc_preserve_begin data
if N <= SpillSize
p = Ptr{T}(pointer_from_objref(data))
malloced = false
else
p = Ptr{T}(Libc.malloc(sizeof(T)*N))
malloced = true
end
return PtrArray(p, size), t, malloced
end
function foo_sa7(p=0.95)
SpillSize = static(10*8)
if rand() < p
# 95% of the time our vector fits inline
L = 10
else
# 5% of the time it'll be too big
L = 10000
end
A, t, malloced = smallarray7(Float64, SpillSize, L)
A .= 1:L
result = sum(A)
Base.@_gc_preserve_end t
# Only the spilled path owns a malloc'd buffer; the inline path points into
# the (previously preserved) Ref and must not be freed.
if malloced
Libc.free(A.ptr)
end
return result
end
On master (should be similar on 1.9.3)
julia> for p = 0.95:0.01:1.0
@btime foo_sa5($p)
end
289.597 ns (0 allocations: 2.04 KiB)
220.990 ns (0 allocations: 1.57 KiB)
134.724 ns (0 allocations: 1.02 KiB)
74.590 ns (0 allocations: 481 bytes)
13.412 ns (0 allocations: 0 bytes)
13.652 ns (0 allocations: 0 bytes)
julia> for p = 0.95:0.01:1.0
@btime foo_malloc($p)
end
179.266 ns (0 allocations: 0 bytes)
104.274 ns (0 allocations: 0 bytes)
98.520 ns (0 allocations: 0 bytes)
59.149 ns (0 allocations: 0 bytes)
30.460 ns (0 allocations: 0 bytes)
24.373 ns (0 allocations: 0 bytes)
julia> for p = 0.95:0.01:1.0
@btime foo_sa7($p)
end
160.715 ns (0 allocations: 0 bytes)
110.330 ns (0 allocations: 0 bytes)
81.979 ns (0 allocations: 0 bytes)
47.535 ns (0 allocations: 0 bytes)
13.363 ns (0 allocations: 0 bytes)
13.130 ns (0 allocations: 0 bytes)
Got one way working of not having to manually free (I was capturing L by accident previously making things very slow)
# do-block API: allocates (inline Ref or Libc.malloc), runs `f` on the
# PtrArray, then frees any malloc'd buffer itself — the caller never frees
# manually. Function-first argument order enables `do`-block syntax.
@inline function smallarray8(f, ::Type{T}, ::StaticInt{SpillSize}, size::Integer...) where {T, SpillSize}
@assert isbitstype(T)
N = prod(size)
data = Ref{NTuple{SpillSize, T}}()
# Pin the Ref for the duration of `f`'s execution.
t = Base.@_gc_preserve_begin data
if N <= SpillSize
p = Ptr{T}(pointer_from_objref(data))
malloced = false
else
p = Ptr{T}(Libc.malloc(sizeof(T)*N))
malloced = true
end
A = PtrArray(p, size)
# @inline so the closure specializes; avoid accidentally capturing locals
# (a capture of L caused a big slowdown in an earlier attempt).
result = @inline f(A)
#Base.@_gc_preserve_end t # throws in codegen.cpp on assertion.
malloced && Libc.free(p)
return result
end
# Driver for the do-block API: smallarray8 owns allocation and freeing.
function foo_sa8(p=0.95)
    spill = static(10 * 8)
    # Length 10 (inline) with probability p; otherwise 10000 (malloc spill).
    L = rand() < p ? 10 : 10000
    return smallarray8(Float64, spill, L) do A
        A .= 1:length(A)
        return sum(A)
    end
end
julia> for p = 0.95:0.01:1.0
@btime foo_sa8($p)
end
168.676 ns (0 allocations: 0 bytes)
111.845 ns (0 allocations: 0 bytes)
83.425 ns (0 allocations: 0 bytes)
43.680 ns (0 allocations: 0 bytes)
15.290 ns (0 allocations: 0 bytes)
14.998 ns (0 allocations: 0 bytes)
This is just as fast as foo_sa7; the difference is just run-to-run variation.
I'm a bit late to the party here, but has there been any consideration of using a set of compile-time known sizes that easily inline, similar to the table that Julia uses to allocate other structs on the stack (https://github.com/JuliaLang/julia/blob/8e77b63fa7636f84767329dd298817d0a36b5ae3/src/julia_internal.h#L359)? This would be easier to align with how Julia naturally manages allocations. Explicit support for a limited number of byte sizes might also make it easier to explore local caching and arenas without using Julia finalizers.
Managed to hack in allocas.
# Small-size-optimized vector: `ptr` points either at a stack (alloca) buffer
# of capacity N or at `heap`; `heap` is `nothing` in the inline case and
# doubles as the GC root in the spilled case.
struct SmallVector{T, N} <: AbstractVector{T}
ptr::Ptr{T}
heap::Union{Nothing, Vector{T}}
length::Int
# Intended inner constructor, kept for reference: likely unusable because
# the alloca's lifetime would end when the constructor returns.
# @inline function SmallVector{T, N}(length::Integer) where {T, N}
# @assert isbitstype(T)
# if length <= N
# ptr = Core.Intrinsics.unsafe_alloca(T, N) # pretty sure alloca lifetime ends in this constructor
# heap = nothing
# else
# heap = Vector{T}(undef, length); ptr = pointer(heap)
# end
# new{T, N}(ptr, heap, length)
# end
end
# Unchecked load of element `i` through the raw pointer; roots `heap` for the
# duration so a heap-backed vector cannot be collected mid-access.
function Base.getindex(x::SmallVector, i)
    backing = x.heap
    GC.@preserve backing unsafe_load(x.ptr, i)
end
# Unchecked store of `v` at position `i` through the raw pointer, keeping any
# heap backing rooted for the duration of the write.
function Base.setindex!(x::SmallVector, v, i)
    backing = x.heap
    GC.@preserve backing unsafe_store!(x.ptr, v, i)
end
Base.IndexStyle(::SmallVector) = IndexLinear()
Base.size(x::SmallVector) = (x.length,)
function foo_sa(p=0.95)
if rand() < p
L = 10
else
L = 10000
end
# Inline capacity of 80 Float64s.
N = 80
T = Float64
if L <= N
# Stack-allocate the inline buffer in *this* frame via the custom
# unsafe_alloca intrinsic (requires the patched Julia build linked below);
# doing it here rather than in a constructor keeps the alloca alive for
# the rest of this function.
ptr = Core.Intrinsics.unsafe_alloca(T, N)
heap = nothing
else
heap = Vector{T}(undef, L); ptr = pointer(heap)
end
sv = SmallVector{Float64, 80}(ptr, heap, L)
sv .= 1:L
sum(sv)
end
julia> for p = 0.95:0.01:1.0
@btime foo_sa($p)
end
273.878 ns (0 allocations: 2.11 KiB)
192.794 ns (0 allocations: 1.49 KiB)
117.619 ns (0 allocations: 881 bytes)
55.911 ns (0 allocations: 400 bytes)
21.000 ns (0 allocations: 80 bytes)
10.108 ns (0 allocations: 0 bytes)
VLA's work as well,
# Minimal variable-length-array view: a raw pointer plus a length. Does not
# own or root its memory — whoever creates it must keep the backing alive.
struct VLA{T} <: AbstractVector{T}
ptr::Ptr{T}
length::Int
end
# Unchecked raw-pointer load/store; safe only while the buffer behind `ptr`
# (here, an alloca in the caller's frame) is still live.
Base.getindex(x::VLA, i) = unsafe_load(x.ptr, i)
Base.setindex!(x::VLA, v, i) = unsafe_store!(x.ptr, v, i)
Base.IndexStyle(::VLA) = IndexLinear()
Base.size(x::VLA) = (x.length,)
function foo_vla(p=0.95)
if rand() < p
L = 10
else
L = 10000
end
T = Float64
# Runtime-sized stack allocation (a true VLA) via the custom intrinsic; the
# buffer lives until this frame returns, so `sv` must not escape foo_vla.
ptr = Core.Intrinsics.unsafe_alloca(T, L)
sv = VLA{T}(ptr, L)
sv .= 1:L
sum(sv)
end
julia> for p = 0.95:0.01:1.0
@btime foo_vla($p)
end
132.823 ns (0 allocations: 0 bytes)
114.570 ns (0 allocations: 0 bytes)
78.576 ns (0 allocations: 0 bytes)
31.882 ns (0 allocations: 0 bytes)
16.608 ns (0 allocations: 0 bytes)
11.683 ns (0 allocations: 0 bytes)
@Zentrik can I see how you added the unsafe_alloca intrinsic?
https://github.com/Zentrik/julia/tree/unsafe-alloca should work on wsl ubuntu and windows.
We should be able to call unsafe_alloca in a constructor as long as it gets inlined before codegen but I'm not sure how you could force inlining without using LLVM's alwaysinline. Though I think the only case where it wouldn't get inlined would be if someone added a @noinline to the callsite of the constructor.
EDIT: We could just use a macro to construct instead.
Oh that's very interesting! I expected this to have the same problem that adding it with llvmcall would have, which is that even if the llvmcall is inlined, there is a stackrestore instruction that gets put in so that multiple alloca calls end up just all having the same pointer, but the way you've done it here with intrinsics has the "wrong" inlining behaviour, but that behaviour is exactly what we want!
This is really cool, thanks for sharing.
You need to inline at the Julia level. @inline always works when the callsite is inferred. Like you said, someone could use a callsite @noinline, and type inference could fail, making it dangerous.
I'd worry more about the inference failure, and document the invalidity with @noinline. People can always shoot themselves in the foot if they want.
alwaysinline will not work, because LLVM remembers the scope/lifetime of the alloca to be limited to that alwaysinline function. [EDIT: as Mason explained above, except I thought it was done using @llvm.lifetime_end]
The pointer may still be valid outside of it, but it could suddenly alias other stack objects!