StrideArraysCore.jl
StrideArraysCore.jl copied to clipboard
Convolution regression
Using StrideArraysCore 0.3.17, the following works as expected:
julia> using StrideArraysCore, LoopVectorization, BenchmarkTools
julia> using LayoutPointers: zero_offsets
julia> function conv!(f::F, _C, _A, _K, _b) where {F}
# For every (j₁, j₂, o), accumulate A[j₁+k₁, j₂+k₂, i] * K[k₁, k₂, i, o] over
# (k₁, k₂, i), add b[o], apply `f`, and store the result in C[j₁, j₂, o].
# Always returns `nothing`; the result is delivered through the store into `C`.
# The `f::F ... where {F}` form requests specialization on the concrete type of `f`.
#
# Re-wrap each argument with `zero_offsets` before indexing.
# NOTE(review): presumably this makes the arrays zero-based so that `j + k`
# addresses the input window directly — confirm against LayoutPointers.
C = zero_offsets(_C)
A = zero_offsets(_A)
K = zero_offsets(_K)
b = zero_offsets(_b)
# Outer loop nest: first two axes of C, and `o` over the 4th axis of K
# (`o` also indexes the 3rd axis of C and the bias `b`).
@turbo for j₁ ∈ axes(C, 1), j₂ ∈ axes(C, 2), o ∈ axes(K, 4)
# Accumulator starts at zero in C's element type.
s = zero(eltype(C))
# Reduction over the first three axes of K.
for k₁ ∈ axes(K, 1), k₂ ∈ axes(K, 2), i ∈ axes(K, 3)
s += A[j₁+k₁, j₂+k₂, i] * K[k₁, k₂, i, o]
end
# Add the bias for `o`, apply `f`, and write the output element.
C[j₁, j₂, o] = f(s + b[o])
end
return nothing
end
conv! (generic function with 1 method)
julia> randn_stridearray(size...) = StrideArray(randn(Float32, size...), static.(size));
julia> C = randn_stridearray(8, 8, 4);
julia> A = randn_stridearray(10, 10, 4);
julia> K = randn_stridearray(3, 3, 4, 4);
julia> b = randn_stridearray(4);
julia> @btime conv!($identity, $C, $A, $K, $b)
213.707 ns (0 allocations: 0 bytes)
(I am using the convolution code from SimpleChains.jl.) After upgrading to StrideArraysCore 0.4.1, the same benchmark is roughly 1500× slower and now allocates:
julia> @btime conv!($identity, $C, $A, $K, $b)
316.417 μs (1728 allocations: 101.50 KiB)
Thanks for the issue; this is fixed for Julia 1.7 and newer by commit https://github.com/JuliaSIMD/StrideArraysCore.jl/commit/9d3f77dbc1b89dd25ce7dfb2554088da9997ea6b.