LoopVectorization.jl
LoopVectorization.jl copied to clipboard
[bug] incorrect result
function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
fill!(out, zero(UInt8))
k = size(s, 1)
@avx for j ∈ eachindex(v)
for l in 1:k
block = s[(j-1)*size(s, 1) + (l-1) + 1]
# i = 4 * (l-1) + 1
Aij = block & 3
i = 4 * (l-1) + 1
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 2
Aij = (block >> 2) & 3
i = 4 * (l-1) + 2
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 3
Aij = (block >> 4) & 3
i = 4 * (l-1) + 3
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 4
Aij = (block >> 6) & 3
i = 4 * (l-1) + 4
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
end
end
out
end
The result is different from the code without @avx
.
Hi, mind giving code I can copy and paste to run the example (and perhaps eventually add to the tests)?
using LoopVectorization
function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
fill!(out, zero(UInt8))
k = size(s, 1)
@avx for j ∈ eachindex(v)
for l in 1:k
block = s[(j-1)*size(s, 1) + (l-1) + 1]
# i = 4 * (l-1) + 1
Aij = block & 3
i = 4 * (l-1) + 1
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 2
Aij = (block >> 2) & 3
i = 4 * (l-1) + 2
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 3
Aij = (block >> 4) & 3
i = 4 * (l-1) + 3
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 4
Aij = (block >> 6) & 3
i = 4 * (l-1) + 4
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
end
end
out
end
function _snparray_ax_additive_noavx!(out, s::Matrix{UInt8}, v)
fill!(out, zero(UInt8))
k = size(s, 1)
for j ∈ eachindex(v)
for l in 1:k
block = s[(j-1)*size(s, 1) + (l-1) + 1]
# i = 4 * (l-1) + 1
Aij = block & 3
i = 4 * (l-1) + 1
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 2
Aij = (block >> 2) & 3
i = 4 * (l-1) + 2
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 3
Aij = (block >> 4) & 3
i = 4 * (l-1) + 3
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 4
Aij = (block >> 6) & 3
i = 4 * (l-1) + 4
out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
end
end
out
end
using Random, Test
a = rand(UInt8, 512, 512)
v1 = zeros(512 << 2)
v1_ = zeros(512 << 2)
v2 = rand(Float64, 512)
_snparray_ax_additive!(v1, a, v2)
_snparray_ax_additive_noavx!(v1_, a, v2)
@test isapprox(v1, v1_)
It works correctly without the intermediate variable i
:
function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
fill!(out, zero(UInt8))
k = size(s, 1)
@avx for j ∈ eachindex(v)
for l in 1:k
block = s[(j-1)*size(s, 1) + (l-1) + 1]
# i = 4 * (l-1) + 1
Aij = block & 3
out[4*(l-1)+1] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 2
Aij = (block >> 2) & 3
i = 4 * (l-1) + 2
out[4*(l-1)+2] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 3
Aij = (block >> 4) & 3
out[4*(l-1)+3] += ((Aij >= 2) + (Aij >= 3)) * v[j]
# i = 4 * (l-1) + 4
Aij = (block >> 6) & 3
out[4*l] += ((Aij >= 2) + (Aij >= 3)) * v[j]
end
end
out
end
The problem is that it doesn't un-alias the names. That is, creating two distinct variables with the same name (i
or Aij
) is likely to cause problems. It's more likely to cause problems for variables used for indexing (like i
), because indexes are going to be reordered more heavily.
This passes the tests:
function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
fill!(out, zero(UInt8))
k = size(s, 1)
@avx for j ∈ eachindex(v)
for l in 1:k
block = s[(j-1)*size(s, 1) + (l-1) + 1]
# i = 4 * (l-1) + 1
Aij_1 = block & 3
i_1 = 4 * (l-1) + 1
out[i_1] += ((Aij_1 >= 2) + (Aij_1 >= 3)) * v[j]
# i = 4 * (l-1) + 2
Aij_2 = (block >> 2) & 3
i_2 = 4 * (l-1) + 2
out[i_2] += ((Aij_2 >= 2) + (Aij_2 >= 3)) * v[j]
# i = 4 * (l-1) + 3
Aij_3 = (block >> 4) & 3
i_3 = 4 * (l-1) + 3
out[i_3] += ((Aij_3 >= 2) + (Aij_3 >= 3)) * v[j]
# i = 4 * (l-1) + 4
Aij_4 = (block >> 6) & 3
i_4 = 4 * (l-1) + 4
out[i_4] += ((Aij_4 >= 2) + (Aij_4 >= 3)) * v[j]
end
end
out
end
I'll add something to change the names of the repeated variables.
However, I'd also like to point out that LoopVectorization only SIMDs across iterations of the loops. That means that writing:
out[4*(l-1)+1] += ...
out[4*(l-1)+2] += ...
out[4*(l-1)+3] += ...
out[4*l] += ...
Will force it to load elements like:
out[4*((l .+ 0:7)-1)+1] .+= ...
out[4*((l .+ 0:7)-1)+2] .+= ...
out[4*((l .+ 0:7)-1)+3] .+= ...
out[4*(l .+ 0:7)] .+= ...
This requires using gather
and scatter
intrinsics. These are slow, and if you don't have AVX512, scatter has to be emulated with repeated extractions and scalar stores.
It would be faster if you could define
v1__ = copy(reshape(v1_, (4, length(v1_) >> 2))'); # make it a matrix
function _snparray_ax_additive_contig!(out, s::Matrix{UInt8}, v)
fill!(out, zero(UInt8))
k = size(s, 1)
# manually set unroll, seems to choose something a little faster
@avx unroll=(2,2) for j ∈ eachindex(v)
for l in 1:k
block = s[l,j]
Aij_1 = block & 3
out[l,1] += ((Aij_1 >= 2) + (Aij_1 >= 3)) * v[j]
Aij_2 = (block >> 2) & 3
out[l,2] += ((Aij_2 >= 2) + (Aij_2 >= 3)) * v[j]
Aij_3 = (block >> 4) & 3
out[l,3] += ((Aij_3 >= 2) + (Aij_3 >= 3)) * v[j]
Aij_4 = (block >> 6) & 3
out[l,4] += ((Aij_4 >= 2) + (Aij_4 >= 3)) * v[j]
end
out
end
_snparray_ax_additive_contig!(v1__, a, v2);
@test isapprox(vec(v1__'), v1_)
This doesn't actually make much difference for me, because it swaps the order of the loops to make l
the outer loop, so that it only has to gather
/scatter
once per for j in eachindex
.
Thanks for the information!