LoopVectorization.jl [bug] incorrect result

function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            i = 4 * (l-1) + 1
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            i = 4 * (l-1) + 3
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            i = 4 * (l-1) + 4
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end

The result is different from the code without @avx.

Jul 27 '20 22:07 kose-y

Hi, mind giving code I can copy and paste to run the example (and perhaps eventually add to the tests)?

Jul 27 '20 22:07 chriselrod

using LoopVectorization
function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            i = 4 * (l-1) + 1
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            i = 4 * (l-1) + 3
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            i = 4 * (l-1) + 4
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end


function _snparray_ax_additive_noavx!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            i = 4 * (l-1) + 1
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            i = 4 * (l-1) + 3
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            i = 4 * (l-1) + 4
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end

using Random, Test
a = rand(UInt8, 512, 512)
v1 = zeros(512 << 2)
v1_ = zeros(512 << 2)

v2 = rand(Float64, 512)
_snparray_ax_additive!(v1, a, v2)
_snparray_ax_additive_noavx!(v1_, a, v2)

@test isapprox(v1, v1_)

Jul 28 '20 00:07 kose-y

It works correctly without the intermediate variable i:

function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            out[4*(l-1)+1] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[4*(l-1)+2] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            out[4*(l-1)+3] += ((Aij >= 2) + (Aij >= 3)) * v[j]
            
            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            out[4*l] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end

Jul 28 '20 07:07 kose-y

The problem is that it doesn't un-alias the names. That is, creating two distinct variables with the same name (i or Aij) is likely to cause problems. It's more likely to cause problems for variables used for indexing (like i), because indexes are going to be reordered more heavily. This passes the tests:

function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij_1 = block & 3
            i_1 = 4 * (l-1) + 1
            out[i_1] += ((Aij_1 >= 2) + (Aij_1 >= 3)) * v[j]
            
            # i = 4 * (l-1) + 2
            Aij_2 = (block >> 2) & 3
            i_2 = 4 * (l-1) + 2
            out[i_2] += ((Aij_2 >= 2) + (Aij_2 >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij_3 = (block >> 4) & 3
            i_3 = 4 * (l-1) + 3
            out[i_3] += ((Aij_3 >= 2) + (Aij_3 >= 3)) * v[j]
            
            # i = 4 * (l-1) + 4
            Aij_4 = (block >> 6) & 3
            i_4 = 4 * (l-1) + 4
            out[i_4] += ((Aij_4 >= 2) + (Aij_4 >= 3)) * v[j]

        end
    end
    out
end

I'll add something to change the names of the repeated variables.

However, I'd also like to point out that LoopVectorization only SIMDs across iterations of the loops. That means that writing:

out[4*(l-1)+1] += ...
out[4*(l-1)+2] += ...
out[4*(l-1)+3] += ...
out[4*l] += ...

Will force it to load elements like:

out[4*((l .+ 0:7)-1)+1] .+= ...
out[4*((l .+ 0:7)-1)+2] .+= ...
out[4*((l .+ 0:7)-1)+3] .+= ...
out[4*(l .+ 0:7)] .+= ...

This requires using gather and scatter intrinsics. These are slow, and if you don't have AVX512, scatter has to be emulated with repeated extractions and scalar stores.

It would be faster if you could define

v1__ = copy(reshape(v1_, (4, length(v1_) >> 2))'); # make it a matrix

function _snparray_ax_additive_contig!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    # manually set unroll, seems to choose something a little faster
    @avx unroll=(2,2) for j ∈ eachindex(v)
        for l in 1:k
            block = s[l,j]

            Aij_1 = block & 3
            out[l,1] += ((Aij_1 >= 2) + (Aij_1 >= 3)) * v[j]
            
            Aij_2 = (block >> 2) & 3
            out[l,2] += ((Aij_2 >= 2) + (Aij_2 >= 3)) * v[j]

            Aij_3 = (block >> 4) & 3
            out[l,3] += ((Aij_3 >= 2) + (Aij_3 >= 3)) * v[j]
            
            Aij_4 = (block >> 6) & 3
            out[l,4] += ((Aij_4 >= 2) + (Aij_4 >= 3)) * v[j]

    end
    out
end
_snparray_ax_additive_contig!(v1__, a, v2);

@test isapprox(vec(v1__'), v1_)

This doesn't actually make much difference for me, because it swaps the order of the loops to make l the outer loop, so that it only has to gather/scatter once per for j in eachindex.

Jul 28 '20 12:07 chriselrod

Thanks for the information!

Jul 28 '20 13:07 kose-y

LoopVectorization.jl LoopVectorization.jl copied to clipboard

[bug] incorrect result

LoopVectorization.jl
LoopVectorization.jl copied to clipboard