LoopVectorization.jl Weird behavior

Function f(), which goes over the data once, is more than ten times slower than function f2(), which goes over the data twice. Is this some kind of bug?

using BenchmarkTools, LoopVectorization, Random
function f(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
    r = zero(T)
    I, J = size(g)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            r += g[i, j] * log(qf_local) + (2one(T) - g[i, j]) * log(one(T) - qf_local)
        end
    end
end

function f2(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
    r = zero(T)
    I, J = size(g)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            r += g[i, j] * log(qf_local)
        end
    end
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            r += (2one(T) - g[i, j]) * log(one(T) - qf_local)
        end
    end
end
g = rand(100000, 1000)
qq = rand(3, 100000)
ff = rand(3, 1000)

@btime f(g, qq, ff)

  1.961 s (22 allocations: 560 bytes)

@btime f2(g, qq, ff)

  172.651 ms (53 allocations: 1.77 KiB)

Feb 01 '22 10:02 kose-y

K is not defined. I assume you're missing the line

K = size(q,1)

? I'll have to look more later, but I get

julia> @time f(g, qq, ff);
  0.030570 seconds

julia> @time f2(g, qq, ff);
  0.228648 seconds

Feb 01 '22 20:02 chriselrod

Sorry, I messed up reproducing the issue with simpler code. Actually, g is a 2-bit packed array:

using BenchmarkTools, LoopVectorization
@inline function f(g::AbstractArray{UInt8}, q::AbstractArray{T}, f) where T
    oneT = one(T)
    twoT = 2one(T)
    I, J = size(g)
    K = size(q, 1)
    r = zero(T)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            blk = g[(i - 1) >> 2 + 1, j]
            re = (i - 1) % 4
            blk_shifted  = blk >> (re << 1)
            gij = blk_shifted & 0x03

            r +=  ((gij * log(qf_local)) + (twoT - gij) * log(oneT - qf_local))
        end
    end
    r
end

function f2(g::AbstractArray{UInt8}, q::AbstractArray{T}, f::AbstractArray{T}) where T
    oneT = one(T)
    twoT = 2one(T)
    I, J = size(g)
    K = size(q, 1)
    r = zero(T)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            blk = g[(i - 1) >> 2 + 1, j]
            re = (i - 1) % 4
            blk_shifted  = blk >> (re << 1)
            gij = blk_shifted & 0x03

            r +=  gij * log(qf_local)
        end
    end
    
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            blk = g[(i - 1) >> 2 + 1, j]
            re = (i - 1) % 4
            blk_shifted  = blk >> (re << 1)
            gij = blk_shifted & 0x03

            r +=  (twoT - gij) * log(oneT - qf_local)
        end
    end
    r
end

f2 (generic function with 2 methods)

I, J, K = 2500, 120000, 4
g = rand(UInt8, Int(ceil(I / 4)), J) 
qq = rand(K, I) 
qq./= sum(qq, dims=1)
ff = rand(K, J)

4×120000 Matrix{Float64}:
 0.540408  0.298717  0.726025  0.100985   …  0.656309  0.872686  0.532832
 0.199677  0.80906   0.447669  0.0500142     0.521077  0.147785  0.796754
 0.426851  0.956759  0.163433  0.42933       0.137063  0.602191  0.145897
 0.206329  0.228952  0.512773  0.0862005     0.627087  0.707773  0.983057

@btime f(g, qq, ff)

  2.583 s (1 allocation: 16 bytes)





-1.1394245267286244e8

@btime f2(g, qq, ff)

  2.193 s (1 allocation: 16 bytes)





-1.1394245267289072e8

Also, is there a tip to make it faster?

Feb 02 '22 01:02 kose-y

LoopVectorization.jl LoopVectorization.jl copied to clipboard

Weird behavior

LoopVectorization.jl
LoopVectorization.jl copied to clipboard