LoopVectorization.jl icon indicating copy to clipboard operation
LoopVectorization.jl copied to clipboard

Weird behavior

Open kose-y opened this issue 3 years ago • 2 comments

Function f(), which goes over the data once, is more than ten times slower than function f2(), which goes over the data twice. Is this some kind of bug?

using BenchmarkTools, LoopVectorization, Random
function f(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
    r = zero(T)
    I, J = size(g)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            r += g[i, j] * log(qf_local) + (2one(T) - g[i, j]) * log(one(T) - qf_local)
        end
    end
end

function f2(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
    r = zero(T)
    I, J = size(g)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            r += g[i, j] * log(qf_local)
        end
    end
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            r += (2one(T) - g[i, j]) * log(one(T) - qf_local)
        end
    end
end
g = rand(100000, 1000)
qq = rand(3, 100000)
ff = rand(3, 1000)
@btime f(g, qq, ff)
  1.961 s (22 allocations: 560 bytes)
@btime f2(g, qq, ff)
  172.651 ms (53 allocations: 1.77 KiB)

kose-y avatar Feb 01 '22 10:02 kose-y

K is not defined. I assume you're missing the line

K = size(q,1)

? I'll have to look more later, but I get

julia> @time f(g, qq, ff);
  0.030570 seconds

julia> @time f2(g, qq, ff);
  0.228648 seconds

chriselrod avatar Feb 01 '22 20:02 chriselrod

Sorry, I messed up reproducing the issue with simpler code. Actually, g is a 2-bit packed array:

using BenchmarkTools, LoopVectorization
@inline function f(g::AbstractArray{UInt8}, q::AbstractArray{T}, f) where T
    oneT = one(T)
    twoT = 2one(T)
    I, J = size(g)
    K = size(q, 1)
    r = zero(T)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            blk = g[(i - 1) >> 2 + 1, j]
            re = (i - 1) % 4
            blk_shifted  = blk >> (re << 1)
            gij = blk_shifted & 0x03

            r +=  ((gij * log(qf_local)) + (twoT - gij) * log(oneT - qf_local))
        end
    end
    r
end

function f2(g::AbstractArray{UInt8}, q::AbstractArray{T}, f::AbstractArray{T}) where T
    oneT = one(T)
    twoT = 2one(T)
    I, J = size(g)
    K = size(q, 1)
    r = zero(T)
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            blk = g[(i - 1) >> 2 + 1, j]
            re = (i - 1) % 4
            blk_shifted  = blk >> (re << 1)
            gij = blk_shifted & 0x03

            r +=  gij * log(qf_local)
        end
    end
    
    @turbo for j in 1:J
        for i in 1:I
            qf_local = zero(T)
            for k in 1:K
                qf_local += q[k, i] * f[k, j]
            end
            blk = g[(i - 1) >> 2 + 1, j]
            re = (i - 1) % 4
            blk_shifted  = blk >> (re << 1)
            gij = blk_shifted & 0x03

            r +=  (twoT - gij) * log(oneT - qf_local)
        end
    end
    r
end
f2 (generic function with 2 methods)
I, J, K = 2500, 120000, 4
g = rand(UInt8, Int(ceil(I / 4)), J) 
qq = rand(K, I) 
qq./= sum(qq, dims=1)
ff = rand(K, J)
4×120000 Matrix{Float64}:
 0.540408  0.298717  0.726025  0.100985   …  0.656309  0.872686  0.532832
 0.199677  0.80906   0.447669  0.0500142     0.521077  0.147785  0.796754
 0.426851  0.956759  0.163433  0.42933       0.137063  0.602191  0.145897
 0.206329  0.228952  0.512773  0.0862005     0.627087  0.707773  0.983057
@btime f(g, qq, ff)
  2.583 s (1 allocation: 16 bytes)





-1.1394245267286244e8
@btime f2(g, qq, ff)

  2.193 s (1 allocation: 16 bytes)





-1.1394245267289072e8

Also, is there a tip to make it faster?

kose-y avatar Feb 02 '22 01:02 kose-y