LoopVectorization.jl
LoopVectorization.jl copied to clipboard
Weird behavior
Function f()
, which goes over the data once, is more than ten times slower than function f2()
, which goes over the data twice. Is this some kind of bug?
using BenchmarkTools, LoopVectorization, Random
function f(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
r = zero(T)
I, J = size(g)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
r += g[i, j] * log(qf_local) + (2one(T) - g[i, j]) * log(one(T) - qf_local)
end
end
end
function f2(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
r = zero(T)
I, J = size(g)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
r += g[i, j] * log(qf_local)
end
end
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
r += (2one(T) - g[i, j]) * log(one(T) - qf_local)
end
end
end
g = rand(100000, 1000)
qq = rand(3, 100000)
ff = rand(3, 1000)
@btime f(g, qq, ff)
1.961 s (22 allocations: 560 bytes)
@btime f2(g, qq, ff)
172.651 ms (53 allocations: 1.77 KiB)
K
is not defined.
I assume you're missing the line
K = size(q,1)
? I'll have to look more later, but I get
julia> @time f(g, qq, ff);
0.030570 seconds
julia> @time f2(g, qq, ff);
0.228648 seconds
Sorry, I messed up reproducing the issue with simpler code. Actually, g
is a 2-bit packed array:
using BenchmarkTools, LoopVectorization
@inline function f(g::AbstractArray{UInt8}, q::AbstractArray{T}, f) where T
oneT = one(T)
twoT = 2one(T)
I, J = size(g)
K = size(q, 1)
r = zero(T)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
blk = g[(i - 1) >> 2 + 1, j]
re = (i - 1) % 4
blk_shifted = blk >> (re << 1)
gij = blk_shifted & 0x03
r += ((gij * log(qf_local)) + (twoT - gij) * log(oneT - qf_local))
end
end
r
end
function f2(g::AbstractArray{UInt8}, q::AbstractArray{T}, f::AbstractArray{T}) where T
oneT = one(T)
twoT = 2one(T)
I, J = size(g)
K = size(q, 1)
r = zero(T)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
blk = g[(i - 1) >> 2 + 1, j]
re = (i - 1) % 4
blk_shifted = blk >> (re << 1)
gij = blk_shifted & 0x03
r += gij * log(qf_local)
end
end
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
blk = g[(i - 1) >> 2 + 1, j]
re = (i - 1) % 4
blk_shifted = blk >> (re << 1)
gij = blk_shifted & 0x03
r += (twoT - gij) * log(oneT - qf_local)
end
end
r
end
f2 (generic function with 2 methods)
I, J, K = 2500, 120000, 4
g = rand(UInt8, Int(ceil(I / 4)), J)
qq = rand(K, I)
qq./= sum(qq, dims=1)
ff = rand(K, J)
4×120000 Matrix{Float64}:
0.540408 0.298717 0.726025 0.100985 … 0.656309 0.872686 0.532832
0.199677 0.80906 0.447669 0.0500142 0.521077 0.147785 0.796754
0.426851 0.956759 0.163433 0.42933 0.137063 0.602191 0.145897
0.206329 0.228952 0.512773 0.0862005 0.627087 0.707773 0.983057
@btime f(g, qq, ff)
2.583 s (1 allocation: 16 bytes)
-1.1394245267286244e8
@btime f2(g, qq, ff)
2.193 s (1 allocation: 16 bytes)
-1.1394245267289072e8
Also, is there a tip to make it faster?