LoopVectorization.jl
LoopVectorization.jl copied to clipboard
Weird behavior
trafficstars
Function f(), which goes over the data once, is more than ten times slower than function f2(), which goes over the data twice. Is this some kind of bug?
using BenchmarkTools, LoopVectorization, Random
function f(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
r = zero(T)
I, J = size(g)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
r += g[i, j] * log(qf_local) + (2one(T) - g[i, j]) * log(one(T) - qf_local)
end
end
end
function f2(g::AbstractArray{T}, q::AbstractArray{T}, f::AbstractArray{T}) where T
r = zero(T)
I, J = size(g)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
r += g[i, j] * log(qf_local)
end
end
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
r += (2one(T) - g[i, j]) * log(one(T) - qf_local)
end
end
end
g = rand(100000, 1000)
qq = rand(3, 100000)
ff = rand(3, 1000)
@btime f(g, qq, ff)
1.961 s (22 allocations: 560 bytes)
@btime f2(g, qq, ff)
172.651 ms (53 allocations: 1.77 KiB)
K is not defined.
I assume you're missing the line
K = size(q,1)
? I'll have to look more later, but I get
julia> @time f(g, qq, ff);
0.030570 seconds
julia> @time f2(g, qq, ff);
0.228648 seconds
Sorry, I messed up reproducing the issue with simpler code. Actually, g is a 2-bit packed array:
using BenchmarkTools, LoopVectorization
@inline function f(g::AbstractArray{UInt8}, q::AbstractArray{T}, f) where T
oneT = one(T)
twoT = 2one(T)
I, J = size(g)
K = size(q, 1)
r = zero(T)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
blk = g[(i - 1) >> 2 + 1, j]
re = (i - 1) % 4
blk_shifted = blk >> (re << 1)
gij = blk_shifted & 0x03
r += ((gij * log(qf_local)) + (twoT - gij) * log(oneT - qf_local))
end
end
r
end
function f2(g::AbstractArray{UInt8}, q::AbstractArray{T}, f::AbstractArray{T}) where T
oneT = one(T)
twoT = 2one(T)
I, J = size(g)
K = size(q, 1)
r = zero(T)
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
blk = g[(i - 1) >> 2 + 1, j]
re = (i - 1) % 4
blk_shifted = blk >> (re << 1)
gij = blk_shifted & 0x03
r += gij * log(qf_local)
end
end
@turbo for j in 1:J
for i in 1:I
qf_local = zero(T)
for k in 1:K
qf_local += q[k, i] * f[k, j]
end
blk = g[(i - 1) >> 2 + 1, j]
re = (i - 1) % 4
blk_shifted = blk >> (re << 1)
gij = blk_shifted & 0x03
r += (twoT - gij) * log(oneT - qf_local)
end
end
r
end
f2 (generic function with 2 methods)
I, J, K = 2500, 120000, 4
g = rand(UInt8, Int(ceil(I / 4)), J)
qq = rand(K, I)
qq./= sum(qq, dims=1)
ff = rand(K, J)
4×120000 Matrix{Float64}:
0.540408 0.298717 0.726025 0.100985 … 0.656309 0.872686 0.532832
0.199677 0.80906 0.447669 0.0500142 0.521077 0.147785 0.796754
0.426851 0.956759 0.163433 0.42933 0.137063 0.602191 0.145897
0.206329 0.228952 0.512773 0.0862005 0.627087 0.707773 0.983057
@btime f(g, qq, ff)
2.583 s (1 allocation: 16 bytes)
-1.1394245267286244e8
@btime f2(g, qq, ff)
2.193 s (1 allocation: 16 bytes)
-1.1394245267289072e8
Also, is there a tip to make it faster?