LoopVectorization.jl
LoopVectorization.jl copied to clipboard
`@avx` incorrectly handling variable redefinitions between iterations
I'm not sure if this is something that @avx is supposed to be able to handle, but when I write
using LoopVectorization
function matmul!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
@assert length(u) == size(A,1) == size(A,2) == length(v)
dl, d, du = A.dl, A.d, A.du
N = length(u); @assert N > 2
p = zero(T)
c = v[1]
n = v[2]
@inbounds u[1] = d[1] * c + du[1] * n
for i in 2:(N-1)
p = c
c = n
n = v[i+1]
u[i] = dl[i-1] * p + d[i] * c + du[i] * n
end
p = c
c = n
@inbounds u[N] = dl[N-1] * p + d[N] * c
u
end
I get correct results compared to mul!
let N = 5
T = Float64
A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
v = rand(T, N)
u1 = Array{T}(undef, N)
u2 = Array{T}(undef, N)
matmul!(u1, A, v)
mul!(u2, A, v)
u1 - u2
end
#+RESULTS:
: 5-element Vector{Float64}:
: 0.0
: 0.0
: 0.0
: 0.0
: 0.0
However, sticking @avx on the loop makes it get the wrong answer:
function matmul_avx!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
@assert length(u) == size(A,1) == size(A,2) == length(v)
dl, d, du = A.dl, A.d, A.du
N = length(u); @assert N > 2
p = zero(T)
c = v[1]
n = v[2]
@inbounds u[1] = d[1] * c + du[1] * n
@avx for i in 2:(N-1)
p = c
c = n
n = v[i+1]
u[i] = dl[i-1] * p + d[i] * c + du[i] * n
end
p = c
c = n
@inbounds u[N] = dl[N-1] * p + d[N] * c
u
end
let N = 5
T = Float64
A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
v = rand(T, N)
u1 = Array{T}(undef, N)
u2 = Array{T}(undef, N)
matmul_avx!(u1, A, v)
mul!(u2, A, v)
u1 - u2
end
#+RESULTS:
: 5-element Vector{Float64}:
: 0.0
: 5.551115123125783e-17
: 0.3137201723219849
: 0.40281562647422675
: 0.29883752908665995
One thing I tried in order to fix this was to use an intermediate array to store p, c, n, but @avx didn't like that, giving an undefvarerror:
function matmul2_avx!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
@assert length(u) == size(A,1) == size(A,2) == length(v)
dl, d, du = A.dl, A.d, A.du
N = length(u)
if N == 0
nothing
elseif N == 1
u[1] = d[1] * v[1]
return u
else
dep = MVector((zero(T), v[1], v[2]))
@inbounds u[1] = d[1] * dep[2] + du[1] * dep[3]
@avx for i in 2:(N-1)
dep[1] = dep[2]
dep[2] = dep[3]
dep[3] = v[i+1]
u[i] = dl[i-1] * dep[1] + d[i] * dep[2] + du[i] * dep[3]
end
dep[1] = dep[2]
dep[2] = dep[3]
@inbounds u[N] = dl[N-1] * dep[1] + d[N] * dep[2]
end
u
end
let N = 5
T = Float64
A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
v = rand(T, N)
u1 = Array{T}(undef, N)
u2 = Array{T}(undef, N)
matmul2_avx!(u1, A, v)
mul!(u2, A, v)
u1 - u2
end
#+RESULTS:
:RESULTS:
# [goto error]
#+begin_example
UndefVarError: ######RHS###4######5### not defined
Stacktrace:
[1] matmul2_avx!(u::Vector{Float64}, A::Tridiagonal{Float64, Vector{Float64}}, v::Vector{Float64})
@ Main ./In[179]:41
[2] top-level scope
@ In[184]:8
[3] eval
@ ./boot.jl:360 [inlined]
[4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base ./loading.jl:1094
#+end_example
:END:
Thanks for the issue. I think correctly handling this will wait for the rewrite, which will track dependencies between loop iterations. Not sure about a timeline for it, though.
I see. Does that apply as well to the MVector version?
Yes.
Worse than that, it currently tries to hoist the loads out of the loop. Although that doesn't explain the ######RHS###4######5### not defined error.
@MasonProtter, here is how to rewrite the loops to get them to work:
using LoopVectorization, LinearAlgebra
function matmul!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
@assert length(u) == size(A,1) == size(A,2) == length(v)
dl, d, du = A.dl, A.d, A.du
N = length(u); @assert N > 2
p = zero(T)
@inbounds u[1] = d[1] * v[1] + du[1] * v[2]
for i in 2:(N-1)
p = v[i-1]
c = v[i]
n = v[i+1]
u[i] = dl[i-1] * p + d[i] * c + du[i] * n
end
p = v[N-1]
c = v[N]
@inbounds u[N] = dl[N-1] * p + d[N] * c
u
end
function matmul_turbo!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
@assert length(u) == size(A,1) == size(A,2) == length(v)
dl, d, du = A.dl, A.d, A.du
N = length(u); @assert N > 2
p = zero(T)
@inbounds u[1] = d[1] * v[1] + du[1] * v[2]
@turbo for i in 2:(N-1)
p = v[i-1]
c = v[i]
n = v[i+1]
u[i] = dl[i-1] * p + d[i] * c + du[i] * n
end
p = v[N-1]
c = v[N]
@inbounds u[N] = dl[N-1] * p + d[N] * c
u
end
let N = 5
T = Float64
A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
v = rand(T, N)
u1 = Array{T}(undef, N)
u2 = Array{T}(undef, N)
matmul!(u1, A, v)
mul!(u2, A, v)
u1 - u2
end
let N = 5
T = Float64
A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
v = rand(T, N)
u1 = Array{T}(undef, N)
u2 = Array{T}(undef, N)
matmul_turbo!(u1, A, v)
mul!(u2, A, v)
u1 - u2
end
I get:
julia> let N = 5
T = Float64
A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
v = rand(T, N)
u1 = Array{T}(undef, N)
u2 = Array{T}(undef, N)
matmul!(u1, A, v)
mul!(u2, A, v)
u1 - u2
end
5-element Vector{Float64}:
0.0
0.0
0.0
0.0
0.0
julia> let N = 5
T = Float64
A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
v = rand(T, N)
u1 = Array{T}(undef, N)
u2 = Array{T}(undef, N)
matmul_turbo!(u1, A, v)
mul!(u2, A, v)
u1 - u2
end
5-element Vector{Float64}:
0.0
0.0
0.0
1.1102230246251565e-16
0.0
This obviously doesn't solve the issue, LoopVectorization should figure out how to "correctly handle variable redefinitions between iterations" itself. But it's at least a way to get the example to work.
Ah interesting, I had tried this before and was running into performance troubles, but I just tried it again and it's quite fast at least for larger matrices. Maybe a change happened in LV or maybe I was just doing something dumb. Thanks Chris!