LoopVectorization.jl `@avx` incorrectly handling variable redefinitions between iterations

trafficstars

I'm not sure if this is something that @avx is supposed to be able to handle, but when I write

using LoopVectorization

function matmul!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
    @assert length(u) == size(A,1) == size(A,2) == length(v)
    dl, d, du = A.dl, A.d, A.du
    N = length(u); @assert N > 2

    p = zero(T)
    c = v[1]
    n = v[2] 
    @inbounds u[1] = d[1] * c + du[1] * n
    for i in 2:(N-1)
        p = c
        c = n
        n = v[i+1]
        u[i] = dl[i-1] * p + d[i] * c + du[i] * n
    end
    p = c
    c = n
    @inbounds u[N] = dl[N-1] * p + d[N] * c
    u
end

I get correct results compared to mul!

let N = 5
    T = Float64
    A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
    v  = rand(T, N)
    u1 = Array{T}(undef, N)
    u2 = Array{T}(undef, N)

    matmul!(u1, A, v)
       mul!(u2, A, v)
    u1 - u2
end

#+RESULTS:
: 5-element Vector{Float64}:
:  0.0
:  0.0
:  0.0
:  0.0
:  0.0

However, sticking @avx on the loop makes it get the wrong answer:

function matmul_avx!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
    @assert length(u) == size(A,1) == size(A,2) == length(v)
    dl, d, du = A.dl, A.d, A.du
    N = length(u); @assert N > 2

    p = zero(T)
    c = v[1]
    n = v[2] 
    @inbounds u[1] = d[1] * c + du[1] * n
    @avx for i in 2:(N-1)
        p = c
        c = n
        n = v[i+1]
        u[i] = dl[i-1] * p + d[i] * c + du[i] * n
    end
    p = c
    c = n
    @inbounds u[N] = dl[N-1] * p + d[N] * c
    u
end

let N = 5
    T = Float64
    A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
    v  = rand(T, N)
    u1 = Array{T}(undef, N)
    u2 = Array{T}(undef, N)

    matmul_avx!(u1, A, v)
       mul!(u2, A, v)
    u1 - u2
end

#+RESULTS:
: 5-element Vector{Float64}:
:  0.0
:  5.551115123125783e-17
:  0.3137201723219849
:  0.40281562647422675
:  0.29883752908665995

One thing I tried in order to fix this was to use an intermediate array to store p, c, n, but @avx didn't like that, giving an undefvarerror:

function matmul2_avx!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
    @assert length(u) == size(A,1) == size(A,2) == length(v)
    dl, d, du = A.dl, A.d, A.du
    N = length(u)
    if N == 0
        nothing
    elseif N == 1
        u[1] = d[1] * v[1]
        return u
    else
        dep = MVector((zero(T), v[1], v[2]))
        @inbounds u[1] = d[1] * dep[2] + du[1] * dep[3]
        @avx for i in 2:(N-1)
            dep[1] = dep[2]
            dep[2] = dep[3]
            dep[3] = v[i+1]
            u[i] = dl[i-1] * dep[1] + d[i] * dep[2] + du[i] * dep[3]
        end
        dep[1] = dep[2]
        dep[2] = dep[3]
        @inbounds u[N] = dl[N-1] * dep[1] + d[N] * dep[2]
    end
    u
end

let N = 5
    T = Float64
    A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
    v  = rand(T, N)
    u1 = Array{T}(undef, N)
    u2 = Array{T}(undef, N)

    matmul2_avx!(u1, A, v)
    mul!(u2, A, v)
    u1 - u2
end

#+RESULTS:
:RESULTS:
# [goto error]
#+begin_example
UndefVarError: ######RHS###4######5### not defined

Stacktrace:
 [1] matmul2_avx!(u::Vector{Float64}, A::Tridiagonal{Float64, Vector{Float64}}, v::Vector{Float64})
   @ Main ./In[179]:41
 [2] top-level scope
   @ In[184]:8
 [3] eval
   @ ./boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094
#+end_example
:END:

May 19 '21 01:05 MasonProtter

Thanks for the issue. I think correctly handling this will wait for the rewrite, which will track dependencies between loop iterations. Not sure about a timeline for it, though.

May 19 '21 04:05 chriselrod

I see. Does that apply as well to the MVector version?

May 19 '21 04:05 MasonProtter

Yes. Worse than that, it currently tries to hoist the loads out of the loop. Although that doesn't explain the ######RHS###4######5### not defined error.

May 19 '21 08:05 chriselrod

@MasonProtter, here is how to rewrite the loops to get them to work:

using LoopVectorization, LinearAlgebra
function matmul!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
  @assert length(u) == size(A,1) == size(A,2) == length(v)
  dl, d, du = A.dl, A.d, A.du
  N = length(u); @assert N > 2

  p = zero(T)
  @inbounds u[1] = d[1] * v[1] + du[1] * v[2]
  for i in 2:(N-1)
    p = v[i-1]
    c = v[i]
    n = v[i+1]
    u[i] = dl[i-1] * p + d[i] * c + du[i] * n
  end
  p = v[N-1]
  c = v[N]
  @inbounds u[N] = dl[N-1] * p + d[N] * c
  u
end
function matmul_turbo!(u::AbstractVector{T}, A::Tridiagonal{T}, v::AbstractVector{T}) where {T}
  @assert length(u) == size(A,1) == size(A,2) == length(v)
  dl, d, du = A.dl, A.d, A.du
  N = length(u); @assert N > 2

  p = zero(T)
  @inbounds u[1] = d[1] * v[1] + du[1] * v[2]
  @turbo for i in 2:(N-1)
    p = v[i-1]
    c = v[i]
    n = v[i+1]
    u[i] = dl[i-1] * p + d[i] * c + du[i] * n
  end
  p = v[N-1]
  c = v[N]
  @inbounds u[N] = dl[N-1] * p + d[N] * c
  u
end
let N = 5
    T = Float64
    A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
    v  = rand(T, N)
    u1 = Array{T}(undef, N)
    u2 = Array{T}(undef, N)

    matmul!(u1, A, v)
       mul!(u2, A, v)
    u1 - u2
end
let N = 5
    T = Float64
    A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
    v  = rand(T, N)
    u1 = Array{T}(undef, N)
    u2 = Array{T}(undef, N)

    matmul_turbo!(u1, A, v)
       mul!(u2, A, v)
    u1 - u2
end

I get:

julia> let N = 5
           T = Float64
           A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
           v  = rand(T, N)
           u1 = Array{T}(undef, N)
           u2 = Array{T}(undef, N)

           matmul!(u1, A, v)
              mul!(u2, A, v)
           u1 - u2
       end
5-element Vector{Float64}:
 0.0
 0.0
 0.0
 0.0
 0.0

julia> let N = 5
           T = Float64
           A = Tridiagonal(rand(T, N-1), rand(T, N), rand(T, N-1))
           v  = rand(T, N)
           u1 = Array{T}(undef, N)
           u2 = Array{T}(undef, N)

           matmul_turbo!(u1, A, v)
              mul!(u2, A, v)
           u1 - u2
       end
5-element Vector{Float64}:
 0.0
 0.0
 0.0
 1.1102230246251565e-16
 0.0

This obviously doesn't solve the issue, LoopVectorization should figure out how to "correctly handle variable redefinitions between iterations" itself. But it's at least a way to get the example to work.

May 29 '21 10:05 chriselrod

Ah interesting, I had tried this before and was running into performance troubles, but I just tried it again and it's quite fast at least for larger matrices. Maybe a change happened in LV or maybe I was just doing something dumb. Thanks Chris!

May 29 '21 18:05 MasonProtter

LoopVectorization.jl LoopVectorization.jl copied to clipboard

`@avx` incorrectly handling variable redefinitions between iterations

LoopVectorization.jl
LoopVectorization.jl copied to clipboard