LoopVectorization.jl Handle loop carried dependencies that are apparent from constant offsets

trafficstars

using LoopVectorization
function grad!(𝛥rec, 𝛥ℛ, rec)
    for i in 2:size(𝛥rec, 1)-1
        for j in 2:size(𝛥rec, 2)-1
            ℰ𝓍1 = conj(𝛥ℛ[1])
            ℰ𝓍2 = rec[i, j + 1] + rec[i, j - 1]
            ℰ𝓍3 = rec[i + 1, j] + ℰ𝓍2
            ℰ𝓍4 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍5 = 4 * rec[i, j]
            ℰ𝓍6 = ℰ𝓍4 - ℰ𝓍5
            ℰ𝓍7 = 2ℰ𝓍6
            ℰ𝓍8 = ℰ𝓍1 * ℰ𝓍7
            ℰ𝓍9 = conj(ℰ𝓍8)
            ℰ𝓍10 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍11 = ℰ𝓍10 - ℰ𝓍5
            ℰ𝓍12 = 2ℰ𝓍11
            ℰ𝓍13 = ℰ𝓍1 * ℰ𝓍12
            ℰ𝓍14 = conj(ℰ𝓍13)
            ℰ𝓍15 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍16 = ℰ𝓍15 - ℰ𝓍5
            ℰ𝓍17 = 2ℰ𝓍16
            ℰ𝓍18 = ℰ𝓍1 * ℰ𝓍17
            ℰ𝓍19 = conj(ℰ𝓍18)
            ℰ𝓍20 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍21 = ℰ𝓍20 - ℰ𝓍5
            ℰ𝓍22 = 2ℰ𝓍21
            ℰ𝓍23 = ℰ𝓍1 * ℰ𝓍22
            ℰ𝓍24 = conj(ℰ𝓍23)
            ℰ𝓍25 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍26 = ℰ𝓍25 - ℰ𝓍5
            ℰ𝓍27 = 2ℰ𝓍26
            ℰ𝓍28 = -4ℰ𝓍27
            ℰ𝓍29 = ℰ𝓍1 * ℰ𝓍28
            ℰ𝓍30 = conj(ℰ𝓍29)
            𝛥rec[i - 1, j] = 𝛥rec[i - 1, j] + ℰ𝓍9
            𝛥rec[i + 1, j] = 𝛥rec[i + 1, j] + ℰ𝓍14
            𝛥rec[i, j + 1] = 𝛥rec[i, j + 1] + ℰ𝓍19
            𝛥rec[i, j - 1] = 𝛥rec[i, j - 1] + ℰ𝓍24
            𝛥rec[i, j] = 𝛥rec[i, j] + ℰ𝓍30
        end
    end
end
function gradavx!(𝛥rec, 𝛥ℛ, rec)
    @avx for i in 2:size(𝛥rec, 1)-1
        for j in 2:size(𝛥rec, 2)-1
            ℰ𝓍1 = conj(𝛥ℛ[1])
            ℰ𝓍2 = rec[i, j + 1] + rec[i, j - 1]
            ℰ𝓍3 = rec[i + 1, j] + ℰ𝓍2
            ℰ𝓍4 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍5 = 4 * rec[i, j]
            ℰ𝓍6 = ℰ𝓍4 - ℰ𝓍5
            ℰ𝓍7 = 2ℰ𝓍6
            ℰ𝓍8 = ℰ𝓍1 * ℰ𝓍7
            ℰ𝓍9 = conj(ℰ𝓍8)
            ℰ𝓍10 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍11 = ℰ𝓍10 - ℰ𝓍5
            ℰ𝓍12 = 2ℰ𝓍11
            ℰ𝓍13 = ℰ𝓍1 * ℰ𝓍12
            ℰ𝓍14 = conj(ℰ𝓍13)
            ℰ𝓍15 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍16 = ℰ𝓍15 - ℰ𝓍5
            ℰ𝓍17 = 2ℰ𝓍16
            ℰ𝓍18 = ℰ𝓍1 * ℰ𝓍17
            ℰ𝓍19 = conj(ℰ𝓍18)
            ℰ𝓍20 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍21 = ℰ𝓍20 - ℰ𝓍5
            ℰ𝓍22 = 2ℰ𝓍21
            ℰ𝓍23 = ℰ𝓍1 * ℰ𝓍22
            ℰ𝓍24 = conj(ℰ𝓍23)
            ℰ𝓍25 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍26 = ℰ𝓍25 - ℰ𝓍5
            ℰ𝓍27 = 2ℰ𝓍26
            ℰ𝓍28 = -4ℰ𝓍27
            ℰ𝓍29 = ℰ𝓍1 * ℰ𝓍28
            ℰ𝓍30 = conj(ℰ𝓍29)
            𝛥rec[i - 1, j] = 𝛥rec[i - 1, j] + ℰ𝓍9
            𝛥rec[i + 1, j] = 𝛥rec[i + 1, j] + ℰ𝓍14
            𝛥rec[i, j + 1] = 𝛥rec[i, j + 1] + ℰ𝓍19
            𝛥rec[i, j - 1] = 𝛥rec[i, j - 1] + ℰ𝓍24
            𝛥rec[i, j] = 𝛥rec[i, j] + ℰ𝓍30
        end
    end
end
x = rand(10,10);
dx1 = fill!(similar(x), 0); dx2 = fill!(similar(x), 0);
del = ones(1);
grad!(dx1, del, x); gradavx!(dx2, del, x)
dx1 .- dx2

Jun 01 '20 16:06 chriselrod

Do you get a larger disagreement here than this?

julia> dx1 .- dx2 |> extrema                                             
(-3.552713678800501e-15, 3.552713678800501e-15)

Jun 01 '20 16:06 mcabbott

julia> dx1 .- dx2 |> extrema
(-4.304807428601304, 7.271464924157296)

You should be able to replicate results like that with @avx unroll=(1,4) for...

Jun 01 '20 16:06 chriselrod

Ah indeed. unroll=4 did not, but (1,4) gives large numbers.

Jun 01 '20 16:06 mcabbott

Awesome, would be curious to see how you did it. But no rush, obviously.

By making it a lot more conservative with respect to non-aliasing (e.g., stop it from reordering stores, and remove all the flags communicating non-aliasing to LLVM).

But the cost modeling had trouble, because it still wanted to unroll the j loop, even though this was very slow with the above adjustments: Data would have have to repeatedly move through stores and reloads in quick succession, which is something LoopVectorization hadn't been set up to account for (assuming non-aliasing, that doesn't happen).

Unrolling the i loop by 10 led to those impressive speeds. I think the extreme factors helped, because it lets us get around the aliasing problems and actually have independent operations with which to take advantage of out of order execution. Otherwise we're crippled by the slow store/load dependency chain through 𝛥rec.

The better solution than just preventing it from unrolling j to force it to unroll i is to actually (a) model this dependency chain, and (b) convert the dependency chain from a sequence of loads and stores into a sequence of reassignments (free) and shuffles (shuffles are needed for i - 1 -> i + 1 -> i):

𝛥rec[i - 1, j] = 𝛥rec[i - 1, j] + ℰ𝓍9
𝛥rec[i + 1, j] = 𝛥rec[i + 1, j] + ℰ𝓍14 # ℰ𝓍14 == ℰ𝓍9
𝛥rec[i, j + 1] = 𝛥rec[i, j + 1] + ℰ𝓍19 # ℰ𝓍19  == ℰ𝓍9
𝛥rec[i, j - 1] = 𝛥rec[i, j - 1] + ℰ𝓍24 # ℰ𝓍24 == ℰ𝓍9
𝛥rec[i, j] = 𝛥rec[i, j] + ℰ𝓍30

With it actually understanding this dependency chain, it could decide for itself whether to break it up by unrolling i a lot, or if it's better to unroll j to get more savings on stores/reloads (because it doesn't do the shuffles and reassignments correctly here in the face of aliasing yet, the j option was super slow, but it'll probably be pretty fast once it learns how to do it correctly).

Jun 02 '20 07:06 chriselrod

LoopVectorization.jl LoopVectorization.jl copied to clipboard

Handle loop carried dependencies that are apparent from constant offsets

LoopVectorization.jl
LoopVectorization.jl copied to clipboard