LoopVectorization.jl icon indicating copy to clipboard operation
LoopVectorization.jl copied to clipboard

Handle loop carried dependencies that are apparent from constant offsets

Open chriselrod opened this issue 5 years ago β€’ 4 comments
trafficstars

using LoopVectorization
function grad!(π›₯rec, π›₯β„›, rec)
    for i in 2:size(π›₯rec, 1)-1
        for j in 2:size(π›₯rec, 2)-1
            ℰ𝓍1 = conj(π›₯β„›[1])
            ℰ𝓍2 = rec[i, j + 1] + rec[i, j - 1]
            ℰ𝓍3 = rec[i + 1, j] + ℰ𝓍2
            ℰ𝓍4 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍5 = 4 * rec[i, j]
            ℰ𝓍6 = ℰ𝓍4 - ℰ𝓍5
            ℰ𝓍7 = 2ℰ𝓍6
            ℰ𝓍8 = ℰ𝓍1 * ℰ𝓍7
            ℰ𝓍9 = conj(ℰ𝓍8)
            ℰ𝓍10 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍11 = ℰ𝓍10 - ℰ𝓍5
            ℰ𝓍12 = 2ℰ𝓍11
            ℰ𝓍13 = ℰ𝓍1 * ℰ𝓍12
            ℰ𝓍14 = conj(ℰ𝓍13)
            ℰ𝓍15 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍16 = ℰ𝓍15 - ℰ𝓍5
            ℰ𝓍17 = 2ℰ𝓍16
            ℰ𝓍18 = ℰ𝓍1 * ℰ𝓍17
            ℰ𝓍19 = conj(ℰ𝓍18)
            ℰ𝓍20 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍21 = ℰ𝓍20 - ℰ𝓍5
            ℰ𝓍22 = 2ℰ𝓍21
            ℰ𝓍23 = ℰ𝓍1 * ℰ𝓍22
            ℰ𝓍24 = conj(ℰ𝓍23)
            ℰ𝓍25 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍26 = ℰ𝓍25 - ℰ𝓍5
            ℰ𝓍27 = 2ℰ𝓍26
            ℰ𝓍28 = -4ℰ𝓍27
            ℰ𝓍29 = ℰ𝓍1 * ℰ𝓍28
            ℰ𝓍30 = conj(ℰ𝓍29)
            π›₯rec[i - 1, j] = π›₯rec[i - 1, j] + ℰ𝓍9
            π›₯rec[i + 1, j] = π›₯rec[i + 1, j] + ℰ𝓍14
            π›₯rec[i, j + 1] = π›₯rec[i, j + 1] + ℰ𝓍19
            π›₯rec[i, j - 1] = π›₯rec[i, j - 1] + ℰ𝓍24
            π›₯rec[i, j] = π›₯rec[i, j] + ℰ𝓍30
        end
    end
end
function gradavx!(π›₯rec, π›₯β„›, rec)
    @avx for i in 2:size(π›₯rec, 1)-1
        for j in 2:size(π›₯rec, 2)-1
            ℰ𝓍1 = conj(π›₯β„›[1])
            ℰ𝓍2 = rec[i, j + 1] + rec[i, j - 1]
            ℰ𝓍3 = rec[i + 1, j] + ℰ𝓍2
            ℰ𝓍4 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍5 = 4 * rec[i, j]
            ℰ𝓍6 = ℰ𝓍4 - ℰ𝓍5
            ℰ𝓍7 = 2ℰ𝓍6
            ℰ𝓍8 = ℰ𝓍1 * ℰ𝓍7
            ℰ𝓍9 = conj(ℰ𝓍8)
            ℰ𝓍10 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍11 = ℰ𝓍10 - ℰ𝓍5
            ℰ𝓍12 = 2ℰ𝓍11
            ℰ𝓍13 = ℰ𝓍1 * ℰ𝓍12
            ℰ𝓍14 = conj(ℰ𝓍13)
            ℰ𝓍15 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍16 = ℰ𝓍15 - ℰ𝓍5
            ℰ𝓍17 = 2ℰ𝓍16
            ℰ𝓍18 = ℰ𝓍1 * ℰ𝓍17
            ℰ𝓍19 = conj(ℰ𝓍18)
            ℰ𝓍20 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍21 = ℰ𝓍20 - ℰ𝓍5
            ℰ𝓍22 = 2ℰ𝓍21
            ℰ𝓍23 = ℰ𝓍1 * ℰ𝓍22
            ℰ𝓍24 = conj(ℰ𝓍23)
            ℰ𝓍25 = rec[i - 1, j] + ℰ𝓍3
            ℰ𝓍26 = ℰ𝓍25 - ℰ𝓍5
            ℰ𝓍27 = 2ℰ𝓍26
            ℰ𝓍28 = -4ℰ𝓍27
            ℰ𝓍29 = ℰ𝓍1 * ℰ𝓍28
            ℰ𝓍30 = conj(ℰ𝓍29)
            π›₯rec[i - 1, j] = π›₯rec[i - 1, j] + ℰ𝓍9
            π›₯rec[i + 1, j] = π›₯rec[i + 1, j] + ℰ𝓍14
            π›₯rec[i, j + 1] = π›₯rec[i, j + 1] + ℰ𝓍19
            π›₯rec[i, j - 1] = π›₯rec[i, j - 1] + ℰ𝓍24
            π›₯rec[i, j] = π›₯rec[i, j] + ℰ𝓍30
        end
    end
end
x = rand(10,10);
dx1 = fill!(similar(x), 0); dx2 = fill!(similar(x), 0);
del = ones(1);
grad!(dx1, del, x); gradavx!(dx2, del, x)
dx1 .- dx2

chriselrod avatar Jun 01 '20 16:06 chriselrod

Do you get a larger disagreement here than this?

julia> dx1 .- dx2 |> extrema                                             
(-3.552713678800501e-15, 3.552713678800501e-15)

mcabbott avatar Jun 01 '20 16:06 mcabbott

julia> dx1 .- dx2 |> extrema
(-4.304807428601304, 7.271464924157296)

You should be able to replicate results like that with @avx unroll=(1,4) for...

chriselrod avatar Jun 01 '20 16:06 chriselrod

Ah indeed. unroll=4 did not, but (1,4) gives large numbers.

mcabbott avatar Jun 01 '20 16:06 mcabbott

Awesome, would be curious to see how you did it. But no rush, obviously.

By making it a lot more conservative with respect to non-aliasing (e.g., stop it from reordering stores, and remove all the flags communicating non-aliasing to LLVM).

But the cost modeling had trouble, because it still wanted to unroll the j loop, even though this was very slow with the above adjustments: Data would have have to repeatedly move through stores and reloads in quick succession, which is something LoopVectorization hadn't been set up to account for (assuming non-aliasing, that doesn't happen).

Unrolling the i loop by 10 led to those impressive speeds. I think the extreme factors helped, because it lets us get around the aliasing problems and actually have independent operations with which to take advantage of out of order execution. Otherwise we're crippled by the slow store/load dependency chain through π›₯rec.

The better solution than just preventing it from unrolling j to force it to unroll i is to actually (a) model this dependency chain, and (b) convert the dependency chain from a sequence of loads and stores into a sequence of reassignments (free) and shuffles (shuffles are needed for i - 1 -> i + 1 -> i):

π›₯rec[i - 1, j] = π›₯rec[i - 1, j] + ℰ𝓍9
π›₯rec[i + 1, j] = π›₯rec[i + 1, j] + ℰ𝓍14 # ℰ𝓍14 == ℰ𝓍9
π›₯rec[i, j + 1] = π›₯rec[i, j + 1] + ℰ𝓍19 # ℰ𝓍19  == ℰ𝓍9
π›₯rec[i, j - 1] = π›₯rec[i, j - 1] + ℰ𝓍24 # ℰ𝓍24 == ℰ𝓍9
π›₯rec[i, j] = π›₯rec[i, j] + ℰ𝓍30

With it actually understanding this dependency chain, it could decide for itself whether to break it up by unrolling i a lot, or if it's better to unroll j to get more savings on stores/reloads (because it doesn't do the shuffles and reassignments correctly here in the face of aliasing yet, the j option was super slow, but it'll probably be pretty fast once it learns how to do it correctly).

chriselrod avatar Jun 02 '20 07:06 chriselrod