LoopVectorization.jl icon indicating copy to clipboard operation
LoopVectorization.jl copied to clipboard

LoopVectorization returning zeros in @tturbo accelerated region

Open Chiil opened this issue 3 years ago • 3 comments

I have written a macro that generates a fast loop for interpolations. Without @tturbo and with Threads.@threads the code runs fine, but it fails with @tturbo which does not do any computations. I print the generated code to the screen, this is rather straightforward and I do not know what goes wrong here:

## Load the packages.
using LoopVectorization
using Interpolations
using Statistics
using BenchmarkTools
using PyPlot
using Printf


## Define the upsample functions.
macro upsample_lin_fast(suffix, ifac, jfac, kfac, ioff, joff, koff)
    ex_inner = []
    for kk in 0:kfac-1, jj in 0:jfac-1, ii in 0:ifac-1
        push!(ex_inner, :(i_hi = $ifac*i+$ii+is_hi), :(j_hi = $jfac*j+$jj+js_hi), :(k_hi = $kfac*k+$kk+ks_hi))

        i_pos = -1/2 + ioff + (1/2 - ioff + ii)/ifac
        j_pos = -1/2 + joff + (1/2 - joff + jj)/jfac
        k_pos = -1/2 + koff + (1/2 - koff + kk)/kfac

        i_lo_off = floor(Int, i_pos) + is_lo
        j_lo_off = floor(Int, j_pos) + js_lo
        k_lo_off = floor(Int, k_pos) + ks_lo
        push!(ex_inner, :(i_lo = i + $i_lo_off), :(j_lo = j + $j_lo_off), :(k_lo = k + $k_lo_off))

        fi = mod(i_pos, 1)
        fj = mod(j_pos, 1)
        fk = mod(k_pos, 1)

        coef = zeros(2, 2, 2)
        coef[1, 1, 1] = (((1-fi)*fj+fi-1)*fk+(fi-1)*fj-fi+1)
        coef[2, 1, 1] = ((fi*fj-fi)*fk-fi*fj+fi)
        coef[1, 2, 1] = ((fi-1)*fj*fk+(1-fi)*fj)
        coef[2, 2, 1] = (fi*fj-fi*fj*fk)
        coef[1, 1, 2] = ((fi-1)*fj-fi+1)*fk
        coef[2, 1, 2] = (fi-fi*fj)*fk
        coef[1, 2, 2] = (1-fi)*fj*fk
        coef[2, 2, 2] = fi*fj*fk

        rhs_list = []
        for kc in 1:2, jc in 1:2, ic in 1:2
            if !(coef[ic, jc, kc] ≈ 0)
                push!(rhs_list, :( $(coef[ic, jc, kc])*lo[i_lo+$(ic-1), j_lo+$(jc-1), k_lo+$(kc-1)] ))
            end
        end

        rhs = Expr(:call, :+, rhs_list...)

        push!(ex_inner, :(hi[i_hi, j_hi, k_hi] = $rhs))
    end

    ex_inner_block = Expr(:block, ex_inner...)

    name = Symbol(@sprintf "upsample_lin_%s!" suffix)
    ex = quote
        function $name(hi, lo, itot, jtot, ktot, is_hi, js_hi, ks_hi, is_lo, js_lo, ks_lo)
            # THIS WORKS WELL
            # Threads.@threads for k in 0:ktot-1
            #     for j in 0:jtot-1
            #         @inbounds @simd for i in 0:itot-1
            #             $ex_inner_block
            #         end
            #     end
            # end
            @tturbo for k in 0:ktot-1
                for j in 0:jtot-1
                    for i in 0:itot-1
                        $ex_inner_block
                    end
                end
            end
        end
    end
    println("FAILING CODE START")
    println(ex)
    println("FAILING END")

    return esc(ex)
end

function upsample_lin_ref!(hi, lo, x_hi, y_hi, z_hi, x_lo, y_lo, z_lo, is_hi, js_hi, ks_hi, ie_hi, je_hi, ke_hi)
    interp = interpolate((x_lo, y_lo, z_lo), lo, (Gridded(Linear()), Gridded(Linear()), Gridded(Linear())))
    hi[is_hi:ie_hi, js_hi:je_hi, ks_hi:ke_hi] .= interp(x_hi[is_hi:ie_hi], y_hi[js_hi:je_hi], z_hi[ks_hi:ke_hi])
end


## Set up the grids.
itot_lo = 256; jtot_lo = 192; ktot_lo = 128
itot_hi = 512; jtot_hi = 384; ktot_hi = 256
igc_lo = 1; jgc_lo = 1; kgc_lo = 1
igc_hi = 1; jgc_hi = 1; kgc_hi = 1

# Derive dimensions
ifac = itot_hi÷itot_lo; jfac = jtot_hi÷jtot_lo; kfac = ktot_hi÷ktot_lo;
icells_lo = itot_lo + 2igc_lo; jcells_lo = jtot_lo + 2jgc_lo; kcells_lo = ktot_lo + 2kgc_lo
icells_hi = itot_hi + 2igc_hi; jcells_hi = jtot_hi + 2jgc_hi; kcells_hi = ktot_hi + 2kgc_hi
is_lo = igc_lo+1; js_lo = jgc_lo+1; ks_lo = kgc_lo+1
is_hi = igc_hi+1; js_hi = jgc_hi+1; ks_hi = kgc_hi+1
ie_lo = igc_lo+itot_lo; je_lo = jgc_lo+jtot_lo; ke_lo = kgc_lo+ktot_lo
ie_hi = igc_hi+itot_hi; je_hi = jgc_hi+jtot_hi; ke_hi = kgc_hi+ktot_hi

a_lo = rand(icells_lo, jcells_lo, kcells_lo)
a_hi_ref = zeros(icells_hi, jcells_hi, kcells_hi)
a_hi = zeros(icells_hi, jcells_hi, kcells_hi)
a_hi_fast = zeros(icells_hi, jcells_hi, kcells_hi)

dx_lo = 1/itot_lo; dy_lo = 1/jtot_lo; dz_lo = 1/ktot_lo
x_lo = collect(range(0.5-igc_lo, length=icells_lo)) .* dx_lo
y_lo = collect(range(0.5-jgc_lo, length=jcells_lo)) .* dy_lo
z_lo = collect(range(0.5-kgc_lo, length=kcells_lo)) .* dz_lo
xh_lo = collect(range(-igc_lo, length=icells_lo)) .* dx_lo
yh_lo = collect(range(-jgc_lo, length=jcells_lo)) .* dy_lo

dx_hi = 1/itot_hi; dy_hi = 1/jtot_hi; dz_hi = 1/ktot_hi
x_hi = collect(range(0.5-igc_hi, length=icells_hi)) .* dx_hi
y_hi = collect(range(0.5-jgc_hi, length=jcells_hi)) .* dy_hi
z_hi = collect(range(0.5-kgc_hi, length=kcells_hi)) .* dz_hi
xh_hi = collect(range(-igc_hi, length=icells_hi)) .* dx_hi
yh_hi = collect(range(-jgc_hi, length=jcells_hi)) .* dy_hi


## Compute a reference using Interpolations.jl
@btime upsample_lin_ref!(
    a_hi_ref, a_lo, x_hi, y_hi, z_hi, x_lo, y_lo, z_lo,
    is_hi, js_hi, ks_hi, ie_hi, je_hi, ke_hi)
@upsample_lin_fast("222", 2, 2, 2, 0, 0, 0)
@btime upsample_lin_222!(
    a_hi_fast, a_lo, itot_lo, jtot_lo, ktot_lo,
    is_hi, js_hi, ks_hi, is_lo, js_lo, ks_lo)

a_lo_int = @view a_lo[is_lo:ie_lo, js_lo:je_lo, ks_lo:ke_lo]
a_hi_ref_int = @view a_hi_ref[is_hi:ie_hi, js_hi:je_hi, ks_hi:ke_hi]
a_hi_int = @view a_hi[is_hi:ie_hi, js_hi:je_hi, ks_hi:ke_hi]
a_hi_fast_int = @view a_hi_fast[is_hi:ie_hi, js_hi:je_hi, ks_hi:ke_hi]

println("Fast equal to ref: ", a_hi_fast_int ≈ a_hi_ref_int)


## Plot the output.
x_lo_int = @view x_lo[is_lo:ie_lo]
x_hi_int = @view x_hi[is_hi:ie_hi]

figure()
plot(x_hi_int, a_hi_ref_int[:, 1, 1], "C0-^")
plot(x_hi_int, a_hi_fast_int[:, 1, 1], "C1-*")
plot(x_lo_int, a_lo_int[:, 1, 1], "k:+")
tight_layout()
display(gcf())
show()

Chiil avatar Nov 22 '21 16:11 Chiil

In the actual version, LoopVectorization returns an error, which is acceptable.

Chiil avatar Mar 24 '22 14:03 Chiil

I haven't had the time to look at this. The free time I've been spending on loop optimizaiton and SIMD has been dedicated to rewriting LoopVectorization, so my plan for supporting this would be having it work in the rewrite. Although the rewrite won't support threading for a while (and it'll still be a long time before it is ready).

But just looking at the @macroexpand

i_hi = 2i + 0 + is_hi
j_hi = 2j + 0 + js_hi
k_hi = 2k + 0 + ks_hi
i_lo = i + 1
j_lo = j + 1
k_lo = k + 1
hi[i_hi, j_hi, k_hi] = 0.015625 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.046875 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.046875 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.140625 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.046875 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.140625 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.140625 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.421875 * lo[i_lo + 1, j_lo + 1, k_lo + 1]
i_hi = 2i + 1 + is_hi
j_hi = 2j + 0 + js_hi
k_hi = 2k + 0 + ks_hi
i_lo = i + 2
j_lo = j + 1
k_lo = k + 1
hi[i_hi, j_hi, k_hi] = 0.046875 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.015625 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.140625 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.046875 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.140625 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.046875 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.421875 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.140625 * lo[i_lo + 1, j_lo + 1, k_lo + 1]
i_hi = 2i + 0 + is_hi
j_hi = 2j + 1 + js_hi
k_hi = 2k + 0 + ks_hi
i_lo = i + 1
j_lo = j + 2
k_lo = k + 1
hi[i_hi, j_hi, k_hi] = 0.046875 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.140625 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.015625 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.046875 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.140625 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.421875 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.046875 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.140625 * lo[i_lo + 1, j_lo + 1, k_lo + 1]
i_hi = 2i + 1 + is_hi
j_hi = 2j + 1 + js_hi
k_hi = 2k + 0 + ks_hi
i_lo = i + 2
j_lo = j + 2
k_lo = k + 1
hi[i_hi, j_hi, k_hi] = 0.140625 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.046875 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.046875 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.015625 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.421875 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.140625 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.140625 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.046875 * lo[i_lo + 1, j_lo + 1, k_lo + 1]
i_hi = 2i + 0 + is_hi
j_hi = 2j + 0 + js_hi
k_hi = 2k + 1 + ks_hi
i_lo = i + 1
j_lo = j + 1
k_lo = k + 2
hi[i_hi, j_hi, k_hi] = 0.046875 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.140625 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.140625 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.421875 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.015625 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.046875 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.046875 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.140625 * lo[i_lo + 1, j_lo + 1, k_lo + 1]
i_hi = 2i + 1 + is_hi
j_hi = 2j + 0 + js_hi
k_hi = 2k + 1 + ks_hi
i_lo = i + 2
j_lo = j + 1
k_lo = k + 2
hi[i_hi, j_hi, k_hi] = 0.140625 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.046875 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.421875 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.140625 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.046875 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.015625 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.140625 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.046875 * lo[i_lo + 1, j_lo + 1, k_lo + 1]
i_hi = 2i + 0 + is_hi
j_hi = 2j + 1 + js_hi
k_hi = 2k + 1 + ks_hi
i_lo = i + 1
j_lo = j + 2
k_lo = k + 2
hi[i_hi, j_hi, k_hi] = 0.140625 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.421875 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.046875 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.140625 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.046875 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.140625 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.015625 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.046875 * lo[i_lo + 1, j_lo + 1, k_lo + 1]
i_hi = 2i + 1 + is_hi
j_hi = 2j + 1 + js_hi
k_hi = 2k + 1 + ks_hi
i_lo = i + 2
j_lo = j + 2
k_lo = k + 2
hi[i_hi, j_hi, k_hi] = 0.421875 * lo[i_lo + 0, j_lo + 0, k_lo + 0] + 0.140625 * lo[i_lo + 1, j_lo + 0, k_lo + 0] + 0.140625 * lo[i_lo + 0, j_lo + 1, k_lo + 0] + 0.046875 * lo[i_lo + 1, j_lo + 1, k_lo + 0] + 0.140625 * lo[i_lo + 0, j_lo + 0, k_lo + 1] + 0.046875 * lo[i_lo + 1, j_lo + 0, k_lo + 1] + 0.046875 * lo[i_lo + 0, j_lo + 1, k_lo + 1] + 0.015625 * lo[i_lo + 1, j_lo + 1, k_lo + 1]

I'm guessing there is a bug in how LoopVectorization understands the indices used for storing into hi.

chriselrod avatar Mar 24 '22 15:03 chriselrod

Alright. Thanks for the feedback. I am already glad the function does not return zeros anymore, because in an earlier version the function did not return an error.

Chiil avatar Mar 24 '22 15:03 Chiil