Wrong derivative when reducing over `min` using KernelAbstractions
MWE:
```julia
julia> using Tullio, CUDA, KernelAbstractions, Zygote, FiniteDifferences
julia> g(μ) = ifelse(μ == 4, 1, -1)
g (generic function with 1 method)
julia> push!(Tullio.BASE_NOGRAD, :g);
julia> f2(w, k) = @tullio (min) f[m] := w[j, m] * g(μ) * (k[j, μ] - k[m, μ])^2
f2 (generic function with 1 method)
julia> using Random; Random.seed!(1);
julia> k = rand(Float32, 2, 4)
2×4 Array{Float32,2}:
0.547994 0.567737 0.27934 0.8135
0.819285 0.557336 0.777828 0.00389743
julia> w = rand(Float32, 2, 2)
2×2 Array{Float32,2}:
0.699683 0.903188
0.568097 0.416541
julia> Δ = rand(Float32, 2)
2-element Array{Float32,1}:
0.51531243
0.13594759
julia> Zygote.pullback(f2, w, k)[2](Δ)[2]
2×4 Array{Float32,2}:
0.0 0.0 0.414277 0.0
0.0 0.0 -0.414277 0.0
julia> Zygote.pullback(f2, cu(w), cu(k))[2](cu(Δ))[2]
2×4 CuArray{Float32,2}:
0.0 0.0 0.414277 0.0
0.0 0.0 -0.122415 0.0
julia> j′vp(central_fdm(5, 1), f2, Δ, w, k)[2]
2×4 Array{Float32,2}:
5.59755f-9 5.59755f-9 0.414277 5.59755f-9
5.59755f-9 5.59755f-9 -0.414277 5.59755f-9
```
This seems fine if I use `+` instead of `min` as the reducer.

My guess is that this is a race condition caused by a threading bug, or whatever the right GPU terminology is. I can't check this locally at the moment, but I can look at the code it's generating:
```
┌ Warning: can't parallelise this gradient, no shared indices (symbolic gradient)
└ @ Tullio ~/.julia/dev/Tullio/src/macro.jl:1058
┌ Info: =====KA===== KernelAbstractions kernel (symbolic gradient)
│ verbosetidy(kex1) =
│ quote
│ KernelAbstractions.@kernel function var"##🇨🇺#520"(𝛥w, 𝛥k, 𝛥ℛ::AbstractArray{𝒯}, ℛ, w, k, 𝒶𝓍m, 𝒶𝓍j, 𝒶𝓍μ, ♻️, 💀) where 𝒯
│ (m,) = @index(Global, NTuple)
│ begin
│ 🆗𝓇𝒽𝓈 = 0
│ for μ = 𝒶𝓍μ
│ for j = 𝒶𝓍j
│ 𝓇𝒽𝓈⛰ = Tullio.onlyone(ℛ[m] == w[j, m] * g(μ) * (k[j, μ] - k[m, μ]) ^ 2, 🆗𝓇𝒽𝓈)
│ begin
│ 𝛥w[j, m] += (ifelse)(𝓇𝒽𝓈⛰, 𝛥ℛ[m] * conj(g(μ) * (k[j, μ] - k[m, μ]) ^ 2), (zero)(𝒯))
│ 𝛥k[j, μ] += (ifelse)(𝓇𝒽𝓈⛰, 𝛥ℛ[m] * conj(((2 * (k[j, μ] - k[m, μ])) * g(μ)) * w[j, m]), (zero)(𝒯))
│ 𝛥k[m, μ] += (ifelse)(𝓇𝒽𝓈⛰, 𝛥ℛ[m] * conj((-(2 * (k[j, μ] - k[m, μ])) * g(μ)) * w[j, m]), (zero)(𝒯))
│ end
│ 🆗𝓇𝒽𝓈 += Tullio.anyone(𝓇𝒽𝓈⛰)
│ end
│ end
│ end
│ end
└ end
┌ Info: =====KA===== KernelAbstractions CUDA actor (symbolic gradient)
│ verbosetidy(kex2) =
│ quote
│ local @inline(function ∇𝒜𝒸𝓉!(::Type{<:CuArray}, 𝛥w, 𝛥k, 𝛥ℛ::AbstractArray{𝒯}, ℛ, w, k, 𝒶𝓍m, 𝒶𝓍j, 𝒶𝓍μ, ♻️ = nothing, 💀 = true) where 𝒯
│ @info "running KernelAbstractions + CUDA actor (symbolic gradient)" maxlog = 3 _id = 0x345f3c433fa484fd
│ cu_kern! = var"##🇨🇺#520"(CUDADevice(), 256)
│ (first)(𝒶𝓍m) == 1 || (throw)("KernelAbstractions can't handle OffsetArrays here")
│ 𝒜𝒸𝒸 = cu_kern!(𝛥w, 𝛥k, 𝛥ℛ::AbstractArray{𝒯}, ℛ, w, k, 𝒶𝓍m, 𝒶𝓍j, 𝒶𝓍μ, ♻️, 💀; ndrange = tuple(length(𝒶𝓍m)), dependencies = Event(CUDADevice()))
│ KernelAbstractions.wait(CUDADevice(), 𝒜𝒸𝒸)
│ end)
└ end
[ Info: success expanding KernelAbstractions.@kernel
┌ Info: <<<<< Gradient maker function
│ verbosetidy(ex_make) =
│ quote
│ local function ∇ℳ𝒶𝓀ℯ(𝛥ℛ::AbstractArray{𝒯}, ℛ, w, k) where 𝒯
│ local 𝛥w = fill!(similar(w, Base.promote_type(eltype(w), 𝒯)), 0)
│ local 𝛥k = fill!(similar(k, Base.promote_type(eltype(k), 𝒯)), 0)
│ local 𝒶𝓍m = (axes)(w, 2)
│ (axes)(k, 1) == (axes)(w, 2) || (throw)("range of index m must agree")
│ local 𝒶𝓍μ = (axes)(k, 2)
│ (axes)(k, 2) == (axes)(k, 2) || (throw)("range of index μ must agree")
│ local 𝒶𝓍j = (axes)(w, 1)
│ (axes)(k, 1) == (axes)(w, 1) || (throw)("range of index j must agree")
│ (Tullio.∇threader)(∇𝒜𝒸𝓉!, (Tullio.storage_type)(𝛥w, 𝛥k, w, k), tuple(𝛥w, 𝛥k, 𝛥ℛ, ℛ, w, k), tuple(), tuple(𝒶𝓍m, 𝒶𝓍j, 𝒶𝓍μ), 20165)
│ return (𝛥w, 𝛥k)
│ end
└ end
```
The line `∇threader(∇𝒜𝒸𝓉!, storage_type(𝛥w, 𝛥k, w, k), tuple(𝛥w, 𝛥k, 𝛥ℛ, ℛ, w, k), tuple(), tuple(𝒶𝓍m, 𝒶𝓍j, 𝒶𝓍μ), 20165)` tells the CPU threader that it cannot multi-thread this gradient: there is no index which appears on all of the gradient arrays, hence no safe way to divide the work. That is why it prints the warning above.

But the line `𝒜𝒸𝒸 = cu_kern!(...; ndrange = tuple(length(𝒶𝓍m)), dependencies = Event(CUDADevice()))` looks like it still threads over `m` on the GPU, where the write `𝛥k[j, μ] += ...` is then not safe, since many values of `m` update the same element. That could produce exactly this kind of wrong answer.
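To make the hazard concrete, here is a minimal sketch using plain CPU threads (not Tullio's actual kernel; the `contrib` array and `accumulate_over_m` function are made up for the example). Splitting the work over `m` while accumulating into `𝛥k[j, μ]`, which carries no `m` index, means several workers do an unsynchronised read-modify-write on the same element, so increments can be lost:

```julia
# Hypothetical illustration only, not Tullio's code: parallelising over m while
# accumulating into Δk[j, μ] (no m index) is a read-modify-write race.
using Base.Threads

function accumulate_over_m(contrib)        # contrib[j, μ, m]
    Δk = zeros(size(contrib, 1), size(contrib, 2))
    @threads for m in axes(contrib, 3)     # analogous to ndrange = (length(𝒶𝓍m),)
        for μ in axes(contrib, 2), j in axes(contrib, 1)
            Δk[j, μ] += contrib[j, μ, m]   # unsafe: many m's hit the same slot
        end
    end
    return Δk
end

contrib = ones(2, 4, 100_000)
# With more than one thread this comparison frequently fails, because updates are dropped:
accumulate_over_m(contrib) ≈ dropdims(sum(contrib; dims=3), dims=3)
```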
Will see if I can figure out why it's doing that. Although, once that's fixed, GPU kernels with nothing to thread over also have problems, I think.
Sorry I haven't got back to this. I decided that the way I was dividing loops into safe/unsafe was incorrect for the min/max gradients, but I haven't re-organised them yet. (I might change the approach to those, which would change the constraints.)
Is that affected by #86?
No, I think it's an unrelated logic mistake: copying over to `min`/`max` what made sense for `+`.
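For a rough picture of why the `+` logic doesn't transfer (an illustrative sketch, not Tullio's implementation): with `+` every term contributes to the gradient, so partial results accumulated by independent chunks of the loops can simply be added together, whereas with `min`/`max` only the winning term gets a nonzero gradient, so which loops are safe to split up is governed by different constraints.

```julia
# Gradient structure of the two reductions, for a single output element:
x = [3.0, 1.0, 2.0]

# d(sum(x))/dx: every entry contributes, so chunked accumulation composes safely.
grad_sum = ones(length(x))

# d(minimum(x))/dx: only the (single) minimising entry contributes.
grad_min = [i == argmin(x) ? 1.0 : 0.0 for i in eachindex(x)]
```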