Tullio.jl
Error when taking gradient of GPU expression
Running this code gives an error; setting CUDA.allowscalar(false) also shows that scalar operations are being performed. On the CPU everything works as expected.
using CUDA, KernelAbstractions, Tullio, Flux
f(x) = @tullio out[m, n, batch] := W[m, i] * x[i, n, batch] + b[m]
a = cu(rand(3, 5, 6)); W = cu(rand(10, 3)); b = cu(rand(10))
Flux.gradient(Flux.Params(W)) do
    sum(f(a))
end
AssertionError: length(__workgroupsize) <= length(ndrange)
Stacktrace:
[1] partition at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\nditeration.jl:103 [inlined]
[2] partition(::KernelAbstractions.Kernel{CUDADevice,KernelAbstractions.NDIteration.StaticSize{(256,)},KernelAbstractions.NDIteration.DynamicSize,var"#gpu_##🇨🇺#421#156"}, ::Tuple{}, ::Nothing) at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\KernelAbstractions.jl:368
[3] (::KernelAbstractions.Kernel{CUDADevice,KernelAbstractions.NDIteration.StaticSize{(256,)},KernelAbstractions.NDIteration.DynamicSize,var"#gpu_##🇨🇺#421#156"})(::CuArray{Float32,2,Nothing}, ::Vararg{Any,N} where N; ndrange::Tuple{}, dependencies::Nothing, workgroupsize::Nothing, progress::Function) at C:\Users\jules\.julia\packages\KernelAbstractions\yw9SF\src\backends\cuda.jl:196
[4] ∇𝒜𝒸𝓉! at C:\Users\jules\.julia\packages\Tullio\YtOlX\src\macro.jl:814 [inlined]
[5] (::var"#∇𝒜𝒸𝓉!#154")(::Type{CuArray{Float32,N,Nothing} where N}, ::CuArray{Float32,2,Nothing}, ::CuArray{Float32,3,Nothing}, ::CuArray{Float32,1,Nothing}, ::CuArray{Float32,3,Nothing}, ::CuArray{Float32,3,Nothing}, ::CuArray{Float32,2,Nothing}, ::CuArray{Float32,3,Nothing}, ::CuArray{Float32,1,Nothing}, ::Base.OneTo{Int64}, ::Base.OneTo{Int64}, ::Base.OneTo{Int64}, ::Base.OneTo{Int64}) at C:\Users\jules\.julia\packages\Tullio\YtOlX\src\macro.jl:811
[6] ∇threader(::Function, ::Type{CuArray{Float32,N,Nothing} where N}, ::Tuple{CuArray{Float32,2,Nothing},CuArray{Float32,3,Nothing},CuArray{Float32,1,Nothing},CuArray{Float32,3,Nothing},CuArray{Float32,3,Nothing},CuArray{Float32,2,Nothing},CuArray{Float32,3,Nothing},CuArray{Float32,1,Nothing}}, ::Tuple{}, ::NTuple{4,Base.OneTo{Int64}}; block::Int64) at C:\Users\jules\.julia\packages\Tullio\YtOlX\src\eval.jl:74
[7] #750#∇ℳ𝒶𝓀ℯ at C:\Users\jules\.julia\packages\Tullio\YtOlX\src\macro.jl:908 [inlined]
[8] #217 at C:\Users\jules\.julia\packages\Tullio\YtOlX\src\grad\zygote.jl:8 [inlined]
[9] (::Tullio.var"#632#back#219"{Tullio.var"#217#218"{Tullio.Eval{var"#ℳ𝒶𝓀ℯ#159"{var"#𝒜𝒸𝓉!#150"},var"#750#∇ℳ𝒶𝓀ℯ#158"{var"#∇𝒜𝒸𝓉!#154"}},Tuple{CuArray{Float32,2,Nothing},CuArray{Float32,3,Nothing},CuArray{Float32,1,Nothing}},CuArray{Float32,3,Nothing}}})(::CuArray{Float32,3,Nothing}) at C:\Users\jules\.julia\packages\ZygoteRules\6nssF\src\adjoint.jl:49
[10] Affine at .\In[23]:11 [inlined]
[11] (::typeof(∂(λ)))(::CuArray{Float32,3,Nothing}) at C:\Users\jules\.julia\packages\Zygote\EjVY4\src\compiler\interface2.jl:0
[12] #237 at .\In[88]:4 [inlined]
[13] (::typeof(∂(#237)))(::Float32) at C:\Users\jules\.julia\packages\Zygote\EjVY4\src\compiler\interface2.jl:0
[14] (::Zygote.var"#54#55"{Zygote.Params,Zygote.Context,typeof(∂(#237))})(::Float32) at C:\Users\jules\.julia\packages\Zygote\EjVY4\src\compiler\interface.jl:177
[15] gradient(::Function, ::Zygote.Params) at C:\Users\jules\.julia\packages\Zygote\EjVY4\src\compiler\interface.jl:54
[16] top-level scope at In[88]:3
Upon removing the b[m] term, the code doesn't error anymore, but scalar iteration seems to take place.
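Something like this sketch (reconstructed afterwards rather than copied from my session; here the weight is wrapped as Params([W]), and f2 is a made-up name):

using CUDA, KernelAbstractions, Tullio, Flux

# Same W and a as above, but the expression drops the + b[m] term.
f2(x) = @tullio out[m, n, batch] := W[m, i] * x[i, n, batch]

Flux.gradient(Flux.Params([W])) do
    sum(f2(a))
end
# No AssertionError here, but CUDA scalar-indexing warnings suggest the
# reverse pass iterates element by element.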
A quick look, without the GPU. Without KernelAbstractions, I think you need Params([W]), but then it works fine:
julia> using Tullio, Zygote
julia> a = (rand(3, 5, 6)); W = (rand(10, 3)); b = (rand(10))
julia> f(x) = @tullio out[m, n, batch] := W[m, i] * x[i, n, batch] + b[m]
julia> gradient(Params(W)) do
           sum(f(a))
       end
Grads(...)
julia> ans[W]
ERROR: KeyError: key [0.99091051 ... 0.763998] not found
julia> gradient(Params([W])) do
           sum(f(a))
       end
Grads(...)
julia> ans[W]
10Γ3 Array{Float64,2}:
14.5659 14.3667 13.7422
14.5659 14.3667 13.7422
14.5659 14.3667 13.7422
...
It's a bit of a hack, but threads=false currently allows it to use KernelAbstractions on the CPU. This gives the same error:
julia> using KernelAbstractions
julia> f(x) = @tullio out[m, n, batch] := W[m, i] * x[i, n, batch] + b[m] threads=false
julia> gradient(Params([W])) do
           sum(f(a))
       end
ERROR: AssertionError: length(__workgroupsize) <= length(ndrange)
Stacktrace:
[1] partition at /Users/me/.julia/packages/KernelAbstractions/yw9SF/src/nditeration.jl:103 [inlined]
[2] partition(::KernelAbstractions.Kernel{CPU,KernelAbstractions.NDIteration.StaticSize{(4,)},KernelAbstractions.NDIteration.DynamicSize,var"#cpu_##🇨🇺#679#115"}, ::Tuple{}, ::Nothing) at /Users/me/.julia/packages/KernelAbstractions/yw9SF/src/KernelAbstractions.jl:368
[3] #_#37(::Tuple{}, ::Nothing, ::Nothing, ::Nothing, ::KernelAbstractions.Kernel{CPU,KernelAbstractions.NDIteration.StaticSize{(4,)},KernelAbstractions.NDIteration.DynamicSize,var"#cpu_##🇨🇺#679#115"}, ::Array{Float64,2}, ::Vararg{Any,N} where N) at /Users/me/.julia/packages/KernelAbstractions/yw9SF/src/backends/cpu.jl:94
[4] (::Core.var"#kw#Any")(::NamedTuple{(:ndrange,),Tuple{Tuple{}}}, ::KernelAbstractions.Kernel{CPU,KernelAbstractions.NDIteration.StaticSize{(4,)},KernelAbstractions.NDIteration.DynamicSize,var"#cpu_##🇨🇺#679#115"}, ::Array{Float64,2}, ::Vararg{Any,N} where N) at ./none:0
[5] ∇𝒜𝒸𝓉! at ./logging.jl:327 [inlined]
[6] (::var"#∇𝒜𝒸𝓉!#111")(::Type{Array{Float64,N} where N}, ::Array{Float64,2}, ::Array{Float64,3}, ::Array{Float64,1}, ::FillArrays.Fill{Float64,3,Tuple{Base.OneTo{Int64},Base.OneTo{Int64},Base.OneTo{Int64}}}, ::Array{Float64,2}, ::Array{Float64,3}, ::Array{Float64,1}, ::UnitRange{Int64}, ::UnitRange{Int64}, ::UnitRange{Int64}, ::UnitRange{Int64}, ::Nothing) at ./none:0
[7] #∇threader#174 at ./none:0 [inlined]
[8] #∇threader at ./none:0 [inlined]
[9] #1096#∇ℳ𝒶𝓀ℯ at ./none:0 [inlined]
[10] #219 at /Users/me/.julia/packages/Tullio/hs3FA/src/grad/zygote.jl:7 [inlined]
...
julia> gradient(sum∘f, a)[1] # without Params(), same error
ERROR: AssertionError: length(__workgroupsize) <= length(ndrange)
Stacktrace:
[1] partition at /Users/me/.julia/packages/KernelAbstractions/yw9SF/src/nditeration.jl:103 [inlined]
You're completely right, thanks for the explanation.
I think it's a real bug, but whether it's in Tullio or in KernelAbstractions isn't clear yet.
Minimal example so far:
using Tullio, KernelAbstractions, Tracker
g(x) = @tullio y[i] := x[i] threads=false # verbose=2
gradient(sum∘g, rand(2))[1] # this works
f(x) = @tullio y[i] := x[i] * x[j] threads=false # verbose=2
gradient(sum∘f, rand(2))[1] # same error
I think the problem is here, and may prove tricky to resolve.
On the forward pass of out[m, n, batch] := W[m, i] * x[i, n, batch] + b[m], it can safely divide up the work along any of m, n, batch, and do it in parallel. But on the reverse pass, it wants to fill in 𝛥W[m, i], 𝛥x[i, n, batch] and 𝛥b[m] simultaneously. Since they have no indices in common, it can't safely divide this up: e.g. if you divide along i, then different threads will want to write into 𝛥b[m] at the same time. I had some tricks for CPU threads to work around this in some cases.
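To make that concrete, here is a hand-written sequential sketch of what the reverse pass has to compute (not the code Tullio actually generates); the comments mark why no single index can be used to split the work:

# Sequential sketch of the reverse pass for
#   out[m, n, batch] := W[m, i] * x[i, n, batch] + b[m]
# with ΔR the incoming gradient of out. Real arrays, so conj is dropped.
function reverse_sketch!(ΔW, Δx, Δb, ΔR, W, x)
    for batch in axes(x, 3), n in axes(x, 2), i in axes(W, 2), m in axes(W, 1)
        ΔW[m, i]        += ΔR[m, n, batch] * x[i, n, batch]  # indices (m, i)
        Δx[i, n, batch] += ΔR[m, n, batch] * W[m, i]         # indices (i, n, batch)
        Δb[m]           += ΔR[m, n, batch]                   # index m only
    end
    return nothing
end
# Splitting the i loop across threads makes the += into Δb[m] a race;
# splitting along m races on Δx; splitting along n or batch races on ΔW
# and Δb. No index appears in all three accumulators, so there is no safe
# way to partition this loop nest.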
For now I made this an error (https://github.com/mcabbott/Tullio.jl/commit/3eb172f8194d34dfdc5436362071dcbc9927ca53) like so:
julia> f(x) = @tullio out[m, n, batch] := W[m, i] * x[i, n, batch] + b[m] verbose=true
┌ Info: symbolic gradients
│   inbody =
│    3-element Array{Any,1}:
│     :(𝛥W[m, i] = 𝛥W[m, i] + conj(conj(𝛥ℛ[m, n, batch]) * x[i, n, batch]))
│     :(𝛥x[i, n, batch] = 𝛥x[i, n, batch] + conj(conj(𝛥ℛ[m, n, batch]) * W[m, i]))
└     :(𝛥b[m] = 𝛥b[m] + conj(conj(𝛥ℛ[m, n, batch])))
┌ Warning: KernelAbstractions failed (symbolic gradient)
│   err = KernelAbstractions can't parallelise this gradient
└ @ Tullio ~/.julia/packages/Tullio/Snt4E/src/macro.jl:844
Note also that out[m, n, batch] := W[m, i] * x[i, n, batch] doesn't have this problem, as index i is shared.
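For comparison, a quick CPU sketch of that bias-free case (again with threads=false to force the KernelAbstractions path; the arrays and the name h are made up for illustration):

using Tullio, KernelAbstractions, Zygote

W = rand(10, 3); x = rand(3, 5, 6)

# Both gradient arrays, 𝛥W[m, i] and 𝛥x[i, n, batch], carry the index i,
# so the reverse pass can be divided along i without two threads writing
# to the same element.
h(x) = @tullio out[m, n, batch] := W[m, i] * x[i, n, batch] threads=false

gradient(sum∘h, x)[1]   # expected to run without the AssertionError above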