SparseDiffTools.jl
out of place Jacobian decomposition mutates
Performance results are highly variable when using the cached out-of-place methods. I've also gotten segfaults, although I cannot reliably reproduce that part of the issue. Using CuArrays seems to amplify the problem.
using Revise
using SparseDiffTools, Flux, BenchmarkTools, CuArrays, CUDAnative, ForwardDiff, LinearAlgebra, Random

function mwe(N, ::Type{T}=Float32) where T<:Real
    A::Matrix{T} = rand(T, N, N)
    cuA = A |> gpu

    # test function, plus a scalar-kernel specialization for CuArrays
    function f!(out, A)
        out .= A .+ A .* A .+ 1f0
    end
    krn(x) = x + x*x + 1f0
    function f!(out, A::CuMatrix{Float32})
        out .= krn.(A)
    end
    function f(A)
        return A .+ A .* A .+ 1f0
    end
    function f(A::CuMatrix{Float32})
        return krn.(A)
    end

    J = rand(T, N^2, N^2)

    @info "test cpu (inplace)"
    cache = SparseDiffTools.ForwardColorJacCache(f!, A, dx = similar(A))
    SparseDiffTools.forwarddiff_color_jacobian!(J, f!, A, cache)
    (N < 5) && @info "test ∇f cpu inplace: $(J)"
    (N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian!($J, $f!, $A, $cache)

    @info "test cpu (out of place)"
    cacheoop = SparseDiffTools.ForwardColorJacCache(f, A, dx = similar(A))
    J = SparseDiffTools.forwarddiff_color_jacobian(f, A, cacheoop)
    (N < 5) && @info "test ∇f cpu oop: $(J)"
    (N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $A, $cacheoop)

    @info "test gpu (inplace)"
    cuJ = J |> gpu
    cucache = SparseDiffTools.ForwardColorJacCache(f!, cuA, dx = similar(cuA))
    SparseDiffTools.forwarddiff_color_jacobian!(cuJ, f!, cuA, cucache)
    (N < 5) && @info "test ∇f gpu inplace: $(cuJ)"
    (N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian!($cuJ, $f!, $cuA, $cucache)

    @info "test gpu (outofplace)"
    cucacheoop = SparseDiffTools.ForwardColorJacCache(f, cuA, dx = similar(cuA))
    cuJ = SparseDiffTools.forwarddiff_color_jacobian(f, cuA, cucacheoop)
    (N < 5) && @info "test ∇f gpu oop: $(cuJ)"
    (N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $cuA, $cucacheoop)
end

mwe(12)
Output:
[ Info: test cpu (inplace)
46.500 μs (8 allocations: 320 bytes)
[ Info: test cpu (out of place)
181.946 ms (80271 allocations: 853.10 MiB)
[ Info: test gpu (inplace)
12.860 ms (10965 allocations: 402.14 KiB)
[ Info: test gpu (outofplace)
3.110 s (3516919 allocations: 122.65 MiB)
Confirmed not to be due to scalar indexing:
using Flux, SparseDiffTools, BenchmarkTools, CuArrays, ForwardDiff, LinearAlgebra, Random
CuArrays.allowscalar(false)   # error out on any scalar indexing of CuArrays

N = 10
T = Float32
A = rand(T, N, N)
cuA = A |> gpu

function f!(out, A)
    out .= A .+ A .* A .+ 1f0
end
krn(x) = x + x*x + 1f0
function f!(out, A::CuMatrix{Float32})
    out .= krn.(A)
end
function f(A)
    return A .+ A .* A .+ 1f0
end
function f(A::CuMatrix{Float32})
    return krn.(A)
end

J = rand(T, N^2, N^2)

@info "test cpu (inplace)"
cache = SparseDiffTools.ForwardColorJacCache(f!, A, dx = similar(A))
SparseDiffTools.forwarddiff_color_jacobian!(J, f!, A, cache)
(N < 5) && @info "test ∇f cpu inplace: $(J)"
(N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian!($J, $f!, $A, $cache)

@info "test cpu (out of place)"
cacheoop = SparseDiffTools.ForwardColorJacCache(f, A, dx = similar(A))
J = SparseDiffTools.forwarddiff_color_jacobian(f, A, cacheoop)
(N < 5) && @info "test ∇f cpu oop: $(J)"
(N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $A, $cacheoop)

@info "test gpu (inplace)"
cuJ = J |> gpu
cucache = SparseDiffTools.ForwardColorJacCache(f!, cuA, dx = similar(cuA))
SparseDiffTools.forwarddiff_color_jacobian!(cuJ, f!, cuA, cucache)
(N < 5) && @info "test ∇f gpu inplace: $(cuJ)"
(N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian!($cuJ, $f!, $cuA, $cucache)

@info "test gpu (outofplace)"
cucacheoop = SparseDiffTools.ForwardColorJacCache(f, cuA, dx = similar(cuA))
cuJ = SparseDiffTools.forwarddiff_color_jacobian(f, cuA, cucacheoop)
(N < 5) && @info "test ∇f gpu oop: $(cuJ)"
(N > 5) && @btime SparseDiffTools.forwarddiff_color_jacobian($f, $cuA, $cucacheoop)
The problem is likely that https://github.com/JuliaDiff/SparseDiffTools.jl/blob/v1.8.0/src/differentiation/compute_jacobian_ad.jl#L123-L126 produces too many kernels. It's somewhat tied up with https://github.com/JuliaDiff/SparseDiffTools.jl/pull/106 and the upstream issue https://github.com/JuliaGPU/CuArrays.jl/issues/571.
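For intuition, here is a standalone CPU illustration of the cost pattern (a hypothetical sketch, not the library's actual code): rebuilding a result with a fresh out-of-place broadcast on every pass allocates a new array each time, while an in-place update reuses one buffer. On a CuArray, each of those extra allocations also has to go through the memory pool, which is where the upstream CuArrays issue comes in.

# Hypothetical illustration only; the names and structure are not SparseDiffTools internals.
function accumulate_oop(J, partials)
    for p in partials
        J = J .+ p        # out of place: allocates a fresh matrix every iteration
    end
    return J
end

function accumulate_ip!(J, partials)
    for p in partials
        J .+= p           # in place: mutating broadcast, reuses the existing buffer
    end
    return J
end

partials = [rand(Float32, 144, 144) for _ in 1:12]   # one dense block per "color"
J = zeros(Float32, 144, 144)
accumulate_oop(J, partials)   # materializes ~12 temporary matrices
accumulate_ip!(J, partials)   # allocates nothing beyond J itself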
https://github.com/JuliaDiff/SparseDiffTools.jl/pull/115 makes this much faster. I'll leave this open because it's not perfect (the out-of-place path now mutates), but the speed boost should essentially make this issue go away for most people.
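For anyone hitting this on a released version, the timings above suggest preallocating the Jacobian and using the in-place entry point as a workaround. A minimal standalone sketch of that, reusing the same calls as the MWE (the function names g/g! are just for illustration):

using SparseDiffTools

g(A)       = A .+ A .* A .+ 1f0     # same test function as above
g!(out, A) = (out .= g(A))          # wrapped for the in-place API

A = rand(Float32, 12, 12)
J = zeros(Float32, length(A), length(A))              # preallocated dense Jacobian
cache = SparseDiffTools.ForwardColorJacCache(g!, A, dx = similar(A))
SparseDiffTools.forwarddiff_color_jacobian!(J, g!, A, cache)   # fast, mutating path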