
Sporadic GPU compilation error mid-training

bgroenks96 opened this issue on Jul 20, 2023 · 2 comments

Hi,

We have observed a strange issue where a GPU compilation error occurs (seemingly randomly) during training with BFGS:

ERROR: GPU compilation of MethodInstance for (::GPUArrays.var"#broadcast_kernel#26")(::CUDA.CuKernelContext, ::ComponentMatrix{Float64, CUDA.CuDeviceMatrix{Float64, 1}, Tuple{Axis{(layer_1 = ViewAxis(1:48, Axis(weight = ViewAxis(1:32, ShapedAxis((16, 2), NamedTuple())), bias = ViewAxis(33:48, ShapedAxis((16, 1), NamedTuple())))), layer_2 = ViewAxis(49:320, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_3 = ViewAxis(321:592, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_4 = ViewAxis(593:609, Axis(weight = ViewAxis(1:16, ShapedAxis((1, 16), NamedTuple())), bias = ViewAxis(17:17, ShapedAxis((1, 1), NamedTuple())))))}, Axis{(layer_1 = ViewAxis(1:48, Axis(weight = ViewAxis(1:32, ShapedAxis((16, 2), NamedTuple())), bias = ViewAxis(33:48, ShapedAxis((16, 1), NamedTuple())))), layer_2 = ViewAxis(49:320, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_3 = ViewAxis(321:592, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_4 = ViewAxis(593:609, Axis(weight = ViewAxis(1:16, ShapedAxis((1, 16), NamedTuple())), bias = ViewAxis(17:17, ShapedAxis((1, 1), NamedTuple())))))}}}, ::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2}, Tuple{ComponentArrays.CombinedAxis{Axis{(layer_1 = ViewAxis(1:48, Axis(weight = ViewAxis(1:32, ShapedAxis((16, 2), NamedTuple())), bias = ViewAxis(33:48, ShapedAxis((16, 1), NamedTuple())))), layer_2 = ViewAxis(49:320, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_3 = ViewAxis(321:592, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_4 = ViewAxis(593:609, Axis(weight = ViewAxis(1:16, ShapedAxis((1, 16), NamedTuple())), bias = ViewAxis(17:17, ShapedAxis((1, 1), NamedTuple())))))}, Base.OneTo{Int64}}, ComponentArrays.CombinedAxis{Axis{(layer_1 = ViewAxis(1:48, Axis(weight = ViewAxis(1:32, ShapedAxis((16, 2), NamedTuple())), bias = ViewAxis(33:48, ShapedAxis((16, 1), NamedTuple())))), layer_2 = ViewAxis(49:320, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_3 = ViewAxis(321:592, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_4 = ViewAxis(593:609, Axis(weight = ViewAxis(1:16, ShapedAxis((1, 16), NamedTuple())), bias = ViewAxis(17:17, ShapedAxis((1, 1), NamedTuple())))))}, Base.OneTo{Int64}}}, typeof(identity), Tuple{Base.Broadcast.Extruded{Matrix{Float64}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}}}, ::Int64) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2}, Tuple{ComponentArrays.CombinedAxis{Axis{(layer_1 = ViewAxis(1:48, Axis(weight = ViewAxis(1:32, ShapedAxis((16, 2), NamedTuple())), bias = ViewAxis(33:48, ShapedAxis((16, 1), NamedTuple())))), layer_2 = ViewAxis(49:320, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_3 = ViewAxis(321:592, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_4 = ViewAxis(593:609, Axis(weight = ViewAxis(1:16, ShapedAxis((1, 16), NamedTuple())), bias = ViewAxis(17:17, ShapedAxis((1, 1), NamedTuple())))))}, Base.OneTo{Int64}}, ComponentArrays.CombinedAxis{Axis{(layer_1 = ViewAxis(1:48, Axis(weight = ViewAxis(1:32, ShapedAxis((16, 2), NamedTuple())), bias = ViewAxis(33:48, ShapedAxis((16, 1), NamedTuple())))), layer_2 = ViewAxis(49:320, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_3 = ViewAxis(321:592, Axis(weight = ViewAxis(1:256, ShapedAxis((16, 16), NamedTuple())), bias = ViewAxis(257:272, ShapedAxis((16, 1), NamedTuple())))), layer_4 = ViewAxis(593:609, Axis(weight = ViewAxis(1:16, ShapedAxis((1, 16), NamedTuple())), bias = ViewAxis(17:17, ShapedAxis((1, 1), NamedTuple())))))}, Base.OneTo{Int64}}}, typeof(identity), Tuple{Base.Broadcast.Extruded{Matrix{Float64}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}}}, which is not isbits:
  .args is of type Tuple{Base.Broadcast.Extruded{Matrix{Float64}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}} which is not isbits.
    .1 is of type Base.Broadcast.Extruded{Matrix{Float64}, Tuple{Bool, Bool}, Tuple{Int64, Int64}} which is not isbits.
      .x is of type Matrix{Float64} which is not isbits.
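
From the trace, the argument that is not isbits is an Extruded Matrix{Float64}, i.e. a plain CPU matrix that is apparently being broadcast into the GPU-backed ComponentMatrix. As a point of reference, here is a minimal standalone sketch (separate from the MWE below) that triggers the same class of KernelError:

using CUDA

# Broadcasting a host Matrix into a device array makes the broadcast kernel
# capture a non-isbits Matrix{Float64} argument, as in the trace above.
gpu_buf = CUDA.zeros(Float64, 4, 4)
cpu_mat = rand(4, 4)
gpu_buf .= cpu_mat  # KernelError: passing and using non-bitstype argument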

Here is the MWE:

# Neural networks and optimization
using Lux, LuxCUDA  # LuxCUDA provides the CUDA backend for the gpu/cpu helpers used below
using NeuralPDE
using Optimization, OptimizationOptimJL, OptimizationOptimisers

# Packages for numerical solutions
using MethodOfLines
using OrdinaryDiffEq

# Other utilities
using ComponentArrays
using Plots
using Random

########
# Simple heat conduction model (no forcings)
# Uses ModelingToolkit
########

using DiffEqBase
using ModelingToolkit
using IntervalSets
using IfElse

"""
    linear_heat_conduction(; t_domain=(0.0,24*3600.0), z_domain=(0.0,100.0))

Simple linear 1D heat conduction on a rectangular domain.
"""
function linear_heat_conduction(; t_domain=(0.0,24*3600.0), z_domain=(0.0,100.0), T_ub=0.0, T_lb=0.0, α=0.1)
    dvars = @variables T(..)
    ivars = @parameters t z
    # params = @parameters α
    Dzz = Differential(z)^2
    Dt = Differential(t)
    eqs = [
        Dt(T(t,z)) ~ α*Dzz(T(t,z))
    ]
    # Space and time domains
    domains = [
        t ∈ Interval(t_domain...),
        z ∈ Interval(z_domain...),
    ]
    bcs = [
        T(t_domain[1],z) ~ sin(2π*z),
        T(t,z_domain[1]) ~ T_ub,
        T(t,z_domain[2]) ~ T_lb,
    ]
    pdesys = PDESystem(eqs, bcs, domains, [t,z], [T(t,z)], name=:heat)
    return (; pdesys, ivars, dvars)
end

function simulate_heat_conduction_simple(t_domain, z_domain; kwargs...)
    heat_pde_system, ivars, dvars = linear_heat_conduction(; t_domain, z_domain, kwargs...)
    t, z = ivars
    
    # solve with method of lines
    @info "Running reference simulation"
    mol_discretization = MOLFiniteDifference([z => 0.01], t, approx_order=2)
    mol_prob = discretize(heat_pde_system, mol_discretization)
    mol_sol = solve(mol_prob, ImplicitEuler(), reltol=1e-6)
    return (
        prob = mol_prob,
        sol = mol_sol,
    )
end

function fit_pinn_heat_conduction_simple(t_domain, z_domain; maxiters=100, strategy=GridTraining(0.1), opt=BFGS(), kwargs...)
    heat_pde_system, ivars, dvars = linear_heat_conduction(; t_domain, z_domain, kwargs...)
    t, z = ivars

    # now solve with PINN
    @info "Fitting PINN"
    chain = Lux.Chain(
        Dense(2, 16),
        Dense(16, 16, Lux.σ),
        Dense(16, 16, Lux.σ),
        Dense(16, 1)
    )
    ps, st = Lux.setup(Random.default_rng(), chain)
    # flatten the parameters to a ComponentArray, move them to the GPU, and promote to Float64
    ps = ps |> ComponentArray |> gpu .|> Float64
    pinn_discretization = PhysicsInformedNN(chain, strategy, init_params=ps)
    pinn_prob = discretize(heat_pde_system, pinn_discretization)
    callback = function (p, l)
        println("Current loss is: $l")
        @assert all(isfinite.(p))
        return false
    end
    
    # optimize
    res = Optimization.solve(pinn_prob, opt; callback = callback, maxiters = maxiters, allow_f_increases=true, iterations=maxiters)
    return (
        model = chain,
        prob = pinn_prob,
        phi = pinn_discretization.phi,
        res = res,
    )
end

t_domain = (0.0, 60.0)
z_domain = (0.0, 10.0)
ref4 = simulate_heat_conduction_simple(t_domain, z_domain, α=1e-3);
pinn4 = fit_pinn_heat_conduction_simple(t_domain, z_domain, maxiters=1500, α=1e-3);
plt1 = heatmap(collect(LinRange(t_domain[1], t_domain[end], 100)), collect(LinRange(z_domain[1], z_domain[end], 100)), (t,z) -> ref4.sol(t,z)[1], yflip=true)
plt2 = heatmap(collect(LinRange(t_domain[1], t_domain[end], 100)), collect(LinRange(z_domain[1], z_domain[end], 100)), (t,z) -> pinn4.phi([t,z], cpu(pinn4.res.u))[1], yflip=true)
plt3 = heatmap(collect(LinRange(t_domain[1], t_domain[end], 100)), collect(LinRange(z_domain[1], z_domain[end], 100)), (t,z) -> pinn4.phi([t,z], cpu(pinn4.res.u))[1] - ref4.sol(t,z)[1], yflip=true)
plot(plt1, plt2, plt3, dpi=150)
savefig("output/pinn_experiment4.png")
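
In case it is useful for debugging, here is a hypothetical guard (not part of the MWE above) that asserts the flattened parameters are a bits-type GPU array before training, to rule out a stray host array in the initial state:

using CUDA, ComponentArrays

# Hypothetical check: the underlying parameter storage should be a CuArray
# with an isbits element type before training starts.
function check_gpu_params(ps)
    data = getdata(ps)
    @assert data isa CuArray "parameters are not GPU-backed: $(typeof(data))"
    @assert isbitstype(eltype(data)) "eltype $(eltype(data)) is not isbits"
    return ps
end

Since ps is explicitly moved to the GPU in the MWE, a check like this should pass, which suggests the Matrix{Float64} in the trace is materialized somewhere inside the training loop itself.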
