GeophysicalFlows.jl
                                
                                 GeophysicalFlows.jl copied to clipboard
                                
                                    GeophysicalFlows.jl copied to clipboard
                            
                            
                            
                        Reduce GPU scalar operations; use `views` in `MultiLayerQG.energies`
This is an attempt to resolve #273.
I benchmarked it. It's not any faster than v0.14.2. But it's cleaner.
Seems like scalar operations mentioned in #273 were not causing any bottleneck after all...
Bench script:
using GeophysicalFlows, Printf
using Random: seed!
dev = GPU()     # Device (CPU/GPU)
n = 512                  # 2D resolution = n²
stepper = "FilteredRK4"  # timestepper
     dt = 0.5e-3         # timestep
 nsteps = 20000          # total number of time-steps
 nsubs  = 200            # number of time-steps for plotting (nsteps must be multiple of nsubs)
 L = 2π                  # domain size
μ = 5e-2                 # bottom drag
β = 5                    # the y-gradient of planetary PV
 
nlayers = 2              # number of layers
f₀, g = 1, 1             # Coriolis parameter and gravitational constant
H = [0.2, 0.8]           # the rest depths of each layer
ρ = [4.0, 5.0]           # the density of each layer
 
U = zeros(nlayers)       # the imposed mean zonal flow in each layer
U[1] = 1.0
U[2] = 0.0
prob = MultiLayerQG.Problem(nlayers, dev; nx=n, Lx=L, f₀, g, H, ρ, U, μ, β,
                            dt, stepper, aliased_fraction=0)
sol, clock, params, vars, grid = prob.sol, prob.clock, prob.params, prob.vars, prob.grid
x, y = grid.x, grid.y
seed!(1234) # reset of the random number generator for reproducibility
q₀  = 1e-2 * ArrayType(dev)(randn((grid.nx, grid.ny, nlayers)))
q₀h = prob.timestepper.filter .* rfft(q₀, (1, 2)) # apply rfft  only in dims=1, 2
q₀  = irfft(q₀h, grid.nx, (1, 2))                 # apply irfft only in dims=1, 2
MultiLayerQG.set_q!(prob, q₀)
# Create Diagnostics -- `energies` function is imported at the top.
E = Diagnostic(MultiLayerQG.energies, prob; nsteps)
diags = [E] # A list of Diagnostics types passed to "stepforward!" will  be updated every timestep.
nothing # hide
stepforward!(prob, diags, nsubs)
startwalltime = time()
for j in 0:round(Int, nsteps / nsubs)
  if j % (1000 / nsubs) == 0
    cfl = clock.dt * maximum([maximum(vars.u) / grid.dx, maximum(vars.v) / grid.dy])
    
    log = @sprintf("step: %04d, t: %.1f, cfl: %.2f, KE₁: %.3e, KE₂: %.3e, PE: %.3e, walltime: %.2f min",
                   clock.step, clock.t, cfl, E.data[E.i][1][1], E.data[E.i][1][2], E.data[E.i][2][1], (time()-startwalltime)/60)
    println(log)
  end
  
  stepforward!(prob, diags, nsubs)
  MultiLayerQG.updatevars!(prob)
end
Using this PR
step: 0200, t: 0.1, cfl: 0.00, KE₁: 1.132e-09, KE₂: 5.210e-09, PE: 2.476e-10, walltime: 0.00 min
step: 1200, t: 0.6, cfl: 0.00, KE₁: 1.551e-09, KE₂: 4.983e-09, PE: 1.317e-09, walltime: 0.07 min
step: 2200, t: 1.1, cfl: 0.00, KE₁: 2.142e-09, KE₂: 4.868e-09, PE: 2.515e-09, walltime: 0.14 min
step: 3200, t: 1.6, cfl: 0.00, KE₁: 2.718e-09, KE₂: 4.855e-09, PE: 3.338e-09, walltime: 0.21 min
step: 4200, t: 2.1, cfl: 0.00, KE₁: 3.221e-09, KE₂: 4.975e-09, PE: 3.492e-09, walltime: 0.29 min
step: 5200, t: 2.6, cfl: 0.00, KE₁: 3.872e-09, KE₂: 5.214e-09, PE: 3.589e-09, walltime: 0.36 min
step: 6200, t: 3.1, cfl: 0.00, KE₁: 5.084e-09, KE₂: 5.449e-09, PE: 5.163e-09, walltime: 0.43 min
step: 7200, t: 3.6, cfl: 0.00, KE₁: 6.869e-09, KE₂: 5.836e-09, PE: 7.685e-09, walltime: 0.50 min
step: 8200, t: 4.1, cfl: 0.00, KE₁: 9.229e-09, KE₂: 6.496e-09, PE: 1.071e-08, walltime: 0.57 min
step: 9200, t: 4.6, cfl: 0.00, KE₁: 1.229e-08, KE₂: 7.577e-09, PE: 1.397e-08, walltime: 0.64 min
step: 10200, t: 5.1, cfl: 0.00, KE₁: 1.630e-08, KE₂: 9.216e-09, PE: 1.759e-08, walltime: 0.72 min
step: 11200, t: 5.6, cfl: 0.00, KE₁: 2.200e-08, KE₂: 1.142e-08, PE: 2.344e-08, walltime: 0.78 min
step: 12200, t: 6.1, cfl: 0.00, KE₁: 3.007e-08, KE₂: 1.443e-08, PE: 3.233e-08, walltime: 0.85 min
step: 13200, t: 6.6, cfl: 0.00, KE₁: 4.135e-08, KE₂: 1.871e-08, PE: 4.477e-08, walltime: 0.92 min
step: 14200, t: 7.1, cfl: 0.00, KE₁: 5.703e-08, KE₂: 2.477e-08, PE: 6.181e-08, walltime: 0.99 min
step: 15200, t: 7.6, cfl: 0.00, KE₁: 7.877e-08, KE₂: 3.345e-08, PE: 8.464e-08, walltime: 1.06 min
step: 16200, t: 8.1, cfl: 0.00, KE₁: 1.093e-07, KE₂: 4.566e-08, PE: 1.168e-07, walltime: 1.13 min
step: 17200, t: 8.6, cfl: 0.00, KE₁: 1.524e-07, KE₂: 6.279e-08, PE: 1.628e-07, walltime: 1.21 min
step: 18200, t: 9.1, cfl: 0.00, KE₁: 2.130e-07, KE₂: 8.692e-08, PE: 2.275e-07, walltime: 1.28 min
step: 19200, t: 9.6, cfl: 0.00, KE₁: 2.983e-07, KE₂: 1.209e-07, PE: 3.185e-07, walltime: 1.35 min
step: 20200, t: 10.1, cfl: 0.00, KE₁: 4.182e-07, KE₂: 1.691e-07, PE: 4.456e-07, walltime: 1.42 min
With GeophysicalFlows v0.14.2
step: 0200, t: 0.1, cfl: 0.00, KE₁: 1.132e-09, KE₂: 5.210e-09, PE: 2.476e-10, walltime: 0.00 min
step: 1200, t: 0.6, cfl: 0.00, KE₁: 1.551e-09, KE₂: 4.983e-09, PE: 1.317e-09, walltime: 0.07 min
step: 2200, t: 1.1, cfl: 0.00, KE₁: 2.142e-09, KE₂: 4.868e-09, PE: 2.515e-09, walltime: 0.14 min
step: 3200, t: 1.6, cfl: 0.00, KE₁: 2.718e-09, KE₂: 4.855e-09, PE: 3.338e-09, walltime: 0.20 min
step: 4200, t: 2.1, cfl: 0.00, KE₁: 3.221e-09, KE₂: 4.975e-09, PE: 3.492e-09, walltime: 0.27 min
step: 5200, t: 2.6, cfl: 0.00, KE₁: 3.872e-09, KE₂: 5.214e-09, PE: 3.589e-09, walltime: 0.34 min
step: 6200, t: 3.1, cfl: 0.00, KE₁: 5.084e-09, KE₂: 5.449e-09, PE: 5.163e-09, walltime: 0.41 min
step: 7200, t: 3.6, cfl: 0.00, KE₁: 6.869e-09, KE₂: 5.836e-09, PE: 7.685e-09, walltime: 0.47 min
step: 8200, t: 4.1, cfl: 0.00, KE₁: 9.229e-09, KE₂: 6.496e-09, PE: 1.071e-08, walltime: 0.54 min
step: 9200, t: 4.6, cfl: 0.00, KE₁: 1.229e-08, KE₂: 7.577e-09, PE: 1.397e-08, walltime: 0.59 min
step: 10200, t: 5.1, cfl: 0.00, KE₁: 1.630e-08, KE₂: 9.216e-09, PE: 1.759e-08, walltime: 0.65 min
step: 11200, t: 5.6, cfl: 0.00, KE₁: 2.200e-08, KE₂: 1.142e-08, PE: 2.344e-08, walltime: 0.71 min
step: 12200, t: 6.1, cfl: 0.00, KE₁: 3.007e-08, KE₂: 1.443e-08, PE: 3.233e-08, walltime: 0.79 min
step: 13200, t: 6.6, cfl: 0.00, KE₁: 4.135e-08, KE₂: 1.871e-08, PE: 4.477e-08, walltime: 0.86 min
step: 14200, t: 7.1, cfl: 0.00, KE₁: 5.703e-08, KE₂: 2.477e-08, PE: 6.181e-08, walltime: 0.93 min
step: 15200, t: 7.6, cfl: 0.00, KE₁: 7.877e-08, KE₂: 3.345e-08, PE: 8.464e-08, walltime: 1.00 min
step: 16200, t: 8.1, cfl: 0.00, KE₁: 1.093e-07, KE₂: 4.566e-08, PE: 1.168e-07, walltime: 1.08 min
step: 17200, t: 8.6, cfl: 0.00, KE₁: 1.524e-07, KE₂: 6.279e-08, PE: 1.628e-07, walltime: 1.14 min
step: 18200, t: 9.1, cfl: 0.00, KE₁: 2.130e-07, KE₂: 8.692e-08, PE: 2.275e-07, walltime: 1.20 min
step: 19200, t: 9.6, cfl: 0.00, KE₁: 2.983e-07, KE₂: 1.209e-07, PE: 3.185e-07, walltime: 1.26 min
step: 20200, t: 10.1, cfl: 0.00, KE₁: 4.182e-07, KE₂: 1.691e-07, PE: 4.456e-07, walltime: 1.32 min
Can't find dotview anywhere....
OK! Got it.
julia> Base.Broadcast.dotview
dotview (generic function with 2 methods)
@glwagner I'm thinking we merge this. It does not do what the PR title currently suggests but it does cleanup the code a bit. What do you think?