OrdinaryDiffEq.jl
Krylov.jl optimizations for ODE usage
MWE of preconditioned Newton-Krylov solves with OrdinaryDiffEq and Sundials:
using OrdinaryDiffEq, LinearSolve, Test
const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
brusselator_f(x, y, t) = (((x-0.3)^2 + (y-0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.0
limit(a, N) = a == N+1 ? 1 : a == 0 ? N : a  # wrap stencil indices (periodic BCs)
function brusselator_2d_loop(du, u, p, t)
    A, B, alpha, dx = p
    alpha = alpha/dx^2
    @inbounds for I in CartesianIndices((N, N))
        i, j = Tuple(I)
        x, y = xyd_brusselator[I[1]], xyd_brusselator[I[2]]
        ip1, im1, jp1, jm1 = limit(i+1, N), limit(i-1, N), limit(j+1, N), limit(j-1, N)
        du[i,j,1] = alpha*(u[im1,j,1] + u[ip1,j,1] + u[i,jp1,1] + u[i,jm1,1] - 4u[i,j,1]) +
                    B + u[i,j,1]^2*u[i,j,2] - (A + 1)*u[i,j,1] + brusselator_f(x, y, t)
        du[i,j,2] = alpha*(u[im1,j,2] + u[ip1,j,2] + u[i,jp1,2] + u[i,jm1,2] - 4u[i,j,2]) +
                    A*u[i,j,1] - u[i,j,1]^2*u[i,j,2]
    end
end
p = (3.4, 1., 10., step(xyd_brusselator))
function init_brusselator_2d(xyd)
    N = length(xyd)
    u = zeros(N, N, 2)
    for I in CartesianIndices((N, N))
        x = xyd[I[1]]
        y = xyd[I[2]]
        u[I,1] = 22*(y*(1-y))^(3/2)
        u[I,2] = 27*(x*(1-x))^(3/2)
    end
    u
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d_loop, u0, (0.0, 11.5), p)
using Symbolics
du0 = copy(u0)
jac = Symbolics.jacobian_sparsity((du,u)->brusselator_2d_loop(du,u,p,0.0),du0,u0)
prob_ode_brusselator_2d_sparse = ODEProblem(ODEFunction(brusselator_2d_loop, jac_prototype = float.(jac)),
                                            u0, (0.0, 11.5), p)
using ModelingToolkit
prob_ode_brusselator_2d_mtk = ODEProblem(modelingtoolkitize(prob_ode_brusselator_2d_sparse),[],(0.0,11.5),jac=true,sparse=true);
using IncompleteLU
Base.eltype(::IncompleteLU.ILUFactorization{Tv,Ti}) where {Tv,Ti} = Tv
# `precs` functions receive W (≈ I - gamma*J here) and return the (left, right)
# preconditioner pair; rebuild only when `newW` says W was recomputed.
function incompletelu(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        Pl = ilu(convert(AbstractMatrix, W), τ = 50.0)
    else
        Pl = Plprev
    end
    Pl, nothing
end
using AlgebraicMultigrid
function algebraicmultigrid(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        Pl = aspreconditioner(ruge_stuben(convert(AbstractMatrix, W)))
    else
        Pl = Plprev
    end
    Pl, nothing
end
function algebraicmultigrid2(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        A = convert(AbstractMatrix, W)
        Pl = aspreconditioner(ruge_stuben(A,
            presmoother = AlgebraicMultigrid.Jacobi(rand(size(A,1))),
            postsmoother = AlgebraicMultigrid.Jacobi(rand(size(A,1)))))
    else
        Pl = Plprev
    end
    Pl, nothing
end
using Sundials, LinearAlgebra
u0 = prob_ode_brusselator_2d_mtk.u0
p = prob_ode_brusselator_2d_mtk.p
const jaccache = prob_ode_brusselator_2d_mtk.f.jac(u0,p,0.0)
const W = I - 1.0*jaccache
prectmp = ilu(W, τ = 50.0)
const preccache = Ref(prectmp)
function psetupilu(p, t, u, du, jok, jcurPtr, gamma)
    if jok
        prob_ode_brusselator_2d_mtk.f.jac(jaccache, u, p, t)
        jcurPtr[] = true
        # W = I - gamma*J
        @. W = -gamma*jaccache
        idxs = diagind(W)
        @. @view(W[idxs]) = @view(W[idxs]) + 1
        # Build preconditioner on W
        preccache[] = ilu(W, τ = 5.0)
    end
end
function precilu(z, r, p, t, y, fy, gamma, delta, lr)
    ldiv!(z, preccache[], r)  # apply the cached ILU factorization
end
prectmp2 = aspreconditioner(ruge_stuben(W,
    presmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1))),
    postsmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1)))))
const preccache3 = Ref(prectmp2)
function psetupamg(p, t, u, du, jok, jcurPtr, gamma)
    if jok
        prob_ode_brusselator_2d_mtk.f.jac(jaccache, u, p, t)
        jcurPtr[] = true
        # W = I - gamma*J
        @. W = -gamma*jaccache
        idxs = diagind(W)
        @. @view(W[idxs]) = @view(W[idxs]) + 1
        # Build preconditioner on W
        preccache3[] = aspreconditioner(ruge_stuben(W,
            presmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1))),
            postsmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1)))))
    end
end
function precamg(z, r, p, t, y, fy, gamma, delta, lr)
    ldiv!(z, preccache3[], r)
end
# No Preconditioning
@time solve(prob_ode_brusselator_2d,KenCarp47(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 0.715011 seconds (172.53 k allocations: 30.978 MiB)
@time solve(prob_ode_brusselator_2d_sparse,CVODE_BDF(linear_solver=:GMRES),save_everystep=false);
# 0.212607 seconds (58.38 k allocations: 2.846 MiB)
# ILU
@time solve(prob_ode_brusselator_2d_sparse,KenCarp47(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),save_everystep=false);
# 0.178704 seconds (61.76 k allocations: 61.385 MiB)
@time solve(prob_ode_brusselator_2d_sparse,CVODE_BDF(linear_solver=:GMRES,prec=precilu,psetup=psetupilu,prec_side=1),save_everystep=false);
# 0.095955 seconds (17.72 k allocations: 77.079 MiB)
# AMG
@time solve(prob_ode_brusselator_2d_sparse,KenCarp47(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid2,concrete_jac=true),save_everystep=false);
# 0.285271 seconds (65.71 k allocations: 170.192 MiB, 2.49% gc time)
@time solve(prob_ode_brusselator_2d_sparse,CVODE_BDF(linear_solver=:GMRES,prec=precamg,psetup=psetupamg,prec_side=1),save_everystep=false);
# 0.146130 seconds (30.68 k allocations: 275.589 MiB, 7.51% gc time)
Sundials.jl is still slightly ahead of OrdinaryDiffEq here, but within 2x once preconditioners are applied (and preconditioners are a hell of a lot easier to define in OrdinaryDiffEq, so in some sense it's ahead). The non-preconditioned case still shows a ~3.5x difference though, which points to some missing optimizations in Krylov.jl, and even the preconditioners could use some optimizations from what I can tell.
@chriselrod we can continue the discussion from https://github.com/SciML/OrdinaryDiffEq.jl/issues/1551#issuecomment-1008334603 here. The example above highlights the performance vs Sundials for each of the cases. The ILU case is the one that is limited by the forward and backward substitutions. The Krylov.jl case without preconditioners is the one that is a bit BLAS fishy. As for the AMG case, I don't know why it is so slow without Jacobi smoothing (precs = algebraicmultigrid); that seems to be due to the Gauss-Seidel implementation. So there's a lot of code optimization to be done here.
The GMRES method of Krylov.jl is slower than the GMRES of IterativeSolvers.jl?
No, it's like an order of magnitude faster than the IterativeSolvers.jl one. IterativeSolvers.jl isn't even in the running.
To optimize the performance of Krylov methods, you could use MKLSparse.jl. Matrix-vector products are multi-threaded (https://juliasmoothoptimizers.github.io/Krylov.jl/dev/tips/).
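If I understand MKLSparse.jl correctly, loading it should be enough, since it overloads the sparse mat-vec products that Krylov.jl already calls; a minimal sketch (assuming MKLSparse is installed):
using SparseArrays, LinearAlgebra
using MKLSparse  # loading it redirects SparseMatrixCSC products to MKL's threaded kernels

A = sprand(10_000, 10_000, 1e-3)
x = rand(10_000)
y = similar(x)
mul!(y, A, x)  # this mul! should now hit MKL's multithreaded SpMV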
https://github.com/SciML/OrdinaryDiffEq.jl/issues/1551#issuecomment-1008235901 has a bunch of examples. IterativeSolvers.jl is just aggressively bad here and I haven't had time to look into it, but it was far enough away that I wasn't going to spend anymore time with it. Krylov.jl is much better, but I did see a few things:
https://github.com/SciML/OrdinaryDiffEq.jl/issues/1551#issuecomment-1008237675
1/3 of the time was in the f call, and I think we could get that closer to 1. It was spending a lot of time in the dot operation, but it's known that the OpenBLAS defaults tend to give you a very slow dot, slower than just using a pure Julia one. That was 1/3 of the time, so I think it might be hitting some of that, and a manual SIMD implementation might be better. Of course, if someone is using MKL that isn't quite the case, so it should probably check the vendor, etc. Another 1/3 was in the forward and backward substitutions in IncompleteLU.jl. I know @chriselrod did a bunch of similar optimizations before in https://github.com/JuliaSIMD/TriangularSolve.jl, so that preconditioner implementation might be able to be accelerated, but that's separate from the Krylov.jl time.
@time solve(prob_ode_brusselator_2d,Rosenbrock23(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 0.891711 seconds (230.15 k allocations: 184.457 MiB, 5.10% gc time)
@time solve(prob_ode_brusselator_2d,Rosenbrock23(linsolve=IterativeSolversJL_GMRES()),save_everystep=false);
# 6.231675 seconds (1.37 M allocations: 206.219 MiB, 0.28% gc time)
@time solve(prob_ode_brusselator_2d,Rodas4(linsolve=IterativeSolversJL_GMRES()),save_everystep=false);
# 8.095367 seconds (1.82 M allocations: 296.098 MiB, 0.38% gc time)
@time solve(prob_ode_brusselator_2d,Rodas4(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 1.215097 seconds (317.01 k allocations: 152.230 MiB, 3.26% gc time)
I will do some tests with a pure Julia dot; we also have this issue with the CUDA dot implementation for solving linear systems on GPU. If only the MKL version is efficient, I can easily change the dispatch to check the BLAS vendor and use a pure Julia dot in our Krylov macros (https://github.com/JuliaSmoothOptimizers/Krylov.jl/blob/main/src/krylov_utils.jl#L203-L220).
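A minimal sketch of what that vendor check could look like (kdot and mkl_loaded are hypothetical names, not Krylov.jl's actual code; the query uses the libblastrampoline API available since Julia 1.7):
using LinearAlgebra

# One way to check whether MKL is the loaded BLAS backend
mkl_loaded() = any(lib -> occursin("mkl", lowercase(basename(lib.libname))),
                   BLAS.get_config().loaded_libs)

# Vendor-dispatched dot: BLAS when MKL is loaded, otherwise a pure-Julia SIMD loop
function kdot(n::Integer, x::Vector{T}, y::Vector{T}) where T <: BLAS.BlasReal
    mkl_loaded() && return BLAS.dot(n, x, 1, y, 1)
    s = zero(T)
    @inbounds @simd for i in 1:n
        s += x[i] * y[i]
    end
    return s
end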
For GPU problems, broadcast is sometimes more efficient than CUBLAS routines, but I never checked on linear systems stored on CPU.
Specifically, this is the issue that can really hurt users of default Julia installations: https://github.com/JuliaLang/julia/issues/33409
You opened the issue in 2019 and it's still not fixed?! :disappointed: I will do some benchmarks with MKL and OpenBLAS ASAP.
I did some benchmarks with MKL and OpenBLAS for dot products. The relative difference between them is small.
┌───────────┬────────────────────┬────────────────────┬─────────┐
│ Dimension │ OpenBLAS 4 threads │ OpenBLAS 2 threads │ MKL │
├───────────┼────────────────────┼────────────────────┼─────────┤
│ 10.0 │ 1.02666 │ 1.0 │ 1.38784 │
│ 100.0 │ 1.01277 │ 1.0 │ 1.00692 │
│ 1000.0 │ 1.04922 │ 1.04665 │ 1.0 │
│ 10000.0 │ 1.13293 │ 1.10058 │ 1.0 │
│ 100000.0 │ 1.10543 │ 1.10557 │ 1.0 │
│ 1.0e6 │ 1.03908 │ 1.0 │ 1.00173 │
│ 1.0e7 │ 1.02159 │ 1.05796 │ 1.0 │
└───────────┴────────────────────┴────────────────────┴─────────┘
Code used for the benchmarks:
using BenchmarkTools, LinearAlgebra

krylov_dot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasReal = BLAS.dot(n, x, dx, y, dy)

p = 7
time_openblas1 = zeros(p)
time_openblas2 = zeros(p)
time_mkl = zeros(p)

# OpenBLAS with all threads
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    @btime for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas1[i] = @belapsed for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
end

# OpenBLAS with half the threads
NMAX = Sys.CPU_THREADS
N = Int(NMAX / 2)
BLAS.set_num_threads(N)
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    @btime for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas2[i] = @belapsed for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
end

# MKL
using MKL
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    @btime for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_mkl[i] = @belapsed for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
end

using PrettyTables
dimension = [10^i for i = 1:p]
time = hcat(dimension, time_openblas1, time_openblas2, time_mkl)
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL"])
# Normalize each row by its fastest entry
for i = 1:p
    val = min(time[i,2], time[i,3], time[i,4])
    time[i,2] /= val
    time[i,3] /= val
    time[i,4] /= val
end
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL"])
Do we have an efficient Julia implementation of dot somewhere to easily compare with the BLAS versions?
function dot_julia(x, y)
    s = zero(Base.promote_eltype(x, y))
    @fastmath for i in eachindex(x, y)
        s += x[i]' * y[i]
    end
    return s
end
is fairly good, but will probably fall behind the multithreaded BLAS versions at large sizes on x86 CPUs, because those CPUs tend to get better total memory bandwidth when more than one core is active.
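For reference, bringing more cores' bandwidth to bear would need something like a chunked threaded variant (dot_threaded is a hypothetical name, assuming 1-based Vector inputs):
function dot_threaded(x, y)
    nt = Threads.nthreads()
    partials = zeros(Base.promote_eltype(x, y), nt)
    Threads.@threads for t in 1:nt
        # split 1:length(x) into nt contiguous, non-overlapping chunks
        lo = (t - 1) * length(x) ÷ nt + 1
        hi = t * length(x) ÷ nt
        s = zero(eltype(partials))
        @inbounds @simd for i in lo:hi
            s += x[i]' * y[i]
        end
        partials[t] = s
    end
    return sum(partials)
end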
┌───────────┬────────────────────┬────────────────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 4 threads │ OpenBLAS 2 threads │ MKL │ Julia │
├───────────┼────────────────────┼────────────────────┼─────────┼─────────┤
│ 10.0 │ 1.13365 │ 1.13204 │ 1.13028 │ 1.0 │
│ 100.0 │ 1.01151 │ 1.00737 │ 1.0 │ 1.62794 │
│ 1000.0 │ 1.01729 │ 1.01326 │ 1.0 │ 4.57984 │
│ 10000.0 │ 1.01232 │ 1.00192 │ 1.0 │ 3.5687 │
│ 100000.0 │ 1.10966 │ 1.08058 │ 1.0 │ 4.33227 │
│ 1.0e6 │ 1.02231 │ 1.02389 │ 1.0 │ 1.03161 │
│ 1.0e7 │ 1.0632 │ 1.11198 │ 1.06174 │ 1.0 │
└───────────┴────────────────────┴────────────────────┴─────────┴─────────┘
It definitely depends on the CPU.
┌───────────┬─────────────────────┬─────────────────────┬───────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │
├───────────┼─────────────────────┼─────────────────────┼───────────┤
│ 10.0 │ 7.95e-5 │ 8.2e-5 │ 1.1e-5 │
│ 100.0 │ 6.31e-5 │ 6.22e-5 │ 2.06e-5 │
│ 1000.0 │ 0.0001188 │ 0.0001227 │ 0.0001226 │
│ 10000.0 │ 0.0011194 │ 0.001119 │ 0.0007435 │
│ 100000.0 │ 0.0421721 │ 0.0706043 │ 0.0024566 │
│ 1.0e6 │ 0.0681789 │ 0.0977464 │ 0.0242952 │
│ 1.0e7 │ 3.81686 │ 3.74455 │ 3.88031 │
└───────────┴─────────────────────┴─────────────────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │
├───────────┼─────────────────────┼─────────────────────┼─────────┤
│ 10.0 │ 7.22727 │ 7.45455 │ 1.0 │
│ 100.0 │ 3.06311 │ 3.01942 │ 1.0 │
│ 1000.0 │ 1.0 │ 1.03283 │ 1.03199 │
│ 10000.0 │ 1.50558 │ 1.50504 │ 1.0 │
│ 100000.0 │ 17.1669 │ 28.7407 │ 1.0 │
│ 1.0e6 │ 2.80627 │ 4.02328 │ 1.0 │
│ 1.0e7 │ 1.01931 │ 1.0 │ 1.03626 │
└───────────┴─────────────────────┴─────────────────────┴─────────┘
julia> versioninfo()
Julia Version 1.7.0
Commit 3bf9d17731 (2021-11-30 12:12 UTC)
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: AMD Ryzen 9 5950X 16-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, znver3)
Environment:
JULIA_EDITOR = "C:\Users\accou\AppData\Local\atom\app-1.58.0\atom.exe" -a
JULIA_NUM_THREADS = 32
┌───────────┬─────────────────────┬─────────────────────┬───────────┬───────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │
├───────────┼─────────────────────┼─────────────────────┼───────────┼───────────┤
│ 10.0 │ 7.95e-5 │ 8.2e-5 │ 1.1e-5 │ 3.3125e-6 │
│ 100.0 │ 6.31e-5 │ 6.22e-5 │ 2.06e-5 │ 3.45e-5 │
│ 1000.0 │ 0.0001188 │ 0.0001227 │ 0.0001226 │ 0.0003104 │
│ 10000.0 │ 0.0011194 │ 0.001119 │ 0.0007435 │ 0.003095 │
│ 100000.0 │ 0.0421721 │ 0.0706043 │ 0.0024566 │ 0.0313813 │
│ 1.0e6 │ 0.0681789 │ 0.0977464 │ 0.0242952 │ 0.315843 │
│ 1.0e7 │ 3.81686 │ 3.74455 │ 3.88031 │ 3.15861 │
└───────────┴─────────────────────┴─────────────────────┴───────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │
├───────────┼─────────────────────┼─────────────────────┼─────────┼─────────┤
│ 10.0 │ 24.0 │ 24.7547 │ 3.32075 │ 1.0 │
│ 100.0 │ 3.06311 │ 3.01942 │ 1.0 │ 1.67476 │
│ 1000.0 │ 1.0 │ 1.03283 │ 1.03199 │ 2.61279 │
│ 10000.0 │ 1.50558 │ 1.50504 │ 1.0 │ 4.16274 │
│ 100000.0 │ 17.1669 │ 28.7407 │ 1.0 │ 12.7743 │
│ 1.0e6 │ 2.80627 │ 4.02328 │ 1.0 │ 13.0002 │
│ 1.0e7 │ 1.2084 │ 1.18551 │ 1.22849 │ 1.0 │
└───────────┴─────────────────────┴─────────────────────┴─────────┴─────────┘
MKL outperforms OpenBLAS and the Julia implementation for dot products. I did other benchmarks yesterday with complex numbers for scal!, axpy!, and axpby!, and MKL surpasses the other versions too (https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/481#issuecomment-1021935428, https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/481#issuecomment-1022288383).
Updated script with some fixes:
using BenchmarkTools, LinearAlgebra, LoopVectorization

# `@noinline`, otherwise the compiler optimizes it away
@noinline function dot_julia(x, y)
    s = zero(Base.promote_eltype(x, y))
    # I thought Julia's bound check elimination pass would be able to remove the
    # bounds checks because of the `eachindex`, but it doesn't, hence the `@inbounds`.
    @fastmath for i in eachindex(x, y)
        @inbounds s += x[i]' * y[i]
    end
    return s
end

function dot_tturbo(x, y)
    s = zero(Base.promote_eltype(x, y))
    @tturbo for i in eachindex(x, y)
        s += x[i] * y[i]
    end
    return s
end

krylov_dot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasReal = BLAS.dot(n, x, dx, y, dy)

p = 7
time_openblas1 = zeros(p);
time_openblas2 = zeros(p);
time_mkl = zeros(p);
time_julia = zeros(p);
time_tturbo = zeros(p);

# Report the minimum time (and allocations) of a benchmark run
function belapsed_time(br)
    mintime = BenchmarkTools.time(minimum(br))
    allocs = BenchmarkTools.allocs(br)
    println("  ", BenchmarkTools.prettytime(mintime), " (", allocs,
            " allocation", allocs == 1 ? "" : "s", ": ",
            BenchmarkTools.prettymemory(BenchmarkTools.memory(br)), ")")
    return mintime
end

for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    b = @benchmark for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas1[i] = belapsed_time(b)
end

NMAX = Sys.CPU_THREADS
N = Int(NMAX / 2)
BLAS.set_num_threads(N)
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    b = @benchmark for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas2[i] = belapsed_time(b)
end

using MKL
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    b = @benchmark for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_mkl[i] = belapsed_time(b)
end

for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    b = @benchmark for k in 1:1000 dot_julia($x, $y) end
    time_julia[i] = belapsed_time(b)
end

for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    b = @benchmark for k in 1:1000 dot_tturbo($x, $y) end
    time_tturbo[i] = belapsed_time(b)
end

using PrettyTables
dimension = [10^i for i = 1:p];
time = hcat(dimension, time_openblas1, time_openblas2, time_mkl, time_julia, time_tturbo)
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL", "Julia", "@tturbo"])
# Normalize each row by its fastest entry
for i = 1:p
    v = @view(time[i, 2:end])
    v ./= minimum(v)
end
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL", "Julia", "@tturbo"])
versioninfo()
I get:
┌───────────┬─────────────────────┬─────────────────────┬───────────┬───────────┬───────────┐
│ Dimension │ OpenBLAS 36 threads │ OpenBLAS 18 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼───────────┼───────────┼───────────┤
│ 10.0 │ 8624.33 │ 8708.33 │ 14244.0 │ 8806.0 │ 9252.0 │
│ 100.0 │ 12004.0 │ 11990.0 │ 15962.0 │ 12749.0 │ 12477.0 │
│ 1000.0 │ 55191.0 │ 55050.0 │ 54769.0 │ 55958.0 │ 50141.0 │
│ 10000.0 │ 905247.0 │ 907152.0 │ 1.33786e6 │ 916908.0 │ 921683.0 │
│ 100000.0 │ 4.74124e6 │ 4.11024e6 │ 2.74366e6 │ 5.47155e7 │ 3.47289e6 │
│ 1.0e6 │ 9.28879e7 │ 1.85967e7 │ 1.6299e7 │ 6.25422e8 │ 6.52952e7 │
│ 1.0e7 │ 2.13776e9 │ 2.08145e9 │ 2.08633e9 │ 1.2057e10 │ 2.08374e9 │
└───────────┴─────────────────────┴─────────────────────┴───────────┴───────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 36 threads │ OpenBLAS 18 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼─────────┼─────────┼─────────┤
│ 10.0 │ 1.0 │ 1.00974 │ 1.65161 │ 1.02106 │ 1.07278 │
│ 100.0 │ 1.00117 │ 1.0 │ 1.33128 │ 1.0633 │ 1.04062 │
│ 1000.0 │ 1.10072 │ 1.0979 │ 1.0923 │ 1.11601 │ 1.0 │
│ 10000.0 │ 1.0 │ 1.0021 │ 1.4779 │ 1.01288 │ 1.01816 │
│ 100000.0 │ 1.72807 │ 1.49809 │ 1.0 │ 19.9425 │ 1.26579 │
│ 1.0e6 │ 5.69899 │ 1.14097 │ 1.0 │ 38.3718 │ 4.00608 │
│ 1.0e7 │ 1.02705 │ 1.0 │ 1.00235 │ 5.79261 │ 1.0011 │
└───────────┴─────────────────────┴─────────────────────┴─────────┴─────────┴─────────┘
julia> versioninfo()
Julia Version 1.7.2-pre.0
Commit 3f024fd0ab* (2021-12-23 18:27 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
@tturbo isn't using that many threads for dot, which may be why it is falling behind for 10^6.
If it were to use all 18 cores, it would have enough L2 cache to fit:
julia> using CPUSummary
julia> CPUSummary.cache_size(Val(2)) * CPUSummary.num_l2cache() / sizeof(Float64)
2.359296e6
just under 2.36 million Float64 values in its combined L2 caches. The 1e6 benchmark requires 2e6 Float64s (two length-10^6 vectors), so if it were to use all 18 cores, it could keep the vectors in L2 cache, and we'd be benchmarking L2 -> core streaming performance.
Instead, @tturbo is only using 10 threads at that size. This doesn't fit in the L2 cache of 10 cores, meaning we're streaming memory from L3, and fewer cores are doing the streaming.
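Putting numbers on that (simple arithmetic from the capacity quoted above):
julia> 2 * 10^6 / 2.359296e6   # two length-1e6 vectors vs. all 18 cores' L2, in Float64s
0.8477105034722222

julia> 2.359296e6 * 10 / 18    # L2 capacity of only 10 cores, in Float64s
1.31072e6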
I could make LV consider this in deciding how many cores to use...
I should probably remove the `for k in 1:1000` loops in the above script.
@amontoison what CPU are you using? Maybe part of it is that OpenBLAS is just exceedingly awful on Ryzen, while MKL is just slacking on non-Intel chips.
┌───────────┬─────────────────────┬─────────────────────┬───────────┬───────────┬───────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼───────────┼───────────┼───────────┤
│ 10.0 │ 75100.0 │ 80600.0 │ 10800.0 │ 7050.0 │ 8733.33 │
│ 100.0 │ 68200.0 │ 67300.0 │ 20200.0 │ 10200.0 │ 13000.0 │
│ 1000.0 │ 117800.0 │ 121400.0 │ 124800.0 │ 60100.0 │ 60200.0 │
│ 10000.0 │ 1.1182e6 │ 1.1215e6 │ 749100.0 │ 1.0459e6 │ 642200.0 │
│ 100000.0 │ 4.62663e7 │ 9.06275e7 │ 2.5587e6 │ 1.21059e7 │ 2.0268e6 │
│ 1.0e6 │ 7.3046e7 │ 1.04869e8 │ 2.50255e7 │ 1.23583e8 │ 2.11222e7 │
│ 1.0e7 │ 3.53264e9 │ 3.69242e9 │ 3.88145e9 │ 4.29649e9 │ 3.70713e9 │
└───────────┴─────────────────────┴─────────────────────┴───────────┴───────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼─────────┼─────────┼─────────┤
│ 10.0 │ 10.6525 │ 11.4326 │ 1.53191 │ 1.0 │ 1.23877 │
│ 100.0 │ 6.68627 │ 6.59804 │ 1.98039 │ 1.0 │ 1.27451 │
│ 1000.0 │ 1.96007 │ 2.01997 │ 2.07654 │ 1.0 │ 1.00166 │
│ 10000.0 │ 1.7412 │ 1.74634 │ 1.16646 │ 1.62862 │ 1.0 │
│ 100000.0 │ 22.8273 │ 44.7146 │ 1.26243 │ 5.97291 │ 1.0 │
│ 1.0e6 │ 3.45826 │ 4.96486 │ 1.1848 │ 5.85086 │ 1.0 │
│ 1.0e7 │ 1.0 │ 1.04523 │ 1.09874 │ 1.21623 │ 1.04939 │
└───────────┴─────────────────────┴─────────────────────┴─────────┴─────────┴─────────┘
julia> versioninfo()
Julia Version 1.7.0
Commit 3bf9d17731 (2021-11-30 12:12 UTC)
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: AMD Ryzen 9 5950X 16-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, znver3)
Environment:
JULIA_EDITOR = "C:\Users\accou\AppData\Local\atom\app-1.58.0\atom.exe" -a
JULIA_NUM_THREADS = 32
Maybe part of it is that OpenBLAS is just exceedingly awful on Ryzen
If you were on Julia 1.6, this would be the case: Julia 1.6.5 used OpenBLAS 0.3.10 (https://github.com/JuliaLang/julia/blob/v1.6.5/deps/openblas.version), while it was 0.3.11 that added Zen3 support/detection (https://github.com/xianyi/OpenBLAS/releases/tag/v0.3.11) with this commit: https://github.com/xianyi/OpenBLAS/commit/200f5c44cc14f356d7dba6af257044016a0573da. But Julia 1.7 is using 0.3.13 (https://github.com/JuliaLang/julia/blob/v1.7.1/deps/openblas.version), so it should be fine.
If you don't mind building from source, you could try something like this in your Make.user:
USE_BINARYBUILDER_OPENBLAS=0
OPENBLAS_TARGET_ARCH=HASWELL
to see if pretending to be some other CPU makes OpenBLAS better.
@ChrisRackauckas I have an Intel i5-6200U (Skylake architecture)
Julia Version 1.7.1
Commit ac5cc99908 (2021-12-22 19:35 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, skylake)
Environment:
JULIA_PARDISO = /home/alexis/Applications/PARDISO
JULIA_NUM_THREADS = 4
The SpMVs of SuiteSparse:GraphBLAS can be 3+ times faster than MKLSparse (and probably even better on Ryzen, although that's untested). What do I need to do to use the SpMV from SuiteSparseGraphBLAS.jl in Krylov? I'd prefer not to commit type piracy like MKLSparse.jl does unless I have to.
We could change all mul! calls into @kmul! and then add a special dispatch to the SpMVs of GraphBLAS when they can be used here. We do that for all BLAS 1 operations (scal!, axpy!, axpby!, etc.). For instance, the generic @kaxpby! was based on broadcasting in the past to have an efficient version for GPUs; I wanted to avoid the generic Julia implementation of axpby!.
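A minimal sketch of that indirection (kmul! is a hypothetical name; the wiring shown is an assumption, not Krylov.jl's actual code):
using LinearAlgebra

# Krylov.jl would route every product through its own function,
# with the generic mul! as the default...
kmul!(y, A, x) = mul!(y, A, x)

# ...so another package can redirect a given matrix type to its faster SpMV by
# overloading kmul! (a function Krylov.jl owns) instead of pirating Base's mul!,
# e.g. by wrapping a SparseMatrixCSC in a GraphBLAS-backed type up front and
# defining kmul! for that wrapper.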
@Wimmerer I created a branch for you: https://github.com/JuliaSmoothOptimizers/Krylov.jl/tree/spmv_graphblas. You just need to add your methods there. :smiley: If you open a PR, I can easily use our JSOBot to run the benchmarks and see the improvements (https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/456).
Can you print the # of Krylov iterations? I'd like to make sure all solvers are doing the same # of iterations.
https://github.com/SciML/OrdinaryDiffEq.jl/issues/1571
@vpuri3 Is it related to this issue?
#1571 is about OrdinaryDiffEq passing the verbose argument to LinearSolve to print Krylov solve stats.
Can you print the # of Krylov iterations? I'd like to make sure all solvers are doing the same # of iterations.
You can check sol.destats at the end to see the number of f calls.
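For example, with the ILU-preconditioned KenCarp47 solve from above:
sol = solve(prob_ode_brusselator_2d_sparse,
            KenCarp47(linsolve = KrylovJL_GMRES(), precs = incompletelu, concrete_jac = true),
            save_everystep = false)
sol.destats  # DEStats: number of f evaluations (nf), linear solves (nsolve), Jacobians (njacs), ...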