OrdinaryDiffEq.jl
Krylov.jl optimizations for ODE usage
MWE of preconditioned Newton-Krylov solves with OrdinaryDiffEq and Sundials:
using OrdinaryDiffEq, LinearSolve, Test
const N = 32
const xyd_brusselator = range(0,stop=1,length=N)
brusselator_f(x, y, t) = (((x-0.3)^2 + (y-0.6)^2) <= 0.1^2) * (t >= 1.1) * 5.0
limit(a, N) = a == N+1 ? 1 : a == 0 ? N : a  # wrap stencil indices (periodic BCs)
function brusselator_2d_loop(du, u, p, t)
    A, B, alpha, dx = p
    alpha = alpha/dx^2
    @inbounds for I in CartesianIndices((N, N))
        i, j = Tuple(I)
        x, y = xyd_brusselator[I[1]], xyd_brusselator[I[2]]
        ip1, im1, jp1, jm1 = limit(i+1, N), limit(i-1, N), limit(j+1, N), limit(j-1, N)
        du[i,j,1] = alpha*(u[im1,j,1] + u[ip1,j,1] + u[i,jp1,1] + u[i,jm1,1] - 4u[i,j,1]) +
                    B + u[i,j,1]^2*u[i,j,2] - (A + 1)*u[i,j,1] + brusselator_f(x, y, t)
        du[i,j,2] = alpha*(u[im1,j,2] + u[ip1,j,2] + u[i,jp1,2] + u[i,jm1,2] - 4u[i,j,2]) +
                    A*u[i,j,1] - u[i,j,1]^2*u[i,j,2]
    end
end
p = (3.4, 1., 10., step(xyd_brusselator))
function init_brusselator_2d(xyd)
    N = length(xyd)
    u = zeros(N, N, 2)
    for I in CartesianIndices((N, N))
        x = xyd[I[1]]
        y = xyd[I[2]]
        u[I,1] = 22*(y*(1-y))^(3/2)
        u[I,2] = 27*(x*(1-x))^(3/2)
    end
    u
end
u0 = init_brusselator_2d(xyd_brusselator)
prob_ode_brusselator_2d = ODEProblem(brusselator_2d_loop, u0, (0.0, 11.5), p)
using Symbolics
du0 = copy(u0)
jac = Symbolics.jacobian_sparsity((du,u)->brusselator_2d_loop(du,u,p,0.0),du0,u0)
prob_ode_brusselator_2d_sparse = ODEProblem(ODEFunction(brusselator_2d_loop, jac_prototype = float.(jac)),
                                            u0, (0.0, 11.5), p)
using ModelingToolkit
prob_ode_brusselator_2d_mtk = ODEProblem(modelingtoolkitize(prob_ode_brusselator_2d_sparse),[],(0.0,11.5),jac=true,sparse=true);
using IncompleteLU
Base.eltype(::IncompleteLU.ILUFactorization{Tv,Ti}) where {Tv,Ti} = Tv
# `precs` functions receive W (≈ I - gamma*J here) and return the (left, right)
# preconditioner pair; rebuild only when `newW` says W was recomputed.
function incompletelu(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        Pl = ilu(convert(AbstractMatrix, W), τ = 50.0)
    else
        Pl = Plprev
    end
    Pl, nothing
end
using AlgebraicMultigrid
function algebraicmultigrid(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        Pl = aspreconditioner(ruge_stuben(convert(AbstractMatrix, W)))
    else
        Pl = Plprev
    end
    Pl, nothing
end
function algebraicmultigrid2(W, du, u, p, t, newW, Plprev, Prprev, solverdata)
    if newW === nothing || newW
        A = convert(AbstractMatrix, W)
        Pl = aspreconditioner(ruge_stuben(A,
            presmoother = AlgebraicMultigrid.Jacobi(rand(size(A,1))),
            postsmoother = AlgebraicMultigrid.Jacobi(rand(size(A,1)))))
    else
        Pl = Plprev
    end
    Pl, nothing
end
using Sundials, LinearAlgebra
u0 = prob_ode_brusselator_2d_mtk.u0
p = prob_ode_brusselator_2d_mtk.p
const jaccache = prob_ode_brusselator_2d_mtk.f.jac(u0,p,0.0)
const W = I - 1.0*jaccache
prectmp = ilu(W, τ = 50.0)
const preccache = Ref(prectmp)
function psetupilu(p, t, u, du, jok, jcurPtr, gamma)
    if jok
        prob_ode_brusselator_2d_mtk.f.jac(jaccache, u, p, t)
        jcurPtr[] = true
        # W = I - gamma*J
        @. W = -gamma*jaccache
        idxs = diagind(W)
        @. @view(W[idxs]) = @view(W[idxs]) + 1
        # Build preconditioner on W
        preccache[] = ilu(W, τ = 5.0)
    end
end
function precilu(z, r, p, t, y, fy, gamma, delta, lr)
    ldiv!(z, preccache[], r)  # apply the cached ILU factorization
end
prectmp2 = aspreconditioner(ruge_stuben(W,
    presmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1))),
    postsmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1)))))
const preccache3 = Ref(prectmp2)
function psetupamg(p, t, u, du, jok, jcurPtr, gamma)
    if jok
        prob_ode_brusselator_2d_mtk.f.jac(jaccache, u, p, t)
        jcurPtr[] = true
        # W = I - gamma*J
        @. W = -gamma*jaccache
        idxs = diagind(W)
        @. @view(W[idxs]) = @view(W[idxs]) + 1
        # Build preconditioner on W
        preccache3[] = aspreconditioner(ruge_stuben(W,
            presmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1))),
            postsmoother = AlgebraicMultigrid.Jacobi(rand(size(W,1)))))
    end
end
function precamg(z, r, p, t, y, fy, gamma, delta, lr)
    ldiv!(z, preccache3[], r)
end
# No Preconditioning
@time solve(prob_ode_brusselator_2d,KenCarp47(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 0.715011 seconds (172.53 k allocations: 30.978 MiB)
@time solve(prob_ode_brusselator_2d_sparse,CVODE_BDF(linear_solver=:GMRES),save_everystep=false);
# 0.212607 seconds (58.38 k allocations: 2.846 MiB)
# ILU
@time solve(prob_ode_brusselator_2d_sparse,KenCarp47(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),save_everystep=false);
# 0.178704 seconds (61.76 k allocations: 61.385 MiB)
@time solve(prob_ode_brusselator_2d_sparse,CVODE_BDF(linear_solver=:GMRES,prec=precilu,psetup=psetupilu,prec_side=1),save_everystep=false);
# 0.095955 seconds (17.72 k allocations: 77.079 MiB)
# AMG
@time solve(prob_ode_brusselator_2d_sparse,KenCarp47(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid2,concrete_jac=true),save_everystep=false);
# 0.285271 seconds (65.71 k allocations: 170.192 MiB, 2.49% gc time)
@time solve(prob_ode_brusselator_2d_sparse,CVODE_BDF(linear_solver=:GMRES,prec=precamg,psetup=psetupamg,prec_side=1),save_everystep=false);
# 0.146130 seconds (30.68 k allocations: 275.589 MiB, 7.51% gc time)
Sundials.jl is still slightly ahead of OrdinaryDiffEq here, but within 2x once preconditioners are applied (and preconditioners are a hell of a lot easier to define in OrdinaryDiffEq, so in some sense it's ahead). The non-preconditioned case still shows a ~3.5x difference though, which points to some missing optimizations in Krylov.jl, and even the preconditioners could use some optimizations from what I can tell.
@chriselrod we can continue the discussion from https://github.com/SciML/OrdinaryDiffEq.jl/issues/1551#issuecomment-1008334603 here. The example above highlights the performance vs Sundials for each of the cases. The ILU case is the one that is limited by the forward and backward substitutions. The Krylov.jl case without preconditioners is the one that is a bit BLAS fishy. As for the AMG case, I don't know why it is so slow without Jacobi smoothing (precs = algebraicmultigrid); that seems to be due to the Gauss-Seidel implementation. So there's a lot of code optimization to be done here.
The GMRES method of Krylov.jl is slower than the GMRES of IterativeSolvers.jl?
No, it's like an order of magnitude faster than the IterativeSolvers.jl one. IterativeSolvers.jl isn't even in the running.
To optimize the performance of Krylov methods, you could use MKLSparse.jl. Matrix-vector products are multi-threaded (https://juliasmoothoptimizers.github.io/Krylov.jl/dev/tips/).
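If I understand MKLSparse.jl correctly, loading it should be enough, since it overloads the sparse mat-vec products that Krylov.jl already calls; a minimal sketch (assuming MKLSparse is installed):
using SparseArrays, LinearAlgebra
using MKLSparse  # loading it redirects SparseMatrixCSC products to MKL's threaded kernels

A = sprand(10_000, 10_000, 1e-3)
x = rand(10_000)
y = similar(x)
mul!(y, A, x)  # this mul! should now hit MKL's multithreaded SpMV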
https://github.com/SciML/OrdinaryDiffEq.jl/issues/1551#issuecomment-1008235901 has a bunch of examples. IterativeSolvers.jl is just aggressively bad here and I haven't had time to look into it, but it was far enough away that I wasn't going to spend anymore time with it. Krylov.jl is much better, but I did see a few things:
https://github.com/SciML/OrdinaryDiffEq.jl/issues/1551#issuecomment-1008237675
1/3 of the time was in the f call, and I think we could get that closer to 1. It was spending a lot of time in the dot operation, but it's known that the OpenBLAS defaults tend to give you a very slow dot, slower than just using a pure Julia one. That was 1/3 of the time, so I think it might be hitting some of that, and a manual SIMD implementation might be better. Of course, if someone is using MKL that isn't quite the case, so it should probably check the vendor, etc. Another 1/3 was in the forward and backward substitutions in IncompleteLU.jl. I know @chriselrod did a bunch of similar optimizations before in https://github.com/JuliaSIMD/TriangularSolve.jl, so that preconditioner implementation might be able to be accelerated, but that's separate from the Krylov.jl time.
@time solve(prob_ode_brusselator_2d,Rosenbrock23(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 0.891711 seconds (230.15 k allocations: 184.457 MiB, 5.10% gc time)
@time solve(prob_ode_brusselator_2d,Rosenbrock23(linsolve=IterativeSolversJL_GMRES()),save_everystep=false);
# 6.231675 seconds (1.37 M allocations: 206.219 MiB, 0.28% gc time)
@time solve(prob_ode_brusselator_2d,Rodas4(linsolve=IterativeSolversJL_GMRES()),save_everystep=false);
# 8.095367 seconds (1.82 M allocations: 296.098 MiB, 0.38% gc time)
@time solve(prob_ode_brusselator_2d,Rodas4(linsolve=KrylovJL_GMRES()),save_everystep=false);
# 1.215097 seconds (317.01 k allocations: 152.230 MiB, 3.26% gc time)
I will do some tests with a pure Julia dot; we also have this issue with the CUDA dot implementation for solving linear systems on GPU. If only the MKL version is efficient, I can easily change the dispatch to check the BLAS vendor and use a pure Julia dot in our Krylov macros (https://github.com/JuliaSmoothOptimizers/Krylov.jl/blob/main/src/krylov_utils.jl#L203-L220).
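A minimal sketch of what that vendor check could look like (kdot and mkl_loaded are hypothetical names, not Krylov.jl's actual code; the query uses the libblastrampoline API available since Julia 1.7):
using LinearAlgebra

# One way to check whether MKL is the loaded BLAS backend
mkl_loaded() = any(lib -> occursin("mkl", lowercase(basename(lib.libname))),
                   BLAS.get_config().loaded_libs)

# Vendor-dispatched dot: BLAS when MKL is loaded, otherwise a pure-Julia SIMD loop
function kdot(n::Integer, x::Vector{T}, y::Vector{T}) where T <: BLAS.BlasReal
    mkl_loaded() && return BLAS.dot(n, x, 1, y, 1)
    s = zero(T)
    @inbounds @simd for i in 1:n
        s += x[i] * y[i]
    end
    return s
end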
For GPU problems, broadcast is sometimes more efficient than CUBLAS routines, but I never checked on linear systems stored on CPU.
Specifically, this is the issue that can really hurt users of default Julia installations: https://github.com/JuliaLang/julia/issues/33409
You opened the issue in 2019 and it's still not fixed?! :disappointed: I will do some benchmarks with MKL and OpenBLAS ASAP.
I did some benchmarks with MKL and OpenBLAS for dot products. The relative difference between them is small.
┌───────────┬────────────────────┬────────────────────┬─────────┐
│ Dimension │ OpenBLAS 4 threads │ OpenBLAS 2 threads │ MKL │
├───────────┼────────────────────┼────────────────────┼─────────┤
│ 10.0 │ 1.02666 │ 1.0 │ 1.38784 │
│ 100.0 │ 1.01277 │ 1.0 │ 1.00692 │
│ 1000.0 │ 1.04922 │ 1.04665 │ 1.0 │
│ 10000.0 │ 1.13293 │ 1.10058 │ 1.0 │
│ 100000.0 │ 1.10543 │ 1.10557 │ 1.0 │
│ 1.0e6 │ 1.03908 │ 1.0 │ 1.00173 │
│ 1.0e7 │ 1.02159 │ 1.05796 │ 1.0 │
└───────────┴────────────────────┴────────────────────┴─────────┘
Code used for the benchmarks:
using BenchmarkTools, LinearAlgebra

krylov_dot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasReal = BLAS.dot(n, x, dx, y, dy)

p = 7
time_openblas1 = zeros(p)
time_openblas2 = zeros(p)
time_mkl = zeros(p)

# OpenBLAS with all threads
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    @btime for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas1[i] = @belapsed for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
end

# OpenBLAS with half the threads
NMAX = Sys.CPU_THREADS
N = Int(NMAX / 2)
BLAS.set_num_threads(N)
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    @btime for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas2[i] = @belapsed for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
end

# MKL
using MKL
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    @btime for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_mkl[i] = @belapsed for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
end

using PrettyTables
dimension = [10^i for i = 1:p]
time = hcat(dimension, time_openblas1, time_openblas2, time_mkl)
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL"])
# Normalize each row by its fastest entry
for i = 1:p
    val = min(time[i,2], time[i,3], time[i,4])
    time[i,2] /= val
    time[i,3] /= val
    time[i,4] /= val
end
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL"])
Do we have an efficient Julia implementation of dot somewhere to easily compare with the BLAS versions?
function dot_julia(x, y)
    s = zero(Base.promote_eltype(x, y))
    @fastmath for i in eachindex(x, y)
        s += x[i]' * y[i]
    end
    return s
end
is fairly good, but will probably fall behind the multithreaded BLAS versions at large sizes on x86 CPUs, because those CPUs tend to get better total memory bandwidth when more than one core is active.
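For reference, bringing more cores' bandwidth to bear would need something like a chunked threaded variant (dot_threaded is a hypothetical name, assuming 1-based Vector inputs):
function dot_threaded(x, y)
    nt = Threads.nthreads()
    partials = zeros(Base.promote_eltype(x, y), nt)
    Threads.@threads for t in 1:nt
        # split 1:length(x) into nt contiguous, non-overlapping chunks
        lo = (t - 1) * length(x) ÷ nt + 1
        hi = t * length(x) ÷ nt
        s = zero(eltype(partials))
        @inbounds @simd for i in lo:hi
            s += x[i]' * y[i]
        end
        partials[t] = s
    end
    return sum(partials)
end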
┌───────────┬────────────────────┬────────────────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 4 threads │ OpenBLAS 2 threads │ MKL │ Julia │
├───────────┼────────────────────┼────────────────────┼─────────┼─────────┤
│ 10.0 │ 1.13365 │ 1.13204 │ 1.13028 │ 1.0 │
│ 100.0 │ 1.01151 │ 1.00737 │ 1.0 │ 1.62794 │
│ 1000.0 │ 1.01729 │ 1.01326 │ 1.0 │ 4.57984 │
│ 10000.0 │ 1.01232 │ 1.00192 │ 1.0 │ 3.5687 │
│ 100000.0 │ 1.10966 │ 1.08058 │ 1.0 │ 4.33227 │
│ 1.0e6 │ 1.02231 │ 1.02389 │ 1.0 │ 1.03161 │
│ 1.0e7 │ 1.0632 │ 1.11198 │ 1.06174 │ 1.0 │
└───────────┴────────────────────┴────────────────────┴─────────┴─────────┘
It definitely depends on the CPU.
┌───────────┬─────────────────────┬─────────────────────┬───────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │
├───────────┼─────────────────────┼─────────────────────┼───────────┤
│ 10.0 │ 7.95e-5 │ 8.2e-5 │ 1.1e-5 │
│ 100.0 │ 6.31e-5 │ 6.22e-5 │ 2.06e-5 │
│ 1000.0 │ 0.0001188 │ 0.0001227 │ 0.0001226 │
│ 10000.0 │ 0.0011194 │ 0.001119 │ 0.0007435 │
│ 100000.0 │ 0.0421721 │ 0.0706043 │ 0.0024566 │
│ 1.0e6 │ 0.0681789 │ 0.0977464 │ 0.0242952 │
│ 1.0e7 │ 3.81686 │ 3.74455 │ 3.88031 │
└───────────┴─────────────────────┴─────────────────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │
├───────────┼─────────────────────┼─────────────────────┼─────────┤
│ 10.0 │ 7.22727 │ 7.45455 │ 1.0 │
│ 100.0 │ 3.06311 │ 3.01942 │ 1.0 │
│ 1000.0 │ 1.0 │ 1.03283 │ 1.03199 │
│ 10000.0 │ 1.50558 │ 1.50504 │ 1.0 │
│ 100000.0 │ 17.1669 │ 28.7407 │ 1.0 │
│ 1.0e6 │ 2.80627 │ 4.02328 │ 1.0 │
│ 1.0e7 │ 1.01931 │ 1.0 │ 1.03626 │
└───────────┴─────────────────────┴─────────────────────┴─────────┘
julia> versioninfo()
Julia Version 1.7.0
Commit 3bf9d17731 (2021-11-30 12:12 UTC)
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: AMD Ryzen 9 5950X 16-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, znver3)
Environment:
JULIA_EDITOR = "C:\Users\accou\AppData\Local\atom\app-1.58.0\atom.exe" -a
JULIA_NUM_THREADS = 32
┌───────────┬─────────────────────┬─────────────────────┬───────────┬───────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │
├───────────┼─────────────────────┼─────────────────────┼───────────┼───────────┤
│ 10.0 │ 7.95e-5 │ 8.2e-5 │ 1.1e-5 │ 3.3125e-6 │
│ 100.0 │ 6.31e-5 │ 6.22e-5 │ 2.06e-5 │ 3.45e-5 │
│ 1000.0 │ 0.0001188 │ 0.0001227 │ 0.0001226 │ 0.0003104 │
│ 10000.0 │ 0.0011194 │ 0.001119 │ 0.0007435 │ 0.003095 │
│ 100000.0 │ 0.0421721 │ 0.0706043 │ 0.0024566 │ 0.0313813 │
│ 1.0e6 │ 0.0681789 │ 0.0977464 │ 0.0242952 │ 0.315843 │
│ 1.0e7 │ 3.81686 │ 3.74455 │ 3.88031 │ 3.15861 │
└───────────┴─────────────────────┴─────────────────────┴───────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │
├───────────┼─────────────────────┼─────────────────────┼─────────┼─────────┤
│ 10.0 │ 24.0 │ 24.7547 │ 3.32075 │ 1.0 │
│ 100.0 │ 3.06311 │ 3.01942 │ 1.0 │ 1.67476 │
│ 1000.0 │ 1.0 │ 1.03283 │ 1.03199 │ 2.61279 │
│ 10000.0 │ 1.50558 │ 1.50504 │ 1.0 │ 4.16274 │
│ 100000.0 │ 17.1669 │ 28.7407 │ 1.0 │ 12.7743 │
│ 1.0e6 │ 2.80627 │ 4.02328 │ 1.0 │ 13.0002 │
│ 1.0e7 │ 1.2084 │ 1.18551 │ 1.22849 │ 1.0 │
└───────────┴─────────────────────┴─────────────────────┴─────────┴─────────┘
MKL outperforms OpenBLAS and the Julia implementation for dot products. I did other benchmarks yesterday with complex numbers for scal!, axpy!, and axpby!, and MKL surpasses the other versions too (https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/481#issuecomment-1021935428, https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/481#issuecomment-1022288383).
Updated script with some fixes:
using BenchmarkTools, LinearAlgebra, LoopVectorization

# `@noinline`, otherwise the compiler optimizes it away
@noinline function dot_julia(x, y)
    s = zero(Base.promote_eltype(x, y))
    # I thought Julia's bound check elimination pass would be able to remove the
    # bounds checks because of the `eachindex`, but it doesn't, hence the `@inbounds`.
    @fastmath for i in eachindex(x, y)
        @inbounds s += x[i]' * y[i]
    end
    return s
end

function dot_tturbo(x, y)
    s = zero(Base.promote_eltype(x, y))
    @tturbo for i in eachindex(x, y)
        s += x[i] * y[i]
    end
    return s
end

krylov_dot(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasReal = BLAS.dot(n, x, dx, y, dy)

p = 7
time_openblas1 = zeros(p);
time_openblas2 = zeros(p);
time_mkl = zeros(p);
time_julia = zeros(p);
time_tturbo = zeros(p);

# Report the minimum time (and allocations) of a benchmark run
function belapsed_time(br)
    mintime = BenchmarkTools.time(minimum(br))
    allocs = BenchmarkTools.allocs(br)
    println("  ", BenchmarkTools.prettytime(mintime), " (", allocs,
            " allocation", allocs == 1 ? "" : "s", ": ",
            BenchmarkTools.prettymemory(BenchmarkTools.memory(br)), ")")
    return mintime
end

for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    b = @benchmark for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas1[i] = belapsed_time(b)
end

NMAX = Sys.CPU_THREADS
N = Int(NMAX / 2)
BLAS.set_num_threads(N)
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    b = @benchmark for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_openblas2[i] = belapsed_time(b)
end

using MKL
for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    dx = 1
    dy = 1
    b = @benchmark for k in 1:1000 krylov_dot($n, $x, $dx, $y, $dy) end
    time_mkl[i] = belapsed_time(b)
end

for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    b = @benchmark for k in 1:1000 dot_julia($x, $y) end
    time_julia[i] = belapsed_time(b)
end

for i = 1:p
    n = 10^i
    println("Dimension: ", n)
    T = Float64
    x = rand(T, n)
    y = rand(T, n)
    b = @benchmark for k in 1:1000 dot_tturbo($x, $y) end
    time_tturbo[i] = belapsed_time(b)
end

using PrettyTables
dimension = [10^i for i = 1:p];
time = hcat(dimension, time_openblas1, time_openblas2, time_mkl, time_julia, time_tturbo)
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL", "Julia", "@tturbo"])
# Normalize each row by its fastest entry
for i = 1:p
    v = @view(time[i, 2:end])
    v ./= minimum(v)
end
pretty_table(time ; header = ["Dimension", "OpenBLAS $NMAX threads", "OpenBLAS $N threads", "MKL", "Julia", "@tturbo"])
versioninfo()
I get:
┌───────────┬─────────────────────┬─────────────────────┬───────────┬───────────┬───────────┐
│ Dimension │ OpenBLAS 36 threads │ OpenBLAS 18 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼───────────┼───────────┼───────────┤
│ 10.0 │ 8624.33 │ 8708.33 │ 14244.0 │ 8806.0 │ 9252.0 │
│ 100.0 │ 12004.0 │ 11990.0 │ 15962.0 │ 12749.0 │ 12477.0 │
│ 1000.0 │ 55191.0 │ 55050.0 │ 54769.0 │ 55958.0 │ 50141.0 │
│ 10000.0 │ 905247.0 │ 907152.0 │ 1.33786e6 │ 916908.0 │ 921683.0 │
│ 100000.0 │ 4.74124e6 │ 4.11024e6 │ 2.74366e6 │ 5.47155e7 │ 3.47289e6 │
│ 1.0e6 │ 9.28879e7 │ 1.85967e7 │ 1.6299e7 │ 6.25422e8 │ 6.52952e7 │
│ 1.0e7 │ 2.13776e9 │ 2.08145e9 │ 2.08633e9 │ 1.2057e10 │ 2.08374e9 │
└───────────┴─────────────────────┴─────────────────────┴───────────┴───────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 36 threads │ OpenBLAS 18 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼─────────┼─────────┼─────────┤
│ 10.0 │ 1.0 │ 1.00974 │ 1.65161 │ 1.02106 │ 1.07278 │
│ 100.0 │ 1.00117 │ 1.0 │ 1.33128 │ 1.0633 │ 1.04062 │
│ 1000.0 │ 1.10072 │ 1.0979 │ 1.0923 │ 1.11601 │ 1.0 │
│ 10000.0 │ 1.0 │ 1.0021 │ 1.4779 │ 1.01288 │ 1.01816 │
│ 100000.0 │ 1.72807 │ 1.49809 │ 1.0 │ 19.9425 │ 1.26579 │
│ 1.0e6 │ 5.69899 │ 1.14097 │ 1.0 │ 38.3718 │ 4.00608 │
│ 1.0e7 │ 1.02705 │ 1.0 │ 1.00235 │ 5.79261 │ 1.0011 │
└───────────┴─────────────────────┴─────────────────────┴─────────┴─────────┴─────────┘
julia> versioninfo()
Julia Version 1.7.2-pre.0
Commit 3f024fd0ab* (2021-12-23 18:27 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
@tturbo isn't using that many threads for dot, which may be why it is falling behind for 10^6.
If it were to use all 18 cores, it would have enough L2 cache to fit:
julia> using CPUSummary
julia> CPUSummary.cache_size(Val(2)) * CPUSummary.num_l2cache() / sizeof(Float64)
2.359296e6
just under 2.36 million Float64 values in its combined L2 caches. The 1e6 benchmark requires 2e6 Float64s (two length-10^6 vectors), so if it were to use all 18 cores, it could keep the vectors in L2 cache, and we'd be benchmarking L2 -> core streaming performance.
Instead, @tturbo is only using 10 threads at that size. This doesn't fit in the L2 cache of 10 cores, meaning we're streaming memory from L3, and fewer cores are doing the streaming.
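Putting numbers on that (simple arithmetic from the capacity quoted above):
julia> 2 * 10^6 / 2.359296e6   # two length-1e6 vectors vs. all 18 cores' L2, in Float64s
0.8477105034722222

julia> 2.359296e6 * 10 / 18    # L2 capacity of only 10 cores, in Float64s
1.31072e6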
I could make LV consider this in deciding how many cores to use...
I should probably remove the `for k in 1:1000` loops in the above script.
@amontoison what CPU are you using? Maybe part of it is that OpenBLAS is just exceedingly awful on Ryzen, while MKL is just slacking on non-Intel chips.
┌───────────┬─────────────────────┬─────────────────────┬───────────┬───────────┬───────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼───────────┼───────────┼───────────┤
│ 10.0 │ 75100.0 │ 80600.0 │ 10800.0 │ 7050.0 │ 8733.33 │
│ 100.0 │ 68200.0 │ 67300.0 │ 20200.0 │ 10200.0 │ 13000.0 │
│ 1000.0 │ 117800.0 │ 121400.0 │ 124800.0 │ 60100.0 │ 60200.0 │
│ 10000.0 │ 1.1182e6 │ 1.1215e6 │ 749100.0 │ 1.0459e6 │ 642200.0 │
│ 100000.0 │ 4.62663e7 │ 9.06275e7 │ 2.5587e6 │ 1.21059e7 │ 2.0268e6 │
│ 1.0e6 │ 7.3046e7 │ 1.04869e8 │ 2.50255e7 │ 1.23583e8 │ 2.11222e7 │
│ 1.0e7 │ 3.53264e9 │ 3.69242e9 │ 3.88145e9 │ 4.29649e9 │ 3.70713e9 │
└───────────┴─────────────────────┴─────────────────────┴───────────┴───────────┴───────────┘
┌───────────┬─────────────────────┬─────────────────────┬─────────┬─────────┬─────────┐
│ Dimension │ OpenBLAS 32 threads │ OpenBLAS 16 threads │ MKL │ Julia │ @tturbo │
├───────────┼─────────────────────┼─────────────────────┼─────────┼─────────┼─────────┤
│ 10.0 │ 10.6525 │ 11.4326 │ 1.53191 │ 1.0 │ 1.23877 │
│ 100.0 │ 6.68627 │ 6.59804 │ 1.98039 │ 1.0 │ 1.27451 │
│ 1000.0 │ 1.96007 │ 2.01997 │ 2.07654 │ 1.0 │ 1.00166 │
│ 10000.0 │ 1.7412 │ 1.74634 │ 1.16646 │ 1.62862 │ 1.0 │
│ 100000.0 │ 22.8273 │ 44.7146 │ 1.26243 │ 5.97291 │ 1.0 │
│ 1.0e6 │ 3.45826 │ 4.96486 │ 1.1848 │ 5.85086 │ 1.0 │
│ 1.0e7 │ 1.0 │ 1.04523 │ 1.09874 │ 1.21623 │ 1.04939 │
└───────────┴─────────────────────┴─────────────────────┴─────────┴─────────┴─────────┘
julia> versioninfo()
Julia Version 1.7.0
Commit 3bf9d17731 (2021-11-30 12:12 UTC)
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: AMD Ryzen 9 5950X 16-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, znver3)
Environment:
JULIA_EDITOR = "C:\Users\accou\AppData\Local\atom\app-1.58.0\atom.exe" -a
JULIA_NUM_THREADS = 32
Maybe part of it is that OpenBLAS is just exceedingly awful on Ryzen
If you were on Julia 1.6, this would be the case: Julia 1.6.5 used OpenBLAS 0.3.10 (https://github.com/JuliaLang/julia/blob/v1.6.5/deps/openblas.version), while it was 0.3.11 that added Zen3 support/detection (https://github.com/xianyi/OpenBLAS/releases/tag/v0.3.11) with this commit: https://github.com/xianyi/OpenBLAS/commit/200f5c44cc14f356d7dba6af257044016a0573da. But Julia 1.7 is using 0.3.13 (https://github.com/JuliaLang/julia/blob/v1.7.1/deps/openblas.version), so it should be fine.
If you don't mind building from source, you could try something like this in your Make.user:
USE_BINARYBUILDER_OPENBLAS=0
OPENBLAS_TARGET_ARCH=HASWELL
to see if pretending to be some other CPU makes OpenBLAS better.
@ChrisRackauckas I have an Intel i5-6200U (Skylake architecture)
Julia Version 1.7.1
Commit ac5cc99908 (2021-12-22 19:35 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, skylake)
Environment:
JULIA_PARDISO = /home/alexis/Applications/PARDISO
JULIA_NUM_THREADS = 4
The SpMVs of SuiteSparse:GraphBLAS can be 3+ times faster than MKLSparse (and probably even better on Ryzen, although that's untested). What do I need to do to use the SpMV from SuiteSparseGraphBLAS.jl in Krylov? I'd prefer not to commit type piracy like MKLSparse.jl does unless I have to.
We could change all mul! calls into @kmul! and then add a special dispatch to the SpMVs of GraphBLAS when they can be used here. We do that for all BLAS 1 operations (scal!, axpy!, axpby!, etc.). For instance, the generic @kaxpby! was based on broadcasting in the past to have an efficient version for GPUs; I wanted to avoid the generic Julia implementation of axpby!.
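A minimal sketch of that indirection (kmul! is a hypothetical name; the wiring shown is an assumption, not Krylov.jl's actual code):
using LinearAlgebra

# Krylov.jl would route every product through its own function,
# with the generic mul! as the default...
kmul!(y, A, x) = mul!(y, A, x)

# ...so another package can redirect a given matrix type to its faster SpMV by
# overloading kmul! (a function Krylov.jl owns) instead of pirating Base's mul!,
# e.g. by wrapping a SparseMatrixCSC in a GraphBLAS-backed type up front and
# defining kmul! for that wrapper.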
@Wimmerer I created a branch for you: https://github.com/JuliaSmoothOptimizers/Krylov.jl/tree/spmv_graphblas. You just need to add your methods there. :smiley: If you open a PR, I can easily use our JSOBot to run the benchmarks and see the improvements (https://github.com/JuliaSmoothOptimizers/Krylov.jl/pull/456).
Can you print the # of Krylov iterations? I'd like to make sure all solvers are doing the same # of iterations.
https://github.com/SciML/OrdinaryDiffEq.jl/issues/1571
@vpuri3 Is it related to this issue?
#1571 is about OrdinaryDiffEq passing the verbose argument to LinearSolve to print Krylov solve stats.
Can you print the # of Krylov iterations? I'd like to make sure all solvers are doing the same # of iterations.
You can check sol.destats at the end to see the number of f calls.
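For example, with the ILU-preconditioned KenCarp47 solve from above:
sol = solve(prob_ode_brusselator_2d_sparse,
            KenCarp47(linsolve = KrylovJL_GMRES(), precs = incompletelu, concrete_jac = true),
            save_everystep = false)
sol.destats  # DEStats: number of f evaluations (nf), linear solves (nsolve), Jacobians (njacs), ...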