CUSOLVER.jl
Parallel SVD on GPU
I'm trying to implement a parallel SVD over column-block subarrays of a matrix Ag, as follows:
using LinearAlgebra, SparseArrays
using CUDA
using BenchmarkTools
# concrete type of view(A, :, r) for a dense Matrix (unused below, kept for reference)
const MatrixView{T} = SubArray{T, 2, Matrix{T}, Tuple{Base.Slice{Base.OneTo{Int}}, UnitRange{Int}}, true}
n = 6000                  # total number of columns of A
m = Int(ceil(sqrt(n)))    # number of rows of A
p = 200
blocksize = 3             # width of each column block
blocknum = n ÷ blocksize  # number of blocks (blocksize divides n)
A = rand(m, n)
B = rand(n, p)
C = zeros(m, p)
Ag = CuArray(A)           # device copies
Bg = CuArray(B)
Cg = CuArray(C)
# CPU code
@time begin
    @inbounds Threads.@threads for i = 1:blocknum
        startidx = (i-1)*blocksize + 1
        endidx = i*blocksize
        work = view(A, :, startidx:endidx)
        F = svd(work)
        mul!(work, F.U, F.Vt)  # overwrite the block with its orthonormal projection U*Vt
    end
end
mul!(C, A, B)
# GPU code
function parallel_svd!(Ag, blocksize, blocknum)
    i = (blockIdx().x - 1)*blockDim().x + threadIdx().x
    if i <= blocknum
        startidx = (i-1)*blocksize + 1
        endidx = i*blocksize
        work = view(Ag, :, startidx:endidx)
        U, S, Vt = CUDA.CUSOLVER.gesvd!('S', 'S', work)  # this host-library call is what fails to compile
        mul!(work, U, Vt)
    end
    return
end
CUDA.@sync @cuda threads=1024 blocks=cld(blocknum, 1024) parallel_svd!(Ag, blocksize, blocknum)
mul!(Cg, Ag, Bg)
However, the GPU implementation fails with the following error:
julia> include("admm_bm_Stiefel_gpu.jl")
ERROR: LoadError: InvalidIRError: compiling MethodInstance for parallel_svd!(::CuDeviceMatrix{Float64, 1}, ::Int64, ::Int64) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to gesvd!)
Stacktrace:
[1] parallel_svd!
@ D:\code\paper_code\paper_admm_bm-main\src\admm_bm_Stiefel_gpu.jl:40
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\validation.jl:149
[2] macro expansion
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:415 [inlined]
[3] macro expansion
@ C:\Users\ddt00\.julia\packages\TimerOutputs\RsWnF\src\TimerOutput.jl:253 [inlined]
[4] macro expansion
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:414 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\utils.jl:89
[6] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:129
[7] codegen
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:110 [inlined]
[8] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:106
[9] compile
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:98 [inlined]
[10] #1037
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\compilation.jl:104 [inlined]
[11] JuliaContext(f::CUDA.var"#1037#1040"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:47
[12] compile(job::GPUCompiler.CompilerJob)
@ CUDA C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\compilation.jl:103
[13] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\execution.jl:125
[14] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\execution.jl:103
[15] macro expansion
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:318 [inlined]
[16] macro expansion
@ .\lock.jl:267 [inlined]
[17] cufunction(f::typeof(parallel_svd!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, Int64, Int64}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:313
[18] cufunction(f::typeof(parallel_svd!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, Int64, Int64}})
@ CUDA C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:310
[19] macro expansion
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:104 [inlined]
[20] macro expansion
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\utilities.jl:25 [inlined]
[21] top-level scope
@ D:\code\paper_code\paper_admm_bm-main\src\admm_bm_Stiefel_gpu.jl:47
[22] include(fname::String)
@ Base.MainInclude .\client.jl:478
[23] top-level scope
@ REPL[22]:1
[24] top-level scope
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\initialization.jl:185
in expression starting at D:\code\paper_code\paper_admm_bm-main\src\admm_bm_Stiefel_gpu.jl:46
It seems that gesvd! can't be called inside a kernel function? What is the right way to run the SVD of each subarray of Ag in parallel, given a fixed blocksize = 3 and an arbitrarily large n?