CUSOLVER.jl
Parallel SVD on GPU
I'm trying to implement a parallel SVD over column-block subarrays of a matrix Ag, as follows:
using LinearAlgebra, SparseArrays
using CUDA
using BenchmarkTools
# concrete type of view(A, :, r) for a dense Matrix (unused below, kept for reference)
const MatrixView{T} = SubArray{T, 2, Matrix{T}, Tuple{Base.Slice{Base.OneTo{Int}}, UnitRange{Int}}, true}
n = 6000                  # total number of columns of A
m = Int(ceil(sqrt(n)))    # number of rows of A
p = 200
blocksize = 3             # width of each column block
blocknum = n ÷ blocksize  # number of blocks (blocksize divides n)
A = rand(m, n)
B = rand(n, p)
C = zeros(m, p)
Ag = CuArray(A)           # device copies
Bg = CuArray(B)
Cg = CuArray(C)
# CPU code
@time begin
    @inbounds Threads.@threads for i = 1:blocknum
        startidx = (i-1)*blocksize + 1
        endidx = i*blocksize
        work = view(A, :, startidx:endidx)
        F = svd(work)
        mul!(work, F.U, F.Vt)  # overwrite the block with its orthonormal projection U*Vt
    end
end
mul!(C, A, B)
# GPU code
function parallel_svd!(Ag, blocksize, blocknum)
    i = (blockIdx().x - 1)*blockDim().x + threadIdx().x
    if i <= blocknum
        startidx = (i-1)*blocksize + 1
        endidx = i*blocksize
        work = view(Ag, :, startidx:endidx)
        U, S, Vt = CUDA.CUSOLVER.gesvd!('S', 'S', work)  # this host-library call is what fails to compile
        mul!(work, U, Vt)
    end
    return
end
CUDA.@sync @cuda threads=1024 blocks=cld(blocknum, 1024) parallel_svd!(Ag, blocksize, blocknum)
mul!(Cg, Ag, Bg)
However, the GPU implementation fails with the following error:
julia> include("admm_bm_Stiefel_gpu.jl")
ERROR: LoadError: InvalidIRError: compiling MethodInstance for parallel_svd!(::CuDeviceMatrix{Float64, 1}, ::Int64, ::Int64) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to gesvd!)
Stacktrace:
[1] parallel_svd!
@ D:\code\paper_code\paper_admm_bm-main\src\admm_bm_Stiefel_gpu.jl:40
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\validation.jl:149
[2] macro expansion
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:415 [inlined]
[3] macro expansion
@ C:\Users\ddt00\.julia\packages\TimerOutputs\RsWnF\src\TimerOutput.jl:253 [inlined]
[4] macro expansion
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:414 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\utils.jl:89
[6] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:129
[7] codegen
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:110 [inlined]
[8] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:106
[9] compile
@ C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:98 [inlined]
[10] #1037
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\compilation.jl:104 [inlined]
[11] JuliaContext(f::CUDA.var"#1037#1040"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\driver.jl:47
[12] compile(job::GPUCompiler.CompilerJob)
@ CUDA C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\compilation.jl:103
[13] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\execution.jl:125
[14] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler C:\Users\ddt00\.julia\packages\GPUCompiler\YO8Uj\src\execution.jl:103
[15] macro expansion
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:318 [inlined]
[16] macro expansion
@ .\lock.jl:267 [inlined]
[17] cufunction(f::typeof(parallel_svd!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, Int64, Int64}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:313
[18] cufunction(f::typeof(parallel_svd!), tt::Type{Tuple{CuDeviceMatrix{Float64, 1}, Int64, Int64}})
@ CUDA C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:310
[19] macro expansion
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\compiler\execution.jl:104 [inlined]
[20] macro expansion
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\utilities.jl:25 [inlined]
[21] top-level scope
@ D:\code\paper_code\paper_admm_bm-main\src\admm_bm_Stiefel_gpu.jl:47
[22] include(fname::String)
@ Base.MainInclude .\client.jl:478
[23] top-level scope
@ REPL[22]:1
[24] top-level scope
@ C:\Users\ddt00\.julia\packages\CUDA\tVtYo\src\initialization.jl:185
in expression starting at D:\code\paper_code\paper_admm_bm-main\src\admm_bm_Stiefel_gpu.jl:46
It seems that gesvd! can't be called inside a kernel function? What is the right way to run the SVD of each subarray of Ag in parallel, given a fixed blocksize = 3 and an arbitrarily large n?