CUDA.jl icon indicating copy to clipboard operation
CUDA.jl copied to clipboard

WMMA Float64

Open carstenbauer opened this issue 2 years ago • 2 comments

Sister PR of https://github.com/JuliaGPU/CUDA.jl/pull/1419 which tries to add TensorFloat32 (TF32) support and https://github.com/JuliaGPU/CUDA.jl/pull/1425 which tries to add BFloat16 (BF16) support.

cc: @HenriDeh, @thomasfaingnaert

carstenbauer avatar Mar 03 '22 09:03 carstenbauer

First test kernel:

"""
    kernel_wmma_f64_lowlevel(a_dev, b_dev, c_dev, d_dev)

Device kernel exercising the low-level `m8n8k4` `Float64` WMMA wrappers.

Loads the A, B and C fragments from `a_dev`, `b_dev` and `c_dev` (column layout,
strides 8, 4 and 8 respectively), performs one matrix-multiply-accumulate, and
stores the resulting D fragment into `d_dev` with stride 8. Returns `nothing`.

NOTE(review): the strides presumably correspond to the leading dimensions of
column-major 8×4 (A), 4×8 (B) and 8×8 (C, D) matrices — confirm against caller.
"""
function kernel_wmma_f64_lowlevel(a_dev, b_dev, c_dev, d_dev)
    # Strides handed to the load/store wrappers, one per operand.
    lda = 8
    ldb = 4
    ldc = 8
    ldd = 8

    frag_a = WMMA.llvm_wmma_load_a_col_m8n8k4_global_stride_f64(pointer(a_dev), lda)
    frag_b = WMMA.llvm_wmma_load_b_col_m8n8k4_global_stride_f64(pointer(b_dev), ldb)
    frag_c = WMMA.llvm_wmma_load_c_col_m8n8k4_global_stride_f64(pointer(c_dev), ldc)

    # Multiply-accumulate on the three loaded fragments.
    frag_d = WMMA.llvm_wmma_mma_col_col_m8n8k4_f64(frag_a, frag_b, frag_c)

    WMMA.llvm_wmma_store_d_col_m8n8k4_global_stride_f64(pointer(d_dev), frag_d, ldd)
    return nothing
end

"""
    call_kernel()

Host-side driver for `kernel_wmma_f64_lowlevel`.

Allocates random `Float64` device matrices A (`m×k`), B (`k×n`) and C (`m×n`)
plus a zero-initialised D (`m×n`) with `m = n = 8`, `k = 4`, launches the kernel
on a single full warp, and synchronizes. Returns `nothing`.
"""
function call_kernel()
    m = n = 8
    k = 4
    dtype_a = dtype_b = Float64
    dtype_c = dtype_d = Float64

    d_a = CUDA.rand(dtype_a, m, k)
    d_b = CUDA.rand(dtype_b, k, n)
    d_c = CUDA.rand(dtype_c, m, n)
    d_d = CUDA.zeros(dtype_d, m, n)

    # WMMA load/mma/store are warp-collective: every thread of a warp must take
    # part, so launch exactly one warp (32 threads) instead of the default
    # single thread.
    CUDA.@sync @cuda threads=32 kernel_wmma_f64_lowlevel(d_a, d_b, d_c, d_d)
    return nothing
end

Current output (Julia 1.8-beta1):

julia> call_kernel()                                                                                                                                    
                                                                                                                                                        
signal (11): Segmentation fault                                                                                                                         
in expression starting at REPL[4]:1                                                                                                                     
_ZN4llvm19MachineRegisterInfo17constrainRegClassENS_8RegisterEPKNS_19TargetRegisterClassEj at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0
-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)                                                                                            
_ZN4llvm12InstrEmitter18AddRegisterOperandERNS_19MachineInstrBuilderENS_7SDValueEjPKNS_11MCInstrDescERNS_8DenseMapIS3_NS_8RegisterENS_12DenseMapInfoIS3_
EENS_6detail12DenseMapPairIS3_S8_EEEEbbb at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unk
nown line)                                                                                                                                              
_ZN4llvm12InstrEmitter15EmitMachineNodeEPNS_6SDNodeEbbRNS_8DenseMapINS_7SDValueENS_8RegisterENS_12DenseMapInfoIS4_EENS_6detail12DenseMapPairIS4_S5_EEEE 
at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)                               
_ZN4llvm18ScheduleDAGSDNodes12EmitScheduleERNS_26MachineInstrBundleIteratorINS_12MachineInstrELb0EEE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/j
ulia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)                                                                                  
_ZN4llvm16SelectionDAGISel17CodeGenAndEmitDAGEv at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.
so (unknown line)                                                                                                                                       
_ZN4llvm16SelectionDAGISel20SelectAllBasicBlocksERKNS_8FunctionE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/ju
lia/libLLVM-13jl.so (unknown line)                                                                                                                      
_ZN4llvm16SelectionDAGISel20runOnMachineFunctionERNS_15MachineFunctionE.part.899 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x
64/bin/../lib/julia/libLLVM-13jl.so (unknown line)                                                                                                      
_ZN4llvm19MachineFunctionPass13runOnFunctionERNS_8FunctionE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/l
ibLLVM-13jl.so (unknown line)                                                                                                                           
_ZN4llvm13FPPassManager13runOnFunctionERNS_8FunctionE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM
-13jl.so (unknown line)                                                                                                                                 
_ZN4llvm13FPPassManager11runOnModuleERNS_6ModuleE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13j
l.so (unknown line)                                                                                                                                     
_ZN4llvm6legacy15PassManagerImpl3runERNS_6ModuleE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13j
l.so (unknown line)                                                                                                                                     
_ZL21LLVMTargetMachineEmitP23LLVMOpaqueTargetMachineP16LLVMOpaqueModuleRN4llvm17raw_pwrite_streamE19LLVMCodeGenFileTypePPc at /scratch/pc2-mitarbeiter/b
auerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)                                                            
LLVMTargetMachineEmitToMemoryBuffer at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown 
line)                                                                                                                                                   
LLVMTargetMachineEmitToMemoryBuffer at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/lib/13/libLLVM_h.jl:947 [inlined]                     
emit at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/src/targetmachine.jl:45                                                              
mcgen at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/mcgen.jl:74                                                              
unknown function (ip: 0x14ae3a0f3bff)                                                                                                                   
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]                                                                         
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522 
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]                                  
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:339 [inlined]                                        
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]                                  
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:336 [inlined]                                        
#emit_asm#137 at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/utils.jl:64                                                      
emit_asm##kw at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/utils.jl:62 [inlined]                                             
cufunction_compile at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:326                                                      
cached_compilation at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/cache.jl:90                                                 
#cufunction#255 at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:297
cufunction at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:291 [inlined]
macro expansion at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:102 [inlined]
macro expansion at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/utilities.jl:25 [inlined]
call_kernel at ./REPL[3]:12
unknown function (ip: 0x14ae588e49bf)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522 
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
do_call at /buildworker/worker/package_linux64/build/src/interpreter.c:126
eval_value at /buildworker/worker/package_linux64/build/src/interpreter.c:215
eval_stmt_value at /buildworker/worker/package_linux64/build/src/interpreter.c:166 [inlined]
eval_body at /buildworker/worker/package_linux64/build/src/interpreter.c:612 
jl_interpret_toplevel_thunk at /buildworker/worker/package_linux64/build/src/interpreter.c:750
jl_toplevel_eval_flex at /buildworker/worker/package_linux64/build/src/toplevel.c:906
jl_toplevel_eval_flex at /buildworker/worker/package_linux64/build/src/toplevel.c:850
eval_body at /buildworker/worker/package_linux64/build/src/interpreter.c:556 
eval_body at /buildworker/worker/package_linux64/build/src/interpreter.c:522 
jl_interpret_toplevel_thunk at /buildworker/worker/package_linux64/build/src/interpreter.c:750
jl_toplevel_eval_flex at /buildworker/worker/package_linux64/build/src/toplevel.c:906
ijl_toplevel_eval_in at /buildworker/worker/package_linux64/build/src/toplevel.c:965
eval at ./boot.jl:368 [inlined]
eval_user_input at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:151
repl_backend_loop at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:247
start_repl_backend at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:232
#run_repl#47 at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:369
run_repl at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:356
jfptr_run_repl_64273.clone_1 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/lib/julia/sys.so (unknown line)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522 
#960 at ./client.jl:419
jfptr_YY.960_31015.clone_1 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/lib/julia/sys.so (unknown line)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522 
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
jl_f__call_latest at /buildworker/worker/package_linux64/build/src/builtins.c:769
#invokelatest#2 at ./essentials.jl:729 [inlined]
invokelatest at ./essentials.jl:727 [inlined]
run_main_repl at ./client.jl:404
exec_options at ./client.jl:318
_start at ./client.jl:522
jfptr__start_59889.clone_1 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/lib/julia/sys.so (unknown line)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522 
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
true_main at /buildworker/worker/package_linux64/build/src/jlapi.c:562
jl_repl_entrypoint at /buildworker/worker/package_linux64/build/src/jlapi.c:706
main at julia-beta (unknown line)
__libc_start_main at /lib64/libc.so.6 (unknown line)
unknown function (ip: 0x400808)
Allocations: 52148354 (Pool: 52125813; Big: 22541); GC: 50
Segmentation fault (core dumped)

carstenbauer avatar Mar 03 '22 09:03 carstenbauer

After the latest commit (fixing the fragment sizes), I get:

julia> call_kernel()                                                                                                                                                                                                                     
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)                                                                                                                                            
Stacktrace:                                                                                                                                                                                                                              
 [1] throw_api_error(res::CUDA.cudaError_enum)                                                                                                                                                                                           
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/error.jl:91                                                                                                                                                         
 [2] isdone                                                                                                                                                                                                                              
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/stream.jl:109 [inlined]                                                                                                                                                  
 [3] nonblocking_synchronize                                                                                                                                                                                                             
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/stream.jl:139 [inlined]                                                                                                                                                  
 [4] nonblocking_synchronize                                                                                                                                                                                                             
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/context.jl:325 [inlined]                                                                                                                                                 
 [5] device_synchronize()                                                                                                                                                                                                                
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/context.jl:319                                                                                                                                                      
 [6] top-level scope                                                                                                                                                                                                                     
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/initialization.jl:54

caused by: LLVM error: Cannot select: intrinsic %llvm.nvvm.wmma.m8n8k4.store.d.col.stride.f64
Stacktrace:                                                                                                                                                                                                                              
  [1] handle_error(reason::Cstring)
    @ LLVM /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/src/core/context.jl:105
  [2] LLVMTargetMachineEmitToMemoryBuffer
    @ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/lib/13/libLLVM_h.jl:947 [inlined]
  [3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
    @ LLVM /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/src/targetmachine.jl:45
  [4] mcgen(job::GPUCompiler.CompilerJob, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/mcgen.jl:74
  [5] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
  [6] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:339 [inlined]
  [7] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
  [8] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:336 [inlined]
  [9] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/utils.jl:64
 [10] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:326
 [11] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/cache.jl:90
 [12] cufunction(f::typeof(kernel_wmma_f64_lowlevel), tt::Type{NTuple{4, CuDeviceMatrix{Float64, 1}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:297
 [13] cufunction
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:291 [inlined]
 [14] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:102 [inlined]
 [15] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/utilities.jl:25 [inlined]
 [16] call_kernel()
    @ Main ./REPL[4]:12
 [17] top-level scope
    @ REPL[5]:1
 [18] top-level scope
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/initialization.jl:52

carstenbauer avatar Mar 04 '22 15:03 carstenbauer