CUDA.jl
CUDA.jl copied to clipboard
WMMA Float64
Sister PR of https://github.com/JuliaGPU/CUDA.jl/pull/1419 which tries to add TensorFloat32 (TF32) support and https://github.com/JuliaGPU/CUDA.jl/pull/1425 which tries to add BFloat16 (BF16) support.
cc: @HenriDeh, @thomasfaingnaert
First test kernel:
# Low-level smoke-test kernel for the Float64 WMMA wrappers (the `m8n8k4`,
# `col`, `global`, `stride` variants, going by the wrapper names).
#
# Loads one fragment each from `a_dev`, `b_dev` and `c_dev`, passes the three
# fragments to the `mma` wrapper, and stores the resulting fragment through
# `d_dev`. Presumably this computes D = A * B + C -- confirm against the WMMA
# documentation. Always returns `nothing`.
function kernel_wmma_f64_lowlevel(a_dev, b_dev, c_dev, d_dev)
    # Strides handed to the load/store wrappers.
    # NOTE(review): these look like the leading dimensions of an 8x4 A, a 4x8 B
    # and 8x8 C/D matrices stored column-major -- confirm against the caller.
    stride_a = 8
    stride_b = 4
    stride_cd = 8

    frag_a = WMMA.llvm_wmma_load_a_col_m8n8k4_global_stride_f64(pointer(a_dev), stride_a)
    frag_b = WMMA.llvm_wmma_load_b_col_m8n8k4_global_stride_f64(pointer(b_dev), stride_b)
    frag_c = WMMA.llvm_wmma_load_c_col_m8n8k4_global_stride_f64(pointer(c_dev), stride_cd)

    frag_d = WMMA.llvm_wmma_mma_col_col_m8n8k4_f64(frag_a, frag_b, frag_c)

    WMMA.llvm_wmma_store_d_col_m8n8k4_global_stride_f64(pointer(d_dev), frag_d, stride_cd)
    return nothing
end
# Host-side driver for `kernel_wmma_f64_lowlevel`.
#
# Allocates random Float64 device matrices A (m x k), B (k x n) and C (m x n)
# with m = n = 8 and k = 4, plus a zero-initialised D (m x n), launches the
# kernel on them and blocks until it has finished. The contents of D are not
# inspected; this only checks that the kernel compiles and runs.
# Returns `nothing`.
function call_kernel()
    m = n = 8
    k = 4
    dtype_a = dtype_b = Float64
    dtype_c = dtype_d = Float64

    d_a = CUDA.rand(dtype_a, m, k)
    d_b = CUDA.rand(dtype_b, k, n)
    d_c = CUDA.rand(dtype_c, m, n)
    d_d = CUDA.zeros(dtype_d, m, n)

    # WMMA operations are warp-wide: every thread of a warp has to take part in
    # each load/mma/store. `@cuda` launches a single thread by default, so ask
    # for one full warp (32 threads) explicitly.
    CUDA.@sync @cuda threads=32 kernel_wmma_f64_lowlevel(d_a, d_b, d_c, d_d)
    return nothing
end
Current output (Julia 1.8-beta1):
julia> call_kernel()
signal (11): Segmentation fault
in expression starting at REPL[4]:1
_ZN4llvm19MachineRegisterInfo17constrainRegClassENS_8RegisterEPKNS_19TargetRegisterClassEj at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0
-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)
_ZN4llvm12InstrEmitter18AddRegisterOperandERNS_19MachineInstrBuilderENS_7SDValueEjPKNS_11MCInstrDescERNS_8DenseMapIS3_NS_8RegisterENS_12DenseMapInfoIS3_
EENS_6detail12DenseMapPairIS3_S8_EEEEbbb at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unk
nown line)
_ZN4llvm12InstrEmitter15EmitMachineNodeEPNS_6SDNodeEbbRNS_8DenseMapINS_7SDValueENS_8RegisterENS_12DenseMapInfoIS4_EENS_6detail12DenseMapPairIS4_S5_EEEE
at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)
_ZN4llvm18ScheduleDAGSDNodes12EmitScheduleERNS_26MachineInstrBundleIteratorINS_12MachineInstrELb0EEE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/j
ulia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)
_ZN4llvm16SelectionDAGISel17CodeGenAndEmitDAGEv at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.
so (unknown line)
_ZN4llvm16SelectionDAGISel20SelectAllBasicBlocksERKNS_8FunctionE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/ju
lia/libLLVM-13jl.so (unknown line)
_ZN4llvm16SelectionDAGISel20runOnMachineFunctionERNS_15MachineFunctionE.part.899 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x
64/bin/../lib/julia/libLLVM-13jl.so (unknown line)
_ZN4llvm19MachineFunctionPass13runOnFunctionERNS_8FunctionE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/l
ibLLVM-13jl.so (unknown line)
_ZN4llvm13FPPassManager13runOnFunctionERNS_8FunctionE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM
-13jl.so (unknown line)
_ZN4llvm13FPPassManager11runOnModuleERNS_6ModuleE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13j
l.so (unknown line)
_ZN4llvm6legacy15PassManagerImpl3runERNS_6ModuleE at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13j
l.so (unknown line)
_ZL21LLVMTargetMachineEmitP23LLVMOpaqueTargetMachineP16LLVMOpaqueModuleRN4llvm17raw_pwrite_streamE19LLVMCodeGenFileTypePPc at /scratch/pc2-mitarbeiter/b
auerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown line)
LLVMTargetMachineEmitToMemoryBuffer at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/bin/../lib/julia/libLLVM-13jl.so (unknown
line)
LLVMTargetMachineEmitToMemoryBuffer at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/lib/13/libLLVM_h.jl:947 [inlined]
emit at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/src/targetmachine.jl:45
mcgen at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/mcgen.jl:74
unknown function (ip: 0x14ae3a0f3bff)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:339 [inlined]
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
macro expansion at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:336 [inlined]
#emit_asm#137 at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/utils.jl:64
emit_asm##kw at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/utils.jl:62 [inlined]
cufunction_compile at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:326
cached_compilation at /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/cache.jl:90
#cufunction#255 at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:297
cufunction at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:291 [inlined]
macro expansion at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:102 [inlined]
macro expansion at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/utilities.jl:25 [inlined]
call_kernel at ./REPL[3]:12
unknown function (ip: 0x14ae588e49bf)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
do_call at /buildworker/worker/package_linux64/build/src/interpreter.c:126
eval_value at /buildworker/worker/package_linux64/build/src/interpreter.c:215
eval_stmt_value at /buildworker/worker/package_linux64/build/src/interpreter.c:166 [inlined]
eval_body at /buildworker/worker/package_linux64/build/src/interpreter.c:612
jl_interpret_toplevel_thunk at /buildworker/worker/package_linux64/build/src/interpreter.c:750
jl_toplevel_eval_flex at /buildworker/worker/package_linux64/build/src/toplevel.c:906
jl_toplevel_eval_flex at /buildworker/worker/package_linux64/build/src/toplevel.c:850
eval_body at /buildworker/worker/package_linux64/build/src/interpreter.c:556
eval_body at /buildworker/worker/package_linux64/build/src/interpreter.c:522
jl_interpret_toplevel_thunk at /buildworker/worker/package_linux64/build/src/interpreter.c:750
jl_toplevel_eval_flex at /buildworker/worker/package_linux64/build/src/toplevel.c:906
ijl_toplevel_eval_in at /buildworker/worker/package_linux64/build/src/toplevel.c:965
eval at ./boot.jl:368 [inlined]
eval_user_input at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:151
repl_backend_loop at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:247
start_repl_backend at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:232
#run_repl#47 at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:369
run_repl at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.8/REPL/src/REPL.jl:356
jfptr_run_repl_64273.clone_1 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/lib/julia/sys.so (unknown line)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
#960 at ./client.jl:419
jfptr_YY.960_31015.clone_1 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/lib/julia/sys.so (unknown line)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
jl_f__call_latest at /buildworker/worker/package_linux64/build/src/builtins.c:769
#invokelatest#2 at ./essentials.jl:729 [inlined]
invokelatest at ./essentials.jl:727 [inlined]
run_main_repl at ./client.jl:404
exec_options at ./client.jl:318
_start at ./client.jl:522
jfptr__start_59889.clone_1 at /scratch/pc2-mitarbeiter/bauerc/.julia/juliaup/julia-1.8.0-beta1+0~x64/lib/julia/sys.so (unknown line)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
true_main at /buildworker/worker/package_linux64/build/src/jlapi.c:562
jl_repl_entrypoint at /buildworker/worker/package_linux64/build/src/jlapi.c:706
main at julia-beta (unknown line)
__libc_start_main at /lib64/libc.so.6 (unknown line)
unknown function (ip: 0x400808)
Allocations: 52148354 (Pool: 52125813; Big: 22541); GC: 50
Segmentation fault (core dumped)
After the latest commit (fixing the fragment sizes) I get:
julia> call_kernel()
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/stream.jl:139 [inlined]
[4] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/context.jl:325 [inlined]
[5] device_synchronize()
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/lib/cudadrv/context.jl:319
[6] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/initialization.jl:54
caused by: LLVM error: Cannot select: intrinsic %llvm.nvvm.wmma.m8n8k4.store.d.col.stride.f64 [12/147]
Stacktrace:
[1] handle_error(reason::Cstring)
@ LLVM /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/src/core/context.jl:105
[2] LLVMTargetMachineEmitToMemoryBuffer
@ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/lib/13/libLLVM_h.jl:947 [inlined]
[3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
@ LLVM /scratch/pc2-mitarbeiter/bauerc/.julia/packages/LLVM/MJqe4/src/targetmachine.jl:45
[4] mcgen(job::GPUCompiler.CompilerJob, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/mcgen.jl:74
[5] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
[6] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:339 [inlined]
[7] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
[8] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/driver.jl:336 [inlined]
[9] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/utils.jl:64
[10] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:326
[11] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler /scratch/pc2-mitarbeiter/bauerc/.julia/packages/GPUCompiler/I9fZc/src/cache.jl:90
[12] cufunction(f::typeof(kernel_wmma_f64_lowlevel), tt::Type{NTuple{4, CuDeviceMatrix{Float64, 1}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:297
[13] cufunction
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:291 [inlined]
[14] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/compiler/execution.jl:102 [inlined]
[15] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/utilities.jl:25 [inlined]
[16] call_kernel()
@ Main ./REPL[4]:12
[17] top-level scope
@ REPL[5]:1
[18] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_f64/src/initialization.jl:52