CUDA.jl
CUDA.jl copied to clipboard
WMMA BFloat16 (BF16)
Sister PR of https://github.com/JuliaGPU/CUDA.jl/pull/1419 which tries to add TensorFloat32 (TF32) support.
While TF32 has higher priority for me, I thought that, in the spirit of "while I'm at it", it would make sense to add BF16 support simultaneously.
cc: @HenriDeh, @thomasfaingnaert
First test kernel:
"""
    kernel_wmma_bf16_lowlevel(a_dev, b_dev, c_dev, d_dev)

Device kernel that chains the low-level WMMA wrappers for one `m16n16k16`
tile: load the `a`, `b` and `c` fragments through the pointers of `a_dev`,
`b_dev` and `c_dev`, run the bf16 `mma` on them, and store the resulting
fragment through `pointer(d_dev)`.

Every load and the store is passed a stride of 16. Returns `nothing`.
"""
function kernel_wmma_bf16_lowlevel(a_dev, b_dev, c_dev, d_dev)
    # Stride argument shared by all fragment loads and the final store.
    ld = 16

    frag_a = WMMA.llvm_wmma_load_a_col_m16n16k16_global_stride_bf16(pointer(a_dev), ld)
    frag_b = WMMA.llvm_wmma_load_b_col_m16n16k16_global_stride_bf16(pointer(b_dev), ld)
    frag_c = WMMA.llvm_wmma_load_c_col_m16n16k16_global_stride_f32(pointer(c_dev), ld)

    # Combine the three input fragments into the output fragment.
    frag_d = WMMA.llvm_wmma_mma_col_col_m16n16k16_bf16(frag_a, frag_b, frag_c)

    WMMA.llvm_wmma_store_d_col_m16n16k16_global_stride_f32(pointer(d_dev), frag_d, ld)
    return nothing
end
"""
    call_kernel()

Host-side driver for `kernel_wmma_bf16_lowlevel`.

Allocates 16×16 device matrices (`CUDA.BFloat16` for the two random inputs
`d_a` and `d_b`, `Float32` for the random input `d_c` and the zero-initialised
output `d_d`), launches the kernel on one warp, and waits for it to finish via
`CUDA.@sync`. Returns `nothing`.
"""
function call_kernel()
    m = n = k = 16
    dtype_a = dtype_b = CUDA.BFloat16
    dtype_c = dtype_d = Float32

    d_a = CUDA.rand(dtype_a, m, k)
    d_b = CUDA.rand(dtype_b, k, n)
    d_c = CUDA.rand(dtype_c, m, n)
    d_d = CUDA.zeros(dtype_d, m, n)

    # WMMA load/mma/store are warp-wide operations: all 32 threads of a warp
    # have to execute them together. Without `threads=32`, `@cuda` launches a
    # single thread, which leaves the rest of the warp out of the operation.
    CUDA.@sync @cuda threads=32 kernel_wmma_bf16_lowlevel(d_a, d_b, d_c, d_d)
    return nothing
end
Current output (on Julia 1.8-beta1):
julia> call_kernel()
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
[4] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:325 [inlined]
[5] device_synchronize()
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:319
[6] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:54
caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
[4] synchronize(stream::CuStream; blocking::Nothing)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:128
[5] synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:122 [inlined]
[6] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:186 [inlined]
[7] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/error.jl:67 [inlined]
[8] curandCreateGenerator(typ::CUDA.CURAND.curandRngType)
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/wrappers.jl:5
[9] CUDA.CURAND.RNG(typ::CUDA.CURAND.curandRngType; stream::CuStream)
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13
[10] RNG (repeats 2 times)
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13 [inlined]
[11] #167
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:38 [inlined]
[12] (::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})()
@ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:24
[13] lock(f::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext}, l::ReentrantLock)
@ Base ./lock.jl:185
[14] (::CUDA.APIUtils.var"#check_cache#9"{CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})(f::CUDA.CURAND.var"#167#173")
@ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:22
[15] pop!
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:46 [inlined]
[16] (::CUDA.CURAND.var"#new_state#172")(cuda::NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}})
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:37
[17] #170
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:51 [inlined]
[18] get!(default::CUDA.CURAND.var"#170#176"{CUDA.CURAND.var"#new_state#172", NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}}}, h::Dict{CuContext, NamedTuple{(:rng,), Tuple{CUDA.CURAND.RNG}}}, key::CuContext)
@ Base ./dict.jl:481
[19] default_rng()
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:50
[20] curand_rng
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:229 [inlined]
[21] rand(T::Type{Float32}, dim1::Int64, dims::Int64)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:249
[22] call_kernel()
@ Main ./REPL[2]:8
[23] top-level scope
@ REPL[3]:1
[24] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:52
I can't add the "cuda kernels" and "enhancement" labels (being a member of JuliaGPU is apparently not enough).
Comment: I'm not sure what the fragment type (`map_ptx_to_jl_frag`) and size (`map_frag_sizes`) should be...
After the latest commit (fixing the fragment sizes), I still get:
julia> call_kernel()
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
[4] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:325 [inlined]
[5] device_synchronize()
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:319
[6] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:54
caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
[4] synchronize(stream::CuStream; blocking::Nothing)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:128
[5] synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:122 [inlined]
[6] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:186 [inlined]
[7] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/error.jl:67 [inlined]
[8] curandCreateGenerator(typ::CUDA.CURAND.curandRngType)
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/wrappers.jl:5
[9] CUDA.CURAND.RNG(typ::CUDA.CURAND.curandRngType; stream::CuStream)
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13
[10] RNG (repeats 2 times)
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13 [inlined]
[11] #167
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:38 [inlined]
[12] (::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})()
julia>
WARNING: Error while freeing DeviceBuffer(512 bytes at 0x0000000a02000200):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc), meta=nothing)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:101 [inlined]
[3] cuMemFreeAsync(dptr::CUDA.Mem.DeviceBuffer, hStream::CuStream)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/call.jl:26
[4] #free#2
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/memory.jl:97 [inlined]
[5] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:58 [inlined]
[6] macro expansion
@ ./timing.jl:359 [inlined]
[7] #actual_free#189
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:57 [inlined]
[8] #_free#207
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:323 [inlined]
[9] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:288 [inlined]
[10] macro expansion
@ ./timing.jl:359 [inlined]
[11] #free#206
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:287 [inlined]
[12] #212
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/array.jl:79 [inlined]
[13] context!(f::CUDA.var"#212#213"{CuArray{Float16, 2, CUDA.Mem.DeviceBuffer}, CuStream}, ctx::CuContext; skip_destroyed::Bool)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/state.jl:164
[14] unsafe_free!(xs::CuArray{Float16, 2, CUDA.Mem.DeviceBuffer}, stream::CuStream)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/array.jl:78
[15] unsafe_finalize!(xs::CuArray{Float16, 2, CUDA.Mem.DeviceBuffer})
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/array.jl:99
error in running finalizer: CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc), meta=nothing)
throw_api_error at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
macro expansion at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:101 [inlined]
cuModuleUnload at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/call.jl:26
#27 at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/module.jl:82 [inlined]
#context!#63 at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/state.jl:164
unknown function (ip: 0x7fa4a50fa68b)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
context!##kw at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/state.jl:161
unsafe_unload! at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/module.jl:81
unknown function (ip: 0x7fa4a50fa212)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
run_finalizer at /buildworker/worker/package_linux64/build/src/gc.c:280
jl_gc_run_finalizers_in_list at /buildworker/worker/package_linux64/build/src/gc.c:367
run_finalizers at /buildworker/worker/package_linux64/build/src/gc.c:396
ijl_atexit_hook at /buildworker/worker/package_linux64/build/src/init.c:236
jl_repl_entrypoint at /buildworker/worker/package_linux64/build/src/jlapi.c:707
main at julia-beta (unknown line)
__libc_start_main at /lib64/libc.so.6 (unknown line)
unknown function (ip: 0x400808)
WARNING: Error while freeing DeviceBuffer(512 bytes at 0x0000000a02000000):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc), meta=nothing)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:101 [inlined]
[3] cuMemFreeAsync(dptr::CUDA.Mem.DeviceBuffer, hStream::CuStream)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/call.jl:26
[4] #free#2
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/memory.jl:97 [inlined]
[5] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:58 [inlined]
[6] macro expansion
@ ./timing.jl:359 [inlined]
[7] #actual_free#189
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:57 [inlined]
[8] #_free#207
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:323 [inlined]
[9] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:288 [inlined]
[10] macro expansion
@ ./timing.jl:359 [inlined]
[11] #free#206
julia> call_kernel()
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
[4] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:325 [inlined]
[5] device_synchronize()
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:319
[6] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:54
caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
[2] isdone
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
[3] nonblocking_synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
[4] synchronize(stream::CuStream; blocking::Nothing)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:128
[5] synchronize
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:122 [inlined]
[6] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:186 [inlined]
[7] macro expansion
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/error.jl:67 [inlined]
[8] curandCreateGenerator(typ::CUDA.CURAND.curandRngType)
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/wrappers.jl:5
[9] CUDA.CURAND.RNG(typ::CUDA.CURAND.curandRngType; stream::CuStream)
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13
[10] RNG (repeats 2 times)
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13 [inlined]
[11] #167
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:38 [inlined]
[12] (::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})()
@ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:24
[13] lock(f::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext}, l::ReentrantLock)
@ Base ./lock.jl:185
[14] (::CUDA.APIUtils.var"#check_cache#9"{CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})(f::CUDA.CURAND.var"#167#173")
@ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:22
[15] pop!
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:46 [inlined]
[16] (::CUDA.CURAND.var"#new_state#172")(cuda::NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}})
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:37
[17] #170
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:51 [inlined]
[18] get!(default::CUDA.CURAND.var"#170#176"{CUDA.CURAND.var"#new_state#172", NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}}}, h::Dict{CuContext, NamedTuple{(:rng,), Tuple{CUDA.CURAND.RNG}}}, key::CuContext)
@ Base ./dict.jl:481
[19] default_rng()
@ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:50
[20] curand_rng
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:229 [inlined]
[21] rand(T::Type{Float32}, dim1::Int64, dims::Int64)
@ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:249
[22] call_kernel()
@ Main ./REPL[3]:8
[23] top-level scope
@ REPL[4]:1
[24] top-level scope
@ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:52