CUDA.jl icon indicating copy to clipboard operation
CUDA.jl copied to clipboard

WMMA BFloat16 (BF16)

Open carstenbauer opened this issue 2 years ago • 4 comments

Sister PR of https://github.com/JuliaGPU/CUDA.jl/pull/1419 which tries to add TensorFloat32 (TF32) support.

While TF32 has higher priority for me, I thought that, in the spirit of "while I'm at it", it would make sense to add BF16 support simultaneously.

cc: @HenriDeh, @thomasfaingnaert

carstenbauer avatar Mar 03 '22 08:03 carstenbauer

First test kernel:

"""
    kernel_wmma_bf16_lowlevel(a_dev, b_dev, c_dev, d_dev)

Device kernel exercising the low-level `WMMA.llvm_wmma_*` wrappers for the
`m16n16k16` BFloat16 configuration.

Loads the `a` and `b` fragments through the `bf16` load wrappers and the `c`
fragment through the `f32` load wrapper, runs the `bf16` multiply-accumulate
wrapper on them, and stores the resulting fragment to `d_dev` through the `f32`
store wrapper. Every load and store is issued with a stride of 16. Always
returns `nothing`.
"""
function kernel_wmma_bf16_lowlevel(a_dev, b_dev, c_dev, d_dev)
    # Stride shared by all fragment loads and the final store.
    stride = 16

    # Fetch the three input fragments from the device arrays.
    frag_a = WMMA.llvm_wmma_load_a_col_m16n16k16_global_stride_bf16(pointer(a_dev), stride)
    frag_b = WMMA.llvm_wmma_load_b_col_m16n16k16_global_stride_bf16(pointer(b_dev), stride)
    frag_c = WMMA.llvm_wmma_load_c_col_m16n16k16_global_stride_f32(pointer(c_dev), stride)

    # Multiply-accumulate on the fragments, then write the result out to `d_dev`.
    frag_d = WMMA.llvm_wmma_mma_col_col_m16n16k16_bf16(frag_a, frag_b, frag_c)
    WMMA.llvm_wmma_store_d_col_m16n16k16_global_stride_f32(pointer(d_dev), frag_d, stride)

    return nothing
end
                                                                                                                                                        
"""
    call_kernel()

Host-side driver for `kernel_wmma_bf16_lowlevel`.

Allocates 16×16 device matrices (`CUDA.BFloat16` for the `a`/`b` operands,
`Float32` for the `c` input and the zero-initialised `d` output), launches the
kernel on one full warp, and synchronizes before returning `nothing`.
"""
function call_kernel()
    m = n = k = 16
    dtype_a = dtype_b = CUDA.BFloat16
    dtype_c = dtype_d = Float32

    d_a = CUDA.rand(dtype_a, m, k)
    d_b = CUDA.rand(dtype_b, k, n)
    d_c = CUDA.rand(dtype_c, m, n)
    d_d = CUDA.zeros(dtype_d, m, n)

    # WMMA load/mma/store are warp-wide operations: every thread of the warp
    # must take part. The default `@cuda` launch uses a single thread, so
    # request a full warp (32 threads) explicitly.
    CUDA.@sync @cuda threads=32 kernel_wmma_bf16_lowlevel(d_a, d_b, d_c, d_d)
    return nothing
end

Current output (on Julia 1.8-beta1):

julia> call_kernel()                                                                                                                                    
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)                                                           
Stacktrace:                                                                                                                                             
 [1] throw_api_error(res::CUDA.cudaError_enum)                                                                                                          
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91                                                                   
 [2] isdone                                                                                                                                             
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]                                                            
 [3] nonblocking_synchronize                                                                                                                            
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]                                                            
 [4] nonblocking_synchronize                                                                                                                            
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:325 [inlined]                                                           
 [5] device_synchronize()                                                                                                                               
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:319                                                                
 [6] top-level scope                                                                                                                                    
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:54                                                                       
                                                                                                                                                        
caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)  
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)                                                                                                         
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91                                                                  
  [2] isdone                                                                                                                                            
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]                                                           
  [3] nonblocking_synchronize                                                                                                                           
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]                                                           
  [4] synchronize(stream::CuStream; blocking::Nothing)                                                                                                  
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:128                                                                
  [5] synchronize                                                                                                                                       
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:122 [inlined]                                                           
  [6] macro expansion                                                                                                                                   
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:186 [inlined]                                                                     
  [7] macro expansion                                                                                                                                   
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/error.jl:67 [inlined]                                                              
  [8] curandCreateGenerator(typ::CUDA.CURAND.curandRngType)                                                                                             
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/wrappers.jl:5                                                          
  [9] CUDA.CURAND.RNG(typ::CUDA.CURAND.curandRngType; stream::CuStream)
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13
 [10] RNG (repeats 2 times)
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13 [inlined]
 [11] #167
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:38 [inlined]
 [12] (::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})()
    @ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:24
 [13] lock(f::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext}, l::ReentrantLock)
    @ Base ./lock.jl:185
 [14] (::CUDA.APIUtils.var"#check_cache#9"{CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})(f::CUDA.CURAND.var"#167#173")
    @ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:22
 [15] pop!
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:46 [inlined]
 [16] (::CUDA.CURAND.var"#new_state#172")(cuda::NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}})
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:37
 [17] #170
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:51 [inlined]
 [18] get!(default::CUDA.CURAND.var"#170#176"{CUDA.CURAND.var"#new_state#172", NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}}}, h::Dict{CuContext, NamedTuple{(:rng,), Tuple{CUDA.CURAND.RNG}}}, key::CuContext)
    @ Base ./dict.jl:481
 [19] default_rng()
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:50
 [20] curand_rng
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:229 [inlined]
 [21] rand(T::Type{Float32}, dim1::Int64, dims::Int64)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:249
 [22] call_kernel()
    @ Main ./REPL[2]:8
 [23] top-level scope
    @ REPL[3]:1
 [24] top-level scope
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:52

carstenbauer avatar Mar 03 '22 08:03 carstenbauer

I can't add the "cuda kernels" and "enhancement" labels (being a member of JuliaGPU is apparently not enough).

carstenbauer avatar Mar 03 '22 08:03 carstenbauer

Comment: I'm not sure what the fragment type (map_ptx_to_jl_frag) and size (map_frag_sizes) should be...

carstenbauer avatar Mar 03 '22 09:03 carstenbauer

After the latest commit (fixing the fragment sizes), I still get:

julia> call_kernel()
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
 [1] throw_api_error(res::CUDA.cudaError_enum)
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
 [2] isdone
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
 [3] nonblocking_synchronize
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
 [4] nonblocking_synchronize
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:325 [inlined]
 [5] device_synchronize()
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:319
 [6] top-level scope
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:54

caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
  [2] isdone
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
  [3] nonblocking_synchronize
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
  [4] synchronize(stream::CuStream; blocking::Nothing)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:128
  [5] synchronize
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:122 [inlined]
  [6] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:186 [inlined]
  [7] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/error.jl:67 [inlined]
  [8] curandCreateGenerator(typ::CUDA.CURAND.curandRngType)
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/wrappers.jl:5
  [9] CUDA.CURAND.RNG(typ::CUDA.CURAND.curandRngType; stream::CuStream)
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13
 [10] RNG (repeats 2 times)
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13 [inlined]
 [11] #167
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:38 [inlined]
 [12] (::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})()
julia> 
WARNING: Error while freeing DeviceBuffer(512 bytes at 0x0000000a02000200):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc), meta=nothing)

Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
  [2] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:101 [inlined]
  [3] cuMemFreeAsync(dptr::CUDA.Mem.DeviceBuffer, hStream::CuStream)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/call.jl:26
  [4] #free#2
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/memory.jl:97 [inlined]
  [5] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:58 [inlined]
  [6] macro expansion
    @ ./timing.jl:359 [inlined]
  [7] #actual_free#189
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:57 [inlined]
  [8] #_free#207
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:323 [inlined]
  [9] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:288 [inlined]
 [10] macro expansion
    @ ./timing.jl:359 [inlined]
 [11] #free#206
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:287 [inlined]
 [12] #212
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/array.jl:79 [inlined]
 [13] context!(f::CUDA.var"#212#213"{CuArray{Float16, 2, CUDA.Mem.DeviceBuffer}, CuStream}, ctx::CuContext; skip_destroyed::Bool)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/state.jl:164
 [14] unsafe_free!(xs::CuArray{Float16, 2, CUDA.Mem.DeviceBuffer}, stream::CuStream)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/array.jl:78
 [15] unsafe_finalize!(xs::CuArray{Float16, 2, CUDA.Mem.DeviceBuffer})
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/array.jl:99
error in running finalizer: CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc), meta=nothing)
throw_api_error at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
macro expansion at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:101 [inlined]
cuModuleUnload at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/call.jl:26
#27 at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/module.jl:82 [inlined]
#context!#63 at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/state.jl:164
unknown function (ip: 0x7fa4a50fa68b)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
context!##kw at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/state.jl:161
unsafe_unload! at /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/module.jl:81
unknown function (ip: 0x7fa4a50fa212)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2340 [inlined]
ijl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2522
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1825 [inlined]
run_finalizer at /buildworker/worker/package_linux64/build/src/gc.c:280
jl_gc_run_finalizers_in_list at /buildworker/worker/package_linux64/build/src/gc.c:367
run_finalizers at /buildworker/worker/package_linux64/build/src/gc.c:396
ijl_atexit_hook at /buildworker/worker/package_linux64/build/src/init.c:236
jl_repl_entrypoint at /buildworker/worker/package_linux64/build/src/jlapi.c:707
main at julia-beta (unknown line)
__libc_start_main at /lib64/libc.so.6 (unknown line)
unknown function (ip: 0x400808)
WARNING: Error while freeing DeviceBuffer(512 bytes at 0x0000000a02000000):
CUDA.CuError(code=CUDA.cudaError_enum(0x000002bc), meta=nothing)

Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
  [2] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:101 [inlined]
  [3] cuMemFreeAsync(dptr::CUDA.Mem.DeviceBuffer, hStream::CuStream)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/call.jl:26
  [4] #free#2
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/memory.jl:97 [inlined]
  [5] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:58 [inlined]
  [6] macro expansion
    @ ./timing.jl:359 [inlined]
  [7] #actual_free#189
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:57 [inlined]
  [8] #_free#207
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:323 [inlined]
  [9] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:288 [inlined]
 [10] macro expansion
    @ ./timing.jl:359 [inlined]
 [11] #free#206
julia> call_kernel()
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
 [1] throw_api_error(res::CUDA.cudaError_enum)
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
 [2] isdone
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
 [3] nonblocking_synchronize
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
 [4] nonblocking_synchronize
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:325 [inlined]
 [5] device_synchronize()
   @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/context.jl:319
 [6] top-level scope
   @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:54

caused by: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
  [1] throw_api_error(res::CUDA.cudaError_enum)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/error.jl:91
  [2] isdone
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:109 [inlined]
  [3] nonblocking_synchronize
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:139 [inlined]
  [4] synchronize(stream::CuStream; blocking::Nothing)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:128
  [5] synchronize
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/cudadrv/stream.jl:122 [inlined]
  [6] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/pool.jl:186 [inlined]
  [7] macro expansion
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/error.jl:67 [inlined]
  [8] curandCreateGenerator(typ::CUDA.CURAND.curandRngType)
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/wrappers.jl:5
  [9] CUDA.CURAND.RNG(typ::CUDA.CURAND.curandRngType; stream::CuStream)
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13
 [10] RNG (repeats 2 times)
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/random.jl:13 [inlined]
 [11] #167
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:38 [inlined]
 [12] (::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})()
    @ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:24
 [13] lock(f::CUDA.APIUtils.var"#8#11"{CUDA.CURAND.var"#167#173", CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext}, l::ReentrantLock)
    @ Base ./lock.jl:185
 [14] (::CUDA.APIUtils.var"#check_cache#9"{CUDA.APIUtils.HandleCache{CuContext, CUDA.CURAND.RNG}, CuContext})(f::CUDA.CURAND.var"#167#173")
    @ CUDA.APIUtils /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:22
 [15] pop!
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/utils/cache.jl:46 [inlined]
 [16] (::CUDA.CURAND.var"#new_state#172")(cuda::NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}})
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:37
 [17] #170
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:51 [inlined]
 [18] get!(default::CUDA.CURAND.var"#170#176"{CUDA.CURAND.var"#new_state#172", NamedTuple{(:device, :context, :stream, :math_mode, :math_precision), Tuple{CuDevice, CuContext, CuStream, CUDA.MathMode, Symbol}}}, h::Dict{CuContext, NamedTuple{(:rng,), Tuple{CUDA.CURAND.RNG}}}, key::CuContext)
    @ Base ./dict.jl:481
 [19] default_rng()
    @ CUDA.CURAND /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/lib/curand/CURAND.jl:50
 [20] curand_rng
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:229 [inlined]
 [21] rand(T::Type{Float32}, dim1::Int64, dims::Int64)
    @ CUDA /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/random.jl:249
 [22] call_kernel()
    @ Main ./REPL[3]:8
 [23] top-level scope
    @ REPL[4]:1
 [24] top-level scope
    @ /scratch/pc2-mitarbeiter/bauerc/devel/CUDA_bfloat16/src/initialization.jl:52

carstenbauer avatar Mar 04 '22 15:03 carstenbauer