KernelAbstractions.jl
KernelAbstractions.jl copied to clipboard
Illegal memory access was encountered
The following code
using KernelAbstractions
using CuArrays
# Parametric struct passed as the first kernel argument in this reproducer.
# The field layout is part of the reproducer and should not be changed: per the
# observations listed in this report, it matters that `dummy2` is an `Int`.
# Only the field `x` is read by the kernels shown in this report.
struct S{FT}
dummy1::FT
dummy2::Int
x::FT
end
# KernelAbstractions kernel used to reproduce the illegal memory access.
# It evaluates `sin(s.x)` (the result is discarded) and copies `b[1]` into `a[1]`.
# `dummy3` and `dummy4` are never used in the body; they are kept because the
# report notes that the argument order and the type of `dummy4` affect the failure.
@kernel function kernel!(s::S, a, b, dummy3, dummy4)
sin(s.x)
@inbounds a[1] = b[1]
end
# Driver for the reproducer: builds the inputs, launches `kernel!` and waits on
# the returned event.
let
# The failure is reported with Float32; the report notes Float64 works.
FT = Float32
# One-element device arrays: `a` starts as zeros, `b` as ones.
a = CuArray(zeros(FT, 1))
b = CuArray(ones(FT, 1))
s = S{FT}(FT(1), 1, FT(1))
device = CUDA()
# `dummy3` is passed as an `FT` value and `dummy4` as a `Bool`; the report
# notes that `dummy4` being a `Bool` matters.
event = kernel!(device, 1, 1)(s, a, b, FT(0), true)
wait(event)
end
fails with CUDA error: an illegal memory access was encountered
on Julia 1.3.1 (seems to work fine on 1.4.1). Further observations that may or may not be helpful when debugging:
- it works with
FT = Float64
- it matters that
dummy2
is an Int
and dummy4
is a Bool
- the argument order of
kernel!
is important: if the dummies come before a
and b,
it also works
Thanks, managed to reproduce!
I hope this is related to the illegal memory access I have yet to minimize to a small example.
So it is really weird:
KernelAbstractions:
ld.param.u64 %rd10, [_Z17julia_gpu_kernel_7ContextI14__CUDACtx_Name16CompilerMetadataI10StaticSizeI4_1__E12DynamicCheckvv7NDRangeILi1E10StaticSizeI4_1__E10StaticSizeI4_1__EvvEEv14__PassType_422v12DisableHooksE12_gpu_kernel_1SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool_param_2+8];
and.b64 %rd3, %rd10, 1099511627775;
cvta.to.global.u64 %rd17, %rd3;
ld.global.f32 %f6, [%rd17];
CUDAnative pure
ld.param.u64 %rd3, [_Z22julia_cn_kernel__183791SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool_param_2+8];
cvta.to.global.u64 %rd16, %rd3;
ld.global.f32 %f6, [%rd16];
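I don't understand why we are doing: ptr & 0x000000ffffffffff (the `and.b64 ..., 1099511627775` in the KernelAbstractions PTX above, which the pure CUDAnative PTX does not have).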
The LLVM IR is:
KernelAbstractions:
define void @_Z17julia_gpu_kernel_7ContextI14__CUDACtx_Name16CompilerMetadataI10StaticSizeI4_1__E12DynamicCheckvv7NDRangeILi1E10StaticSizeI4_1__E10StaticSizeI4_1__EvvEEv14__PassType_422v12DisableHooksE12_gpu_kernel_1SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool({ float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8) local_unnamed_addr {
%.fca.1.extract = extractvalue { [1 x i64], i64 } %2, 1
%.sroa.9.0..sroa_idx32 = getelementptr inbounds { { float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8 }, { { float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8 }* %"##overdub_arguments#423.i", i64 0, i32 2, i32 1
store i64 %.fca.1.extract, i64* %.sroa.9.0..sroa_idx32, align
%115 = bitcast i64* %.sroa.9.0..sroa_idx32 to float**, !dbg !77
%116 = load float*, float** %115, align 8, !dbg !77, !tbaa !98
%117 = addrspacecast float* %116 to float addrspace(1)*, !dbg !77
%118 = load float, float addrspace(1)* %117, align 4, !dbg !77, !tbaa !102
CUDAnative
define void @_Z22julia_cn_kernel__183691SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool({ float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8) local_unnamed_addr {
%.fca.1.extract = extractvalue { [1 x i64], i64 } %2, 1
%111 = inttoptr i64 %.fca.1.extract to float*, !dbg !22
%112 = addrspacecast float* %111 to float addrspace(1)*, !dbg !22
%113 = load float, float addrspace(1)* %112, align 4, !dbg !22, !tbaa !38
# KernelAbstractions version of the kernel (same definition as in the original
# reproducer), listed here next to its plain CUDAnative counterpart.
# Evaluates `sin(s.x)` (result discarded) and copies `b[1]` into `a[1]`;
# `dummy3` and `dummy4` are unused in the body.
@kernel function kernel!(s::S, a, b, dummy3, dummy4)
sin(s.x)
@inbounds a[1] = b[1]
end
# Plain CUDAnative counterpart of `kernel!`, used as the comparison baseline.
# Same arguments and same body shape, except that it calls `CUDAnative.sin`
# explicitly and returns `nothing`. `dummy3` and `dummy4` are unused in the body.
function cn_kernel!(s::S, a, b, dummy3, dummy4)
CUDAnative.sin(s.x)
@inbounds a[1] = b[1]
return nothing
end
It's a bit annoying that the overdub_arguments
nonsense is still in the optimized code, but nonetheless zeroing out the upper 24 bits of the pointer (the mask 0x000000ffffffffff = 2^40 - 1 keeps only the low 40 bits) seems wrong...