KernelAbstractions.jl Illegal memory access was encountered

The following code

using KernelAbstractions
using CuArrays

# Argument struct passed to the kernel. The field layout is part of the
# reproducer: per the report, the failure depends on `dummy2` being an `Int`.
struct S{FT}
  dummy1::FT   # never read by the kernel
  dummy2::Int  # never read by the kernel
  x::FT        # the only field the kernel reads (`sin(s.x)`)
end

# Minimal failing kernel: evaluates `sin(s.x)` (result discarded) and copies
# `b[1]` into `a[1]`. `dummy3` and `dummy4` are not used in the body; per the
# report, their position after `a` and `b` matters for triggering the failure.
@kernel function kernel!(s::S, a, b, dummy3, dummy4)
  sin(s.x)
  @inbounds a[1] = b[1]
end

# Driver: builds two one-element `CuArray`s and an `S{FT}`, launches `kernel!`
# and waits on the returned event.
let
  FT = Float32  # per the report, FT = Float64 does not trigger the failure

  a = CuArray(zeros(FT, 1))
  b = CuArray(ones(FT, 1))

  s = S{FT}(FT(1), 1, FT(1))

  device = CUDA()
  # dummy3 is passed as FT(0) and dummy4 as `true` (a Bool).
  event = kernel!(device, 1, 1)(s, a, b, FT(0), true)
  wait(event)
end

fails with "CUDA error: an illegal memory access was encountered" on Julia 1.3.1 (it seems to work fine on 1.4.1). Further observations that may or may not be helpful when debugging:

it works with FT = Float64
it matters that dummy2 is an Int and dummy4 is a Bool
the argument order of kernel! is important: if the dummies come before a and b, it also works

May 01 '20 09:05 mwarusz

Thanks, managed to reproduce!

May 04 '20 17:05 vchuravy

I hope this is related to the illegal memory access I have yet to minimize to a small example.

May 04 '20 18:05 lcw

So it is really weird:

KernelAbstractions:

	ld.param.u64 	%rd10, [_Z17julia_gpu_kernel_7ContextI14__CUDACtx_Name16CompilerMetadataI10StaticSizeI4_1__E12DynamicCheckvv7NDRangeILi1E10StaticSizeI4_1__E10StaticSizeI4_1__EvvEEv14__PassType_422v12DisableHooksE12_gpu_kernel_1SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool_param_2+8];
and.b64  	%rd3, %rd10, 1099511627775;
	cvta.to.global.u64 	%rd17, %rd3;
	ld.global.f32 	%f6, [%rd17];

CUDAnative pure

	ld.param.u64 	%rd3, [_Z22julia_cn_kernel__183791SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool_param_2+8];
	cvta.to.global.u64 	%rd16, %rd3;
	ld.global.f32 	%f6, [%rd16];

I don't understand why we are doing: ptr & 0x000000ffffffffff.

The LLVM IR is:

KernelAbstractions:

define void @_Z17julia_gpu_kernel_7ContextI14__CUDACtx_Name16CompilerMetadataI10StaticSizeI4_1__E12DynamicCheckvv7NDRangeILi1E10StaticSizeI4_1__E10StaticSizeI4_1__EvvEEv14__PassType_422v12DisableHooksE12_gpu_kernel_1SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool({ float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8) local_unnamed_addr {

           %.fca.1.extract = extractvalue { [1 x i64], i64 } %2, 1
           %.sroa.9.0..sroa_idx32 = getelementptr inbounds { { float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8 }, { { float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8 }* %"##overdub_arguments#423.i", i64 0, i32 2, i32 1
            store i64 %.fca.1.extract, i64* %.sroa.9.0..sroa_idx32, align 
            %115 = bitcast i64* %.sroa.9.0..sroa_idx32 to float**, !dbg !77
            %116 = load float*, float** %115, align 8, !dbg !77, !tbaa !98
            %117 = addrspacecast float* %116 to float addrspace(1)*, !dbg !77
            %118 = load float, float addrspace(1)* %117, align 4, !dbg !77, !tbaa !102

CUDAnative

define void @_Z22julia_cn_kernel__183691SI7Float32E13CuDeviceArrayI7Float32Li1E6GlobalE13CuDeviceArrayI7Float32Li1E6GlobalE7Float324Bool({ float, i64, float }, { [1 x i64], i64 }, { [1 x i64], i64 }, float, i8) local_unnamed_addr {
      %.fca.1.extract = extractvalue { [1 x i64], i64 } %2, 1
       %111 = inttoptr i64 %.fca.1.extract to float*, !dbg !22
       %112 = addrspacecast float* %111 to float addrspace(1)*, !dbg !22
       %113 = load float, float addrspace(1)* %112, align 4, !dbg !22, !tbaa !38

# Kernel defined through the `@kernel` macro: evaluates `sin(s.x)` (result
# discarded) and copies `b[1]` into `a[1]`; `dummy3`/`dummy4` are unused.
@kernel function kernel!(s::S, a, b, dummy3, dummy4)
  sin(s.x)
  @inbounds a[1] = b[1]
end


# Plain-function counterpart used for comparison: same signature and array
# copy, but calls `CUDAnative.sin` and returns `nothing` explicitly.
function cn_kernel!(s::S, a, b, dummy3, dummy4)
  CUDAnative.sin(s.x)
  @inbounds a[1] = b[1]
  return nothing
end

It's a bit annoying that the overdub_arguments nonsense is still in the optimized code, but nonetheless zeroing out the upper 24 bits of the pointer (the mask 0x000000ffffffffff keeps only the low 40 bits) seems wrong...

May 06 '20 21:05 vchuravy