clspv Assertion in replace pointer bitcasts

struct a {
  int x;
  int y;
};

kernel void foo(global struct a* struct_out, int n) {
  struct a local_a;
  if (n == 0) {
    local_a.x = 0;
  }
  *struct_out = local_a;
}

LLVM translates this at some point the flow to have:

store i64 0, i64 addrspace(1)* %0, align 4

ReplacePointerBitcastPass sees the bitwidths are equal and tries to bitcast the i64 to a { i32, i32} which is invalid.

Assertion failed: (CastInst::castIsValid(opc, C, Ty) && "Invalid constantexpr cast!"), function getCast, file ../third_party/llvm/lib/IR/Constants.cpp, line 1440.

Aug 28 '18 21:08 alan-baker

Need to use the debug build to see the assertion. The release build generates bad code. :-(

Aug 31 '18 14:08 dneto0

I'm experimenting with adding the ReplaceLLVMIntrinsics pass before inst-combine.

The theory is that the bitcast is injected by SPIR 1.2 initial codegen to prepare an argument to a memcpy. In this case, with zero-initialization of variables, the initial store to the function-scope variable ends up being scalarized to an i64 store of a zero value. Then replace-pointer-bitcasts can't cope with the conversion between i64 pointer and pointer to a struct.

That seems to work, but causes changes in a few tests. I'm evaluating whether it's not too annoying a difference.

Aug 31 '18 16:08 dneto0

Using your example program, this is just before instcombine:

*** IR Dump Before Combine redundant instructions ***
; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
  %local_a = alloca %struct.a, align 4
  store %struct.a zeroinitializer, %struct.a* %local_a
  %cmp = icmp eq i32 %n, 0
  br i1 %cmp, label %1, label %2

; <label>:1:                                      ; preds = %0
  %x = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
  store i32 0, i32* %x, align 4
  br label %2

; <label>:2:                                      ; preds = %1, %0
  %3 = bitcast %struct.a addrspace(1)* %struct_out to i8 addrspace(1)*
  %4 = bitcast %struct.a* %local_a to i8*
  call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %3, i8* %4, i32 8, i32 4, i1 false)
  ret void
}

So far that looks good. The default-zeroinit of the local var is a single store.

But then instcombine does a few unhelpful things:

it breaks up the single store to init (look for "repack" in lib/Transforms/InstCombine to see why)
it uses a bitcast to convert the ptr-to-i64 to a ptr-to-i32 to access only the first element for the conditioned store. Ugh because that's not supported in Vulkan.
it converts the final memcpy into a i64 load and store.

source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"

%struct.a = type { i32, i32 }

@__spirv_WorkgroupSize = addrspace(7) global <3 x i32> zeroinitializer

; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
  %local_a = alloca i64, align 8
  %tmpcast = bitcast i64* %local_a to %struct.a*
  %local_a.repack = bitcast i64* %local_a to i32*
  store i32 0, i32* %local_a.repack, align 8
  %local_a.repack1 = getelementptr inbounds %struct.a, %struct.a* %tmpcast, i32 0, i32 1
  store i32 0, i32* %local_a.repack1, align 4
  %cmp = icmp eq i32 %n, 0
  br i1 %cmp, label %1, label %2

; <label>:1:                                      ; preds = %0
  %x = bitcast i64* %local_a to i32*
  store i32 0, i32* %x, align 8
  br label %2

; <label>:2:                                      ; preds = %1, %0
  %3 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
  %4 = load i64, i64* %local_a, align 8
  store i64 %4, i64 addrspace(1)* %3, align 4
  ret void
}

By the time we reach replace-pointer-bitcasts, the code is just:

*** IR Dump Before Replace Pointer Bitcast Pass ***; ModuleID = 'a.cl'
source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"

%struct.a = type { i32, i32 }

@__spirv_WorkgroupSize = local_unnamed_addr addrspace(7) global <3 x i32> zeroinitializer

; Function Attrs: norecurse nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* nocapture %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
  %0 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
  store i64 0, i64 addrspace(1)* %0, align 4
  ret void
}

And then we hit an assert in constant generation during replace-pointer-bitcasts.

Aug 31 '18 16:08 dneto0

Instead, if we replace LLVM intrinsics early, then we convert the llvm memcpy to a spirv.copy_memory that instcombine doesn't recognize:

*** IR Dump Before Combine redundant instructions ***
; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
  %local_a = alloca %struct.a, align 4
  store %struct.a zeroinitializer, %struct.a* %local_a
  %cmp = icmp eq i32 %n, 0
  br i1 %cmp, label %1, label %2

; <label>:1:                                      ; preds = %0
  %x = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
  store i32 0, i32* %x, align 4
  br label %2

; <label>:2:                                      ; preds = %1, %0
  call void @spirv.copy_memory(%struct.a addrspace(1)* %struct_out, %struct.a* %local_a, i32 4, i32 0)
  ret void
}

Instcombine still replaces the zeroinitializer store, but does not take the other two actions.

The replace pointer bitcasts pass has nothing to get confused about:

*** IR Dump Before Replace Pointer Bitcast Pass ***; ModuleID = 'a.cl'
source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"

%struct.a = type { i32, i32 }

@__spirv_WorkgroupSize = local_unnamed_addr addrspace(7) global <3 x i32> zeroinitializer

; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
  %local_a = alloca %struct.a, align 8
  %local_a.repack = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
  store i32 0, i32* %local_a.repack, align 8
  %local_a.repack1 = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 1
  store i32 0, i32* %local_a.repack1, align 4
  %cmp = icmp eq i32 %n, 0
  br i1 %cmp, label %1, label %2

; <label>:1:                                      ; preds = %0
  store i32 0, i32* %local_a.repack, align 8
  br label %2

; <label>:2:                                      ; preds = %1, %0
  call void @spirv.copy_memory(%struct.a addrspace(1)* %struct_out, %struct.a* nonnull %local_a, i32 4, i32 0) #1
  ret void
}

The code survives that way until SPIR-V code gen. That's correct, but suboptimal because it doesn't see that the conditioned store is optional. Is this a problem? Maybe not.

Aug 31 '18 16:08 dneto0

source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"

%struct.a = type { i32, i32 }

@__spirv_WorkgroupSize = local_unnamed_addr addrspace(7) global <3 x i32> zeroinitializer

; Function Attrs: norecurse nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* nocapture %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
  %0 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
  store i64 0, i64 addrspace(1)* %0, align 4
  ret void
}

This particular IR seems like it would be reasonable for the pass to unwind. Can determine the base object %struct.a addrspace(1)* %struct_out and rewrite the store as affecting the first 64 bits of that struct (the whole thing in this case). I imagine it would get trickier if the pointer bitcasts were occurring at multiple levels of the call tree (e.g. each call bitcasts to another pointer).

Aug 31 '18 16:08 alan-baker

If I change the assignment to local_a.x to 66 from zero, I get this at replace pointer bitcasts:

define spir_kernel void @foo(%struct.a addrspace(1)* nocapture %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
  %cmp = icmp eq i32 %n, 0
  %0 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
  %local_a.sroa.0.0.insert.ext = select i1 %cmp, i64 66, i64 0
  store i64 %local_a.sroa.0.0.insert.ext, i64 addrspace(1)* %0, align 4
  ret void
}

So there's more going on than the bitcast. There's also 64bit select that has to be rewritten somewhere. It's not pretty.

Aug 31 '18 18:08 dneto0

Also, with the 66, if I do replace LLVM intrinsics early, I get this IR before descriptor allocation, and this is reasonable:

define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
  %local_a = alloca %struct.a, align 8
  %local_a.repack = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
  store i32 0, i32* %local_a.repack, align 8
  %local_a.repack1 = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 1
  store i32 0, i32* %local_a.repack1, align 4
  %cmp = icmp eq i32 %n, 0
  br i1 %cmp, label %1, label %2

; <label>:1:                                      ; preds = %0
  store i32 66, i32* %local_a.repack, align 8
  br label %2

; <label>:2:                                      ; preds = %1, %0
  call void @spirv.copy_memory(%struct.a addrspace(1)* %struct_out, %struct.a* nonnull %local_a, i32 4, i32 0) #1
  ret void
}

Aug 31 '18 18:08 dneto0

Looking at changes in Clspv tests if I replace LLVM intrinsics early. The code generated is noticeably worse.

E.g. test/composite_construct.cl has this happen:

The boo function is:

 typedef struct { float a, b, c, d; } S;

S boo(float a) {
  S result;
  // This entire chain of insertions is replaced by a single 
  // OpCompositeConstruct
  result.a = a;
  result.c = a+2.0f;
  result.b = a+1.0f;
  result.d = a+3.0f;
  return result;
}

Before SROA in the current flow (delay replace LLVM intrinsics), the IR for the boo function is:

*** IR Dump Before SROA ***
; Function Attrs: nounwind readnone
define spir_func %struct.S @boo(float) local_unnamed_addr #0 {
  %retval = alloca %struct.S, align 8
  %result = alloca %struct.S, align 8
  %result.repack = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
  store float 0.000000e+00, float* %result.repack, align 8
  %result.repack1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
  store float 0.000000e+00, float* %result.repack1, align 4
  %result.repack2 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
  store float 0.000000e+00, float* %result.repack2, align 8
  %result.repack3 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
  store float 0.000000e+00, float* %result.repack3, align 4
  %a1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
  store float %0, float* %a1, align 8
  %add = fadd float %0, 2.000000e+00
  %c = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
  store float %add, float* %c, align 8
  %add2 = fadd float %0, 1.000000e+00
  %b = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
  store float %add2, float* %b, align 4
  %add3 = fadd float %0, 3.000000e+00
  %d = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
  store float %add3, float* %d, align 4
  %2 = bitcast %struct.S* %retval to i8*
  %3 = bitcast %struct.S* %result to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %2, i8* %3, i32 16, i32 8, i1 false)
  %.elt = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 0
  %.unpack = load float, float* %.elt, align 8
  %4 = insertvalue %struct.S undef, float %.unpack, 0
  %.elt4 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 1
  %.unpack5 = load float, float* %.elt4, align 4
  %5 = insertvalue %struct.S %4, float %.unpack5, 1
  %.elt6 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 2
  %.unpack7 = load float, float* %.elt6, align 8
  %6 = insertvalue %struct.S %5, float %.unpack7, 2
  %.elt8 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 3
  %.unpack9 = load float, float* %.elt8, align 4
  %7 = insertvalue %struct.S %6, float %.unpack9, 3
  ret %struct.S %7
}

After SROA it's this:

*** IR Dump Before Early CSE w/ MemorySSA ***
; Function Attrs: nounwind readnone
define spir_func %struct.S @boo(float) local_unnamed_addr #0 {
  %add = fadd float %0, 2.000000e+00
  %add2 = fadd float %0, 1.000000e+00
  %add3 = fadd float %0, 3.000000e+00
  %2 = insertvalue %struct.S undef, float %0, 0
  %3 = insertvalue %struct.S %2, float %add2, 1
  %4 = insertvalue %struct.S %3, float %add, 2
  %5 = insertvalue %struct.S %4, float %add3, 3
  ret %struct.S %5
}

But if I've replaced the memcpy with the spirv intrinsic, then it does not collapse at all, and we get extra variables lying around for no good reason:

*** IR Dump Before Early CSE w/ MemorySSA ***
; Function Attrs: nounwind
define spir_func %struct.S @boo(float) local_unnamed_addr #0 {
  %retval = alloca %struct.S, align 8 
  %result = alloca %struct.S, align 8
  %result.repack = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
  store float 0.000000e+00, float* %result.repack, align 8
  %result.repack1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
  store float 0.000000e+00, float* %result.repack1, align 4
  %result.repack2 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
  store float 0.000000e+00, float* %result.repack2, align 8
  %result.repack3 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
  store float 0.000000e+00, float* %result.repack3, align 4
  %a1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
  store float %0, float* %a1, align 8
  %add = fadd float %0, 2.000000e+00
  %c = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
  store float %add, float* %c, align 8
  %add2 = fadd float %0, 1.000000e+00
  %b = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
  store float %add2, float* %b, align 4
  %add3 = fadd float %0, 3.000000e+00
  %d = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
  store float %add3, float* %d, align 4
  call void @spirv.copy_memory(%struct.S* nonnull %retval, %struct.S* nonnull %result, i32 4, i32 0) #1
  %.elt = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 0
  %.unpack = load float, float* %.elt, align 8
  %2 = insertvalue %struct.S undef, float %.unpack, 0
  %.elt4 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 1
  %.unpack5 = load float, float* %.elt4, align 4
  %3 = insertvalue %struct.S %2, float %.unpack5, 1
  %.elt6 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 2
  %.unpack7 = load float, float* %.elt6, align 8
  %4 = insertvalue %struct.S %3, float %.unpack7, 2
  %.elt8 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 3
  %.unpack9 = load float, float* %.elt8, align 4
  %5 = insertvalue %struct.S %4, float %.unpack9, 3
  ret %struct.S %5
}

And that results in extra code in the boo function. Before:

         %29 = OpFunction %_struct_2 Const %12
         %30 = OpFunctionParameter %float
         %31 = OpLabel
         %32 = OpFAdd %float %30 %float_2
         %33 = OpFAdd %float %30 %float_1
         %34 = OpFAdd %float %30 %float_3
         %35 = OpCompositeConstruct %_struct_2 %30 %33 %32 %34
               OpReturnValue %35
               OpFunctionEnd

After:

         %33 = OpFunction %_struct_2 None %15
         %34 = OpFunctionParameter %float
         %35 = OpLabel
         %36 = OpVariable %_ptr_Function__struct_2 Function
         %37 = OpVariable %_ptr_Function__struct_2 Function
         %38 = OpAccessChain %_ptr_Function_float %37 %uint_0
         %39 = OpAccessChain %_ptr_Function_float %37 %uint_1
         %40 = OpAccessChain %_ptr_Function_float %37 %uint_2
         %41 = OpAccessChain %_ptr_Function_float %37 %uint_3
               OpStore %38 %34
         %42 = OpFAdd %float %34 %float_2
               OpStore %40 %42
         %43 = OpFAdd %float %34 %float_1
               OpStore %39 %43
         %44 = OpFAdd %float %34 %float_3
               OpStore %41 %44
               OpCopyMemory %36 %37 Aligned 4
         %45 = OpAccessChain %_ptr_Function_float %36 %uint_0
         %46 = OpLoad %float %45
         %47 = OpAccessChain %_ptr_Function_float %36 %uint_1
         %48 = OpLoad %float %47
         %49 = OpAccessChain %_ptr_Function_float %36 %uint_2
         %50 = OpLoad %float %49
         %51 = OpAccessChain %_ptr_Function_float %36 %uint_3
         %52 = OpLoad %float %51
         %53 = OpCompositeConstruct %_struct_2 %46 %48 %50 %52
               OpReturnValue %53
               OpFunctionEnd

I'm disliking this approach.

Aug 31 '18 18:08 dneto0

This test is now passing

Mar 29 '23 09:03 rjodinchr