Assertion in replace pointer bitcasts
struct a {
int x;
int y;
};
kernel void foo(global struct a* struct_out, int n) {
struct a local_a;
if (n == 0) {
local_a.x = 0;
}
*struct_out = local_a;
}
LLVM translates this at some point the flow to have:
store i64 0, i64 addrspace(1)* %0, align 4
ReplacePointerBitcastPass sees the bitwidths are equal and tries to bitcast the i64 to a { i32, i32} which is invalid.
Assertion failed: (CastInst::castIsValid(opc, C, Ty) && "Invalid constantexpr cast!"), function getCast, file ../third_party/llvm/lib/IR/Constants.cpp, line 1440.
Need to use the debug build to see the assertion. The release build generates bad code. :-(
I'm experimenting with adding the ReplaceLLVMIntrinsics pass before inst-combine.
The theory is that the bitcast is injected by SPIR 1.2 initial codegen to prepare an argument to a memcpy. In this case, with zero-initialization of variables, the initial store to the function-scope variable ends up being scalarized to an i64 store of a zero value. Then replace-pointer-bitcasts can't cope with the conversion between i64 pointer and pointer to a struct.
That seems to work, but causes changes in a few tests. I'm evaluating whether it's not too annoying a difference.
Using your example program, this is just before instcombine:
*** IR Dump Before Combine redundant instructions ***
; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
%local_a = alloca %struct.a, align 4
store %struct.a zeroinitializer, %struct.a* %local_a
%cmp = icmp eq i32 %n, 0
br i1 %cmp, label %1, label %2
; <label>:1: ; preds = %0
%x = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
store i32 0, i32* %x, align 4
br label %2
; <label>:2: ; preds = %1, %0
%3 = bitcast %struct.a addrspace(1)* %struct_out to i8 addrspace(1)*
%4 = bitcast %struct.a* %local_a to i8*
call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %3, i8* %4, i32 8, i32 4, i1 false)
ret void
}
So far that looks good. The default-zeroinit of the local var is a single store.
But then instcombine does a few unhelpful things:
- it breaks up the single store to init (look for "repack" in lib/Transforms/InstCombine to see why)
- it uses a bitcast to convert the ptr-to-i64 to a ptr-to-i32 to access only the first element for the conditioned store. Ugh because that's not supported in Vulkan.
- it converts the final memcpy into a i64 load and store.
source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"
%struct.a = type { i32, i32 }
@__spirv_WorkgroupSize = addrspace(7) global <3 x i32> zeroinitializer
; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
%local_a = alloca i64, align 8
%tmpcast = bitcast i64* %local_a to %struct.a*
%local_a.repack = bitcast i64* %local_a to i32*
store i32 0, i32* %local_a.repack, align 8
%local_a.repack1 = getelementptr inbounds %struct.a, %struct.a* %tmpcast, i32 0, i32 1
store i32 0, i32* %local_a.repack1, align 4
%cmp = icmp eq i32 %n, 0
br i1 %cmp, label %1, label %2
; <label>:1: ; preds = %0
%x = bitcast i64* %local_a to i32*
store i32 0, i32* %x, align 8
br label %2
; <label>:2: ; preds = %1, %0
%3 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
%4 = load i64, i64* %local_a, align 8
store i64 %4, i64 addrspace(1)* %3, align 4
ret void
}
By the time we reach replace-pointer-bitcasts, the code is just:
*** IR Dump Before Replace Pointer Bitcast Pass ***; ModuleID = 'a.cl'
source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"
%struct.a = type { i32, i32 }
@__spirv_WorkgroupSize = local_unnamed_addr addrspace(7) global <3 x i32> zeroinitializer
; Function Attrs: norecurse nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* nocapture %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
%0 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
store i64 0, i64 addrspace(1)* %0, align 4
ret void
}
And then we hit an assert in constant generation during replace-pointer-bitcasts.
Instead, if we replace LLVM intrinsics early, then we convert the llvm memcpy to a spirv.copy_memory that instcombine doesn't recognize:
*** IR Dump Before Combine redundant instructions ***
; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
%local_a = alloca %struct.a, align 4
store %struct.a zeroinitializer, %struct.a* %local_a
%cmp = icmp eq i32 %n, 0
br i1 %cmp, label %1, label %2
; <label>:1: ; preds = %0
%x = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
store i32 0, i32* %x, align 4
br label %2
; <label>:2: ; preds = %1, %0
call void @spirv.copy_memory(%struct.a addrspace(1)* %struct_out, %struct.a* %local_a, i32 4, i32 0)
ret void
}
Instcombine still replaces the zeroinitializer store, but does not take the other two actions.
The replace pointer bitcasts pass has nothing to get confused about:
*** IR Dump Before Replace Pointer Bitcast Pass ***; ModuleID = 'a.cl'
source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"
%struct.a = type { i32, i32 }
@__spirv_WorkgroupSize = local_unnamed_addr addrspace(7) global <3 x i32> zeroinitializer
; Function Attrs: nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
%local_a = alloca %struct.a, align 8
%local_a.repack = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
store i32 0, i32* %local_a.repack, align 8
%local_a.repack1 = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 1
store i32 0, i32* %local_a.repack1, align 4
%cmp = icmp eq i32 %n, 0
br i1 %cmp, label %1, label %2
; <label>:1: ; preds = %0
store i32 0, i32* %local_a.repack, align 8
br label %2
; <label>:2: ; preds = %1, %0
call void @spirv.copy_memory(%struct.a addrspace(1)* %struct_out, %struct.a* nonnull %local_a, i32 4, i32 0) #1
ret void
}
The code survives that way until SPIR-V code gen. That's correct, but suboptimal because it doesn't see that the conditioned store is optional. Is this a problem? Maybe not.
source_filename = "a.cl"
target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "spir-unknown-unknown"
%struct.a = type { i32, i32 }
@__spirv_WorkgroupSize = local_unnamed_addr addrspace(7) global <3 x i32> zeroinitializer
; Function Attrs: norecurse nounwind
define spir_kernel void @foo(%struct.a addrspace(1)* nocapture %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
%0 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
store i64 0, i64 addrspace(1)* %0, align 4
ret void
}
This particular IR seems like it would be reasonable for the pass to unwind. Can determine the base object %struct.a addrspace(1)* %struct_out and rewrite the store as affecting the first 64 bits of that struct (the whole thing in this case). I imagine it would get trickier if the pointer bitcasts were occurring at multiple levels of the call tree (e.g. each call bitcasts to another pointer).
If I change the assignment to local_a.x to 66 from zero, I get this at replace pointer bitcasts:
define spir_kernel void @foo(%struct.a addrspace(1)* nocapture %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
%cmp = icmp eq i32 %n, 0
%0 = bitcast %struct.a addrspace(1)* %struct_out to i64 addrspace(1)*
%local_a.sroa.0.0.insert.ext = select i1 %cmp, i64 66, i64 0
store i64 %local_a.sroa.0.0.insert.ext, i64 addrspace(1)* %0, align 4
ret void
}
So there's more going on than the bitcast. There's also 64bit select that has to be rewritten somewhere. It's not pretty.
Also, with the 66, if I do replace LLVM intrinsics early, I get this IR before descriptor allocation, and this is reasonable:
define spir_kernel void @foo(%struct.a addrspace(1)* %struct_out, i32 %n) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
%local_a = alloca %struct.a, align 8
%local_a.repack = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 0
store i32 0, i32* %local_a.repack, align 8
%local_a.repack1 = getelementptr inbounds %struct.a, %struct.a* %local_a, i32 0, i32 1
store i32 0, i32* %local_a.repack1, align 4
%cmp = icmp eq i32 %n, 0
br i1 %cmp, label %1, label %2
; <label>:1: ; preds = %0
store i32 66, i32* %local_a.repack, align 8
br label %2
; <label>:2: ; preds = %1, %0
call void @spirv.copy_memory(%struct.a addrspace(1)* %struct_out, %struct.a* nonnull %local_a, i32 4, i32 0) #1
ret void
}
Looking at changes in Clspv tests if I replace LLVM intrinsics early. The code generated is noticeably worse.
E.g. test/composite_construct.cl has this happen:
The boo function is:
typedef struct { float a, b, c, d; } S;
S boo(float a) {
S result;
// This entire chain of insertions is replaced by a single
// OpCompositeConstruct
result.a = a;
result.c = a+2.0f;
result.b = a+1.0f;
result.d = a+3.0f;
return result;
}
Before SROA in the current flow (delay replace LLVM intrinsics), the IR for the boo function is:
*** IR Dump Before SROA ***
; Function Attrs: nounwind readnone
define spir_func %struct.S @boo(float) local_unnamed_addr #0 {
%retval = alloca %struct.S, align 8
%result = alloca %struct.S, align 8
%result.repack = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
store float 0.000000e+00, float* %result.repack, align 8
%result.repack1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
store float 0.000000e+00, float* %result.repack1, align 4
%result.repack2 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
store float 0.000000e+00, float* %result.repack2, align 8
%result.repack3 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
store float 0.000000e+00, float* %result.repack3, align 4
%a1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
store float %0, float* %a1, align 8
%add = fadd float %0, 2.000000e+00
%c = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
store float %add, float* %c, align 8
%add2 = fadd float %0, 1.000000e+00
%b = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
store float %add2, float* %b, align 4
%add3 = fadd float %0, 3.000000e+00
%d = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
store float %add3, float* %d, align 4
%2 = bitcast %struct.S* %retval to i8*
%3 = bitcast %struct.S* %result to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %2, i8* %3, i32 16, i32 8, i1 false)
%.elt = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 0
%.unpack = load float, float* %.elt, align 8
%4 = insertvalue %struct.S undef, float %.unpack, 0
%.elt4 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 1
%.unpack5 = load float, float* %.elt4, align 4
%5 = insertvalue %struct.S %4, float %.unpack5, 1
%.elt6 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 2
%.unpack7 = load float, float* %.elt6, align 8
%6 = insertvalue %struct.S %5, float %.unpack7, 2
%.elt8 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 3
%.unpack9 = load float, float* %.elt8, align 4
%7 = insertvalue %struct.S %6, float %.unpack9, 3
ret %struct.S %7
}
After SROA it's this:
*** IR Dump Before Early CSE w/ MemorySSA ***
; Function Attrs: nounwind readnone
define spir_func %struct.S @boo(float) local_unnamed_addr #0 {
%add = fadd float %0, 2.000000e+00
%add2 = fadd float %0, 1.000000e+00
%add3 = fadd float %0, 3.000000e+00
%2 = insertvalue %struct.S undef, float %0, 0
%3 = insertvalue %struct.S %2, float %add2, 1
%4 = insertvalue %struct.S %3, float %add, 2
%5 = insertvalue %struct.S %4, float %add3, 3
ret %struct.S %5
}
But if I've replaced the memcpy with the spirv intrinsic, then it does not collapse at all, and we get extra variables lying around for no good reason:
*** IR Dump Before Early CSE w/ MemorySSA ***
; Function Attrs: nounwind
define spir_func %struct.S @boo(float) local_unnamed_addr #0 {
%retval = alloca %struct.S, align 8
%result = alloca %struct.S, align 8
%result.repack = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
store float 0.000000e+00, float* %result.repack, align 8
%result.repack1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
store float 0.000000e+00, float* %result.repack1, align 4
%result.repack2 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
store float 0.000000e+00, float* %result.repack2, align 8
%result.repack3 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
store float 0.000000e+00, float* %result.repack3, align 4
%a1 = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 0
store float %0, float* %a1, align 8
%add = fadd float %0, 2.000000e+00
%c = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 2
store float %add, float* %c, align 8
%add2 = fadd float %0, 1.000000e+00
%b = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 1
store float %add2, float* %b, align 4
%add3 = fadd float %0, 3.000000e+00
%d = getelementptr inbounds %struct.S, %struct.S* %result, i32 0, i32 3
store float %add3, float* %d, align 4
call void @spirv.copy_memory(%struct.S* nonnull %retval, %struct.S* nonnull %result, i32 4, i32 0) #1
%.elt = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 0
%.unpack = load float, float* %.elt, align 8
%2 = insertvalue %struct.S undef, float %.unpack, 0
%.elt4 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 1
%.unpack5 = load float, float* %.elt4, align 4
%3 = insertvalue %struct.S %2, float %.unpack5, 1
%.elt6 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 2
%.unpack7 = load float, float* %.elt6, align 8
%4 = insertvalue %struct.S %3, float %.unpack7, 2
%.elt8 = getelementptr inbounds %struct.S, %struct.S* %retval, i32 0, i32 3
%.unpack9 = load float, float* %.elt8, align 4
%5 = insertvalue %struct.S %4, float %.unpack9, 3
ret %struct.S %5
}
And that results in extra code in the boo function. Before:
%29 = OpFunction %_struct_2 Const %12
%30 = OpFunctionParameter %float
%31 = OpLabel
%32 = OpFAdd %float %30 %float_2
%33 = OpFAdd %float %30 %float_1
%34 = OpFAdd %float %30 %float_3
%35 = OpCompositeConstruct %_struct_2 %30 %33 %32 %34
OpReturnValue %35
OpFunctionEnd
After:
%33 = OpFunction %_struct_2 None %15
%34 = OpFunctionParameter %float
%35 = OpLabel
%36 = OpVariable %_ptr_Function__struct_2 Function
%37 = OpVariable %_ptr_Function__struct_2 Function
%38 = OpAccessChain %_ptr_Function_float %37 %uint_0
%39 = OpAccessChain %_ptr_Function_float %37 %uint_1
%40 = OpAccessChain %_ptr_Function_float %37 %uint_2
%41 = OpAccessChain %_ptr_Function_float %37 %uint_3
OpStore %38 %34
%42 = OpFAdd %float %34 %float_2
OpStore %40 %42
%43 = OpFAdd %float %34 %float_1
OpStore %39 %43
%44 = OpFAdd %float %34 %float_3
OpStore %41 %44
OpCopyMemory %36 %37 Aligned 4
%45 = OpAccessChain %_ptr_Function_float %36 %uint_0
%46 = OpLoad %float %45
%47 = OpAccessChain %_ptr_Function_float %36 %uint_1
%48 = OpLoad %float %47
%49 = OpAccessChain %_ptr_Function_float %36 %uint_2
%50 = OpLoad %float %49
%51 = OpAccessChain %_ptr_Function_float %36 %uint_3
%52 = OpLoad %float %51
%53 = OpCompositeConstruct %_struct_2 %46 %48 %50 %52
OpReturnValue %53
OpFunctionEnd
I'm disliking this approach.
This test is now passing