numba-dpex icon indicating copy to clipboard operation
numba-dpex copied to clipboard

Numba-dpex adds extra indexing validation instructions inside kernels

Open diptorupd opened this issue 1 year ago • 0 comments

Numba-dpex generates the following LLVM IR for a one-dimensional vector addition kernel:

; Device function for the element-wise addition kernel: each invocation loads
; one i64 from %arg.a.4 and one from %arg.b.4, adds them, and stores the sum
; through %arg._6binary__add_2.4, all indexed by the dimension-0 global ID.
; NOTE(review): the %arg.<name>.5.0 arguments are presumably each array's
; dimension-0 extent (Numba array-struct "shape" field) -- confirm against the
; numba-dpex argument-unpacking code.
define internal spir_func i32 @dpex_py_devfn__3C__(
     i8** nocapture %.ret, i8* %arg.a.0, i8* %arg.a.1, i64 %arg.a.2, i64 %arg.a.3, i64 addrspace(1)* %arg.a.4, i64 %arg.a.5.0, i64 %arg.a.6.0, 
     i64 %arg.a__size0_1, 
     i8* %arg.b.0, i8* %arg.b.1, i64 %arg.b.2, i64 %arg.b.3, i64 addrspace(1)* %arg.b.4, i64 %arg.b.5.0, i64 %arg.b.6.0, 
     i8* %arg._6binary__add_2.0, i8* %arg._6binary__add_2.1, i64 %arg._6binary__add_2.2, i64 %arg._6binary__add_2.3, i64 addrspace(1)* %arg._6binary__add_2.4, i64 %arg._6binary__add_2.5.0, i64 %arg._6binary__add_2.6.0) #0 {
entry:
  ; Global ID in dimension 0 (_Z13get_global_idj is the mangled name of
  ; get_global_id(unsigned int)); used as the index for all three accesses.
  %.114 = tail call spir_func i64 @_Z13get_global_idj(i32 0)
  ; The check this issue is about: is the index negative when read as a signed
  ; i64? The result feeds the three select instructions below.
  %.136 = icmp slt i64 %.114, 0
  ; Index fix-up for `a`: add %arg.a.5.0 to the index when it is negative,
  ; otherwise add 0. Presumably Python-style negative-index wraparound.
  %.137 = select i1 %.136, i64 %arg.a.5.0, i64 0
  %.138 = add i64 %.137, %.114
  ; Load a[index] from address space 1.
  %.151 = getelementptr i64, i64 addrspace(1)* %arg.a.4, i64 %.138
  %.152 = load i64, i64 addrspace(1)* %.151, align 8
  ; Same fix-up and load for `b`, reusing the comparison result %.136.
  %.173 = select i1 %.136, i64 %arg.b.5.0, i64 0
  %.174 = add i64 %.173, %.114
  %.187 = getelementptr i64, i64 addrspace(1)* %arg.b.4, i64 %.174
  %.188 = load i64, i64 addrspace(1)* %.187, align 8
  ; The actual kernel work: b[index] + a[index], with the no-signed-wrap flag.
  %.196 = add nsw i64 %.188, %.152
  ; Same fix-up for the output array, then store the sum.
  %.220 = select i1 %.136, i64 %arg._6binary__add_2.5.0, i64 0
  %.221 = add i64 %.220, %.114
  %.234 = getelementptr i64, i64 addrspace(1)* %arg._6binary__add_2.4, i64 %.221
  store i64 %.196, i64 addrspace(1)* %.234, align 8
  ; Write a null result through the %.ret out-parameter and return 0.
  ; NOTE(review): looks like Numba's "return None, status OK" convention -- confirm.
  store i8* null, i8** %.ret, align 8
  ret i32 0
}

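The instruction %.136 = icmp slt i64 %.114, 0 checks whether the result of get_global_id is < 0. This check, together with the select/add pairs that depend on it (one pair each for a, b, and the output array), is superfluous because a work-item's global ID is never negative. We should investigate why Numba generates these checks and how they can be disabled.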

diptorupd avatar Feb 26 '23 23:02 diptorupd