Polygeist
Polygeist copied to clipboard
Different implementations for increment (`i+=1` and `i++`)
Hi, we are leveraging Polygeist's frontend to generate MLIR code, but it seems to generate different code for the same semantics. The following shows an example of conv2d using `+=1` to increment.
#define bs 4
#define oc 16
#define ic 6
#define ih 8
#define iw 8
#define kh 3
#define kw 3
#define oh 6
#define ow 6
/*
 * Reproducer: direct 2-D convolution over the fixed sizes #defined above.
 *
 * For every output element it computes
 *   C[n][c][y][x] = sum over rc, rh, rw of A[n][rc][y+rh][x+rw] * B[c][rc][rh][rw]
 *
 * A is the input  [bs][ic][ih][iw],
 * B is the kernel [oc][ic][kh][kw],
 * C is the output [bs][oc][oh][ow].
 *
 * Every loop in this variant advances its induction variable with `+= 1`;
 * the whole loop nest is enclosed in `#pragma scop` / `#pragma endscop`.
 */
void test_conv2d(float A[bs][ic][ih][iw], float B[oc][ic][kh][kw], float C[bs][oc][oh][ow]) {
#pragma scop
/* Four outer loops: one iteration per output element C[n][c][y][x]. */
for (int n = 0; n < bs; n += 1) {
for (int c = 0; c < oc; c += 1) {
for (int y = 0; y < oh; y += 1) {
for (int x = 0; x < ow; x += 1) {
/* Scalar accumulator, reset for each output element. */
float sum = 0;
/* Three inner loops: reduce over input channel and kernel window. */
for (int rc = 0; rc < ic; rc += 1) {
for (int rh = 0; rh < kh; rh += 1) {
for (int rw = 0; rw < kw; rw += 1) {
sum += A[n][rc][y+rh][x+rw] * B[c][rc][rh][rw];
}}}
/* Write the finished reduction back to the output tensor. */
C[n][c][y][x] = sum;
}}}}
#pragma endscop
}
It generates the following MLIR assembly with `iter_args`.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f80, dense<128> : vector<2xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
func @test_conv2d(%arg0: memref<4x6x8x8xf32>, %arg1: memref<16x6x3x3xf32>, %arg2: memref<4x16x6x6xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
%cst = arith.constant 0.000000e+00 : f32
affine.for %arg3 = 0 to 4 {
affine.for %arg4 = 0 to 16 {
affine.for %arg5 = 0 to 6 {
affine.for %arg6 = 0 to 6 {
%0 = affine.for %arg7 = 0 to 6 iter_args(%arg8 = %cst) -> (f32) {
%1 = affine.for %arg9 = 0 to 3 iter_args(%arg10 = %arg8) -> (f32) {
%2 = affine.for %arg11 = 0 to 3 iter_args(%arg12 = %arg10) -> (f32) {
%3 = affine.load %arg0[%arg3, %arg7, %arg5 + %arg9, %arg6 + %arg11] : memref<4x6x8x8xf32>
%4 = affine.load %arg1[%arg4, %arg7, %arg9, %arg11] : memref<16x6x3x3xf32>
%5 = arith.mulf %3, %4 : f32
%6 = arith.addf %arg12, %5 : f32
affine.yield %6 : f32
}
affine.yield %2 : f32
}
affine.yield %1 : f32
}
affine.store %0, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<4x16x6x6xf32>
}
}
}
}
return
}
}
But if I change every `+=1` to `++`, it no longer uses `iter_args`.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f80, dense<128> : vector<2xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
func @test_conv2d(%arg0: memref<4x6x8x8xf32>, %arg1: memref<16x6x3x3xf32>, %arg2: memref<4x16x6x6xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = memref.alloca() : memref<1xf32>
%1 = llvm.mlir.undef : f32
affine.store %1, %0[0] : memref<1xf32>
affine.for %arg3 = 0 to 4 {
affine.for %arg4 = 0 to 16 {
affine.for %arg5 = 0 to 6 {
affine.for %arg6 = 0 to 6 {
affine.store %cst, %0[0] : memref<1xf32>
affine.for %arg7 = 0 to 6 {
affine.for %arg8 = 0 to 3 {
affine.for %arg9 = 0 to 3 {
%3 = affine.load %arg0[%arg3, %arg7, %arg5 + %arg8, %arg6 + %arg9] : memref<4x6x8x8xf32>
%4 = affine.load %arg1[%arg4, %arg7, %arg8, %arg9] : memref<16x6x3x3xf32>
%5 = arith.mulf %3, %4 : f32
%6 = affine.load %0[0] : memref<1xf32>
%7 = arith.addf %6, %5 : f32
affine.store %7, %0[0] : memref<1xf32>
}
}
}
%2 = affine.load %0[0] : memref<1xf32>
affine.store %2, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<4x16x6x6xf32>
}
}
}
}
return
}
}
So what's the rationale behind this? Why not use `step` to implement the first case?
What happens if you remove the `#pragma scop` in both cases?
Regardless, this seems to be just a question of optimization ordering (or an implementation difference in the `#pragma scop` special-case handler).
Okay, after removing `#pragma scop`, both `+=1` and `++` generate the first MLIR snippet with `iter_args`. So when should I add the pragma?