
Different implementations for increment (`i+=1` and `i++`)

Open · chhzh123 opened this issue · 2 comments

Hi, we are leveraging Polygeist's frontend to generate MLIR code, but it seems to generate different code for the same semantics. Below is a conv2d example that uses `+= 1` to increment the loop variables.

#define bs 4
#define oc 16
#define ic 6
#define ih 8
#define iw 8
#define kh 3
#define kw 3
#define oh 6
#define ow 6

void test_conv2d(float A[bs][ic][ih][iw], float B[oc][ic][kh][kw], float C[bs][oc][oh][ow]) {
#pragma scop
  for (int n = 0; n < bs; n += 1) {
    for (int c = 0; c < oc; c += 1) {
      for (int y = 0; y < oh; y += 1) {
        for (int x = 0; x < ow; x += 1) {
          float sum = 0;
          for (int rc = 0; rc < ic; rc += 1) {
            for (int rh = 0; rh < kh; rh += 1) {
              for (int rw = 0; rw < kw; rw += 1) {
                sum += A[n][rc][y+rh][x+rw] * B[c][rc][rh][rw];
          }}}
          C[n][c][y][x] = sum;
}}}}
#pragma endscop
}

It generates the following MLIR assembly, carrying the accumulator through `iter_args`.

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f80, dense<128> : vector<2xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  func @test_conv2d(%arg0: memref<4x6x8x8xf32>, %arg1: memref<16x6x3x3xf32>, %arg2: memref<4x16x6x6xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
    %cst = arith.constant 0.000000e+00 : f32
    affine.for %arg3 = 0 to 4 {
      affine.for %arg4 = 0 to 16 {
        affine.for %arg5 = 0 to 6 {
          affine.for %arg6 = 0 to 6 {
            %0 = affine.for %arg7 = 0 to 6 iter_args(%arg8 = %cst) -> (f32) {
              %1 = affine.for %arg9 = 0 to 3 iter_args(%arg10 = %arg8) -> (f32) {
                %2 = affine.for %arg11 = 0 to 3 iter_args(%arg12 = %arg10) -> (f32) {
                  %3 = affine.load %arg0[%arg3, %arg7, %arg5 + %arg9, %arg6 + %arg11] : memref<4x6x8x8xf32>
                  %4 = affine.load %arg1[%arg4, %arg7, %arg9, %arg11] : memref<16x6x3x3xf32>
                  %5 = arith.mulf %3, %4 : f32
                  %6 = arith.addf %arg12, %5 : f32
                  affine.yield %6 : f32
                }
                affine.yield %2 : f32
              }
              affine.yield %1 : f32
            }
            affine.store %0, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<4x16x6x6xf32>
          }
        }
      }
    }
    return
  }
}
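
Here `iter_args` threads the running sum through the loop nest as a loop-carried SSA value rather than through memory. A minimal sketch of the idiom, with made-up bounds and with `%buf` and `%cst` assumed to be in scope:

// The accumulator enters as the block argument %acc, is updated each
// iteration, and the final value becomes the result of the affine.for.
%sum = affine.for %i = 0 to 8 iter_args(%acc = %cst) -> (f32) {
  %v = affine.load %buf[%i] : memref<8xf32>
  %next = arith.addf %acc, %v : f32
  affine.yield %next : f32
}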

But if I change all the `+= 1` increments to `++`, it no longer uses `iter_args`; the accumulator goes through a scratch memref instead.

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<i64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f80, dense<128> : vector<2xi32>>, #dlti.dl_entry<i1, dense<8> : vector<2xi32>>, #dlti.dl_entry<i8, dense<8> : vector<2xi32>>, #dlti.dl_entry<i16, dense<16> : vector<2xi32>>, #dlti.dl_entry<i32, dense<32> : vector<2xi32>>, #dlti.dl_entry<f16, dense<16> : vector<2xi32>>, #dlti.dl_entry<f64, dense<64> : vector<2xi32>>, #dlti.dl_entry<f128, dense<128> : vector<2xi32>>>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  func @test_conv2d(%arg0: memref<4x6x8x8xf32>, %arg1: memref<16x6x3x3xf32>, %arg2: memref<4x16x6x6xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = memref.alloca() : memref<1xf32>
    %1 = llvm.mlir.undef : f32
    affine.store %1, %0[0] : memref<1xf32>
    affine.for %arg3 = 0 to 4 {
      affine.for %arg4 = 0 to 16 {
        affine.for %arg5 = 0 to 6 {
          affine.for %arg6 = 0 to 6 {
            affine.store %cst, %0[0] : memref<1xf32>
            affine.for %arg7 = 0 to 6 {
              affine.for %arg8 = 0 to 3 {
                affine.for %arg9 = 0 to 3 {
                  %3 = affine.load %arg0[%arg3, %arg7, %arg5 + %arg8, %arg6 + %arg9] : memref<4x6x8x8xf32>
                  %4 = affine.load %arg1[%arg4, %arg7, %arg8, %arg9] : memref<16x6x3x3xf32>
                  %5 = arith.mulf %3, %4 : f32
                  %6 = affine.load %0[0] : memref<1xf32>
                  %7 = arith.addf %6, %5 : f32
                  affine.store %7, %0[0] : memref<1xf32>
                }
              }
            }
            %2 = affine.load %0[0] : memref<1xf32>
            affine.store %2, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<4x16x6x6xf32>
          }
        }
      }
    }
    return
  }
}
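
Reduced to the same minimal sketch, this version keeps the running sum in a one-element scratch memref and round-trips it through memory on every iteration (again with made-up bounds and with `%buf`, `%tmp`, and `%cst` assumed to be in scope):

// The accumulator lives in %tmp, so each iteration does a load, an add,
// and a store instead of yielding a loop-carried SSA value.
affine.store %cst, %tmp[0] : memref<1xf32>
affine.for %i = 0 to 8 {
  %v = affine.load %buf[%i] : memref<8xf32>
  %old = affine.load %tmp[0] : memref<1xf32>
  %next = arith.addf %old, %v : f32
  affine.store %next, %tmp[0] : memref<1xf32>
}
%sum = affine.load %tmp[0] : memref<1xf32>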

So what's the rationale behind this? Why not use the loop step to implement the first case?

chhzh123 · May 13 '22 01:05

What happens if you remove the `#pragma scop` in both cases?

Regardless, this seems to be just a question of optimization ordering (or an implementation difference in the `#pragma scop` special-case handler).

wsmoses · May 13 '22 01:05

Okay, after removing `#pragma scop`, both `+= 1` and `++` generate the first MLIR snippet with `iter_args`. So when should I add the pragma?

chhzh123 · May 13 '22 01:05