Polygeist copied to clipboard
affine.for can't be converted to affine.parallel if scf is raised to affine through polygeist-opt
For the following case, if the code is raised to affine through cgeist, matmul can be parallelized by polygeist-opt (--affine-parallelize), but DSPF_sp_mat_trans_cplx can't be parallelized. If it is instead raised to affine through polygeist-opt, then neither function (DSPF_sp_mat_trans_cplx nor matmul) can be parallelized.
cgeist test.c -S | polygeist-opt --raise-scf-to-affine --affine-parallelize result: neither DSPF_sp_mat_trans_cplx nor matmul can be parallelized by --affine-parallelize
cgeist test.c -S --raise-scf-to-affine | polygeist-opt --affine-parallelize result: matmul can be parallelized by --affine-parallelize, but DSPF_sp_mat_trans_cplx still can't be
example code:
void DSPF_sp_mat_trans_cplx( float* x, int rows, int cols, float * r)
int i, j;
for (i = 0; i < cols; i++)
for (j = 0; j < rows; j++)
r[(i * rows + j) * 2] = x[(i + cols * j) * 2];
r[(i * rows + j) * 2 + 1] = x[(i + cols * j) * 2 + 1];
#define N 200
#define M 300
#define K 400
#define DATA_TYPE float
void matmul(DATA_TYPE A[N][K], DATA_TYPE B[K][M], DATA_TYPE C[N][M], int N_L, int M_L, int K_L) {
int i, j, k;
for (int i = 0; i < N_L; i++) {
for (int j = 0; j < M_L; j++) {
for (int k = 0; k < K_L; k++) {
C[i][j] += A[i][k] * B[k][j];
int main() {
float a[N][K];
float b[K][M];
float c[N][M];
matmul(a, b, c, N, M, K);
DSPF_sp_mat_trans_cplx(a, N, M, b);
return 0;
cgeist test.c -S | polygeist-opt --raise-scf-to-affine --affine-parallelize result:
func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>, %arg3: i32, %arg4: i32, %arg5: i32) attributes {llvm.linkage = #llvm.linkage<external>} {
%0 = arith.index_cast %arg3 : i32 to index
%1 = arith.index_cast %arg4 : i32 to index
%2 = arith.index_cast %arg5 : i32 to index
affine.for %arg6 = 0 to %0 {
affine.for %arg7 = 0 to %1 {
affine.for %arg8 = 0 to %2 {
%3 = memref.load %arg0[%arg6, %arg8] : memref<?x400xf32>
%4 = memref.load %arg1[%arg8, %arg7] : memref<?x300xf32>
%5 = arith.mulf %3, %4 : f32
%6 = memref.load %arg2[%arg6, %arg7] : memref<?x300xf32>
%7 = arith.addf %6, %5 : f32
memref.store %7, %arg2[%arg6, %arg7] : memref<?x300xf32>
func.func @DSPF_sp_mat_trans_cplx(%arg0: memref<?xf32>, %arg1: i32, %arg2: i32, %arg3: memref<?xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
%c1_i32 = arith.constant 1 : i32
%c2_i32 = arith.constant 2 : i32
%0 = arith.index_cast %arg2 : i32 to index
%1 = arith.index_cast %arg1 : i32 to index
affine.for %arg4 = 0 to %0 {
%2 = arith.index_cast %arg4 : index to i32
%3 = arith.muli %2, %arg1 : i32
affine.for %arg5 = 0 to %1 {
%4 = arith.index_cast %arg5 : index to i32
%5 = arith.addi %3, %4 : i32
%6 = arith.muli %5, %c2_i32 : i32
%7 = arith.index_cast %6 : i32 to index
%8 = arith.muli %arg2, %4 : i32
%9 = arith.addi %2, %8 : i32
%10 = arith.muli %9, %c2_i32 : i32
%11 = arith.index_cast %10 : i32 to index
%12 = memref.load %arg0[%11] : memref<?xf32>
memref.store %12, %arg3[%7] : memref<?xf32>
%13 = arith.addi %6, %c1_i32 : i32
%14 = arith.index_cast %13 : i32 to index
%15 = arith.addi %10, %c1_i32 : i32
%16 = arith.index_cast %15 : i32 to index
%17 = memref.load %arg0[%16] : memref<?xf32>
memref.store %17, %arg3[%14] : memref<?xf32>
cgeist test.c -S --raise-scf-to-affine | polygeist-opt --affine-parallelize result:
func.func @matmul(%arg0: memref<?x400xf32>, %arg1: memref<?x300xf32>, %arg2: memref<?x300xf32>, %arg3: i32, %arg4: i32, %arg5: i32) attributes {llvm.linkage = #llvm.linkage<external>} {
%0 = arith.index_cast %arg3 : i32 to index
%1 = arith.index_cast %arg4 : i32 to index
%2 = arith.index_cast %arg5 : i32 to index
affine.parallel (%arg6) = (0) to (symbol(%0)) {
affine.parallel (%arg7) = (0) to (symbol(%1)) {
affine.for %arg8 = 0 to %2 {
%3 = affine.load %arg0[%arg6, %arg8] : memref<?x400xf32>
%4 = affine.load %arg1[%arg8, %arg7] : memref<?x300xf32>
%5 = arith.mulf %3, %4 : f32
%6 = affine.load %arg2[%arg6, %arg7] : memref<?x300xf32>
%7 = arith.addf %6, %5 : f32
affine.store %7, %arg2[%arg6, %arg7] : memref<?x300xf32>
func.func @DSPF_sp_mat_trans_cplx(%arg0: memref<?xf32>, %arg1: i32, %arg2: i32, %arg3: memref<?xf32>) attributes {llvm.linkage = #llvm.linkage<external>} {
%0 = arith.index_cast %arg2 : i32 to index
%1 = arith.index_cast %arg1 : i32 to index
affine.for %arg4 = 0 to %0 {
affine.for %arg5 = 0 to %1 {
%2 = affine.load %arg0[%arg4 * 2 + (%arg5 * symbol(%0)) * 2] : memref<?xf32>
affine.store %2, %arg3[%arg5 * 2 + (%arg4 * symbol(%1)) * 2] : memref<?xf32>
%3 = affine.load %arg0[%arg4 * 2 + (%arg5 * symbol(%0)) * 2 + 1] : memref<?xf32>
affine.store %3, %arg3[%arg5 * 2 + (%arg4 * symbol(%1)) * 2 + 1] : memref<?xf32>