Polygeist
Instructions on how to run polymerpar
I saw the following instructions in the Polygeist-Script project (https://github.com/wsmoses/Polygeist-Script/):
polymerpar)
  mlir-clang $CFLAGS $TEST.c -o $TEST.$TOOL.in.mlir
  polymer-opt --demote-loop-reduction \
    --extract-scop-stmt \
    --pluto-opt='parallelize=1' \
    --inline \
    --canonicalize $TEST.$TOOL.in.mlir 2>/dev/null > $TEST.$TOOL.out.mlir
  mlir-opt -mem2reg -detect-reduction -mem2reg -canonicalize -affine-parallelize -lower-affine -convert-scf-to-openmp -convert-scf-to-std -convert-openmp-to-llvm $TEST.$TOOL.out.mlir | mlir-translate -mlir-to-llvmir > $OUT
  ;;
and I used almost the same commands (without the -mem2reg -detect-reduction -mem2reg -canonicalize passes) to process the MLIR file, but with an updated Polymer (commit e87c27c36b3d346612e505a1b5d7939e6b6aeb41, updated on 2022-01-03):
./bin/polymer-opt --demote-loop-reduction --extract-scop-stmt --pluto-opt='parallelize=1' --inline --canonicalize in.mlir 2>/dev/null > out.mlir
mlir-opt -affine-parallelize -lower-affine -convert-scf-to-openmp -convert-scf-to-std -convert-openmp-to-llvm out.mlir | mlir-translate -mlir-to-llvmir > out.ll
Then I got quite different polyhedral optimization results, and the code ran nearly 8 times slower. Is there any mistake here?
Can you please add the commands you used and the generated IR? By the way, if you are trying to replicate our results, you can also try the Docker image: https://github.com/wsmoses/Polygeist-Script/
I used a hand-written heat-3d MLIR file and wanted to optimize it with Polymer. The MLIR file is as follows:
#map = affine_map<()[s0] -> (s0 - 1)>
module {
  func private @heat_3d(%arg0: memref<200x200x200xf64>, %arg1: memref<200x200x200xf64>, %arg6: i32) attributes {llvm.emit_c_interface} {
    %0 = arith.index_cast %arg6 : i32 to index
    affine.for %arg5 = 0 to 1000 {
      affine.for %arg2 = 1 to #map()[%0] {
        affine.for %arg3 = 1 to #map()[%0] {
          affine.for %arg4 = 1 to #map()[%0] {
            %cst = arith.constant 1.250000e-01 : f64
            %1 = affine.load %arg0[%arg2 + 1, %arg3, %arg4] : memref<200x200x200xf64>
            %cst_0 = arith.constant 2.000000e+00 : f64
            %2 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %3 = arith.mulf %cst_0, %2 : f64
            %4 = arith.subf %1, %3 : f64
            %5 = affine.load %arg0[%arg2 - 1, %arg3, %arg4] : memref<200x200x200xf64>
            %6 = arith.addf %4, %5 : f64
            %7 = arith.mulf %cst, %6 : f64
            %cst_1 = arith.constant 1.250000e-01 : f64
            %8 = affine.load %arg0[%arg2, %arg3 + 1, %arg4] : memref<200x200x200xf64>
            %cst_2 = arith.constant 2.000000e+00 : f64
            %9 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %10 = arith.mulf %cst_2, %9 : f64
            %11 = arith.subf %8, %10 : f64
            %12 = affine.load %arg0[%arg2, %arg3 - 1, %arg4] : memref<200x200x200xf64>
            %13 = arith.addf %11, %12 : f64
            %14 = arith.mulf %cst_1, %13 : f64
            %15 = arith.addf %7, %14 : f64
            %cst_3 = arith.constant 1.250000e-01 : f64
            %16 = affine.load %arg0[%arg2, %arg3, %arg4 + 1] : memref<200x200x200xf64>
            %cst_4 = arith.constant 2.000000e+00 : f64
            %17 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %18 = arith.mulf %cst_4, %17 : f64
            %19 = arith.subf %16, %18 : f64
            %20 = affine.load %arg0[%arg2, %arg3, %arg4 - 1] : memref<200x200x200xf64>
            %21 = arith.addf %19, %20 : f64
            %22 = arith.mulf %cst_3, %21 : f64
            %23 = arith.addf %15, %22 : f64
            %24 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %25 = arith.addf %23, %24 : f64
            affine.store %25, %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
          }
        }
      }
      affine.for %arg2 = 1 to #map()[%0] {
        affine.for %arg3 = 1 to #map()[%0] {
          affine.for %arg4 = 1 to #map()[%0] {
            %cst = arith.constant 1.250000e-01 : f64
            %1 = affine.load %arg1[%arg2 + 1, %arg3, %arg4] : memref<200x200x200xf64>
            %cst_0 = arith.constant 2.000000e+00 : f64
            %2 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %3 = arith.mulf %cst_0, %2 : f64
            %4 = arith.subf %1, %3 : f64
            %5 = affine.load %arg1[%arg2 - 1, %arg3, %arg4] : memref<200x200x200xf64>
            %6 = arith.addf %4, %5 : f64
            %7 = arith.mulf %cst, %6 : f64
            %cst_1 = arith.constant 1.250000e-01 : f64
            %8 = affine.load %arg1[%arg2, %arg3 + 1, %arg4] : memref<200x200x200xf64>
            %cst_2 = arith.constant 2.000000e+00 : f64
            %9 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %10 = arith.mulf %cst_2, %9 : f64
            %11 = arith.subf %8, %10 : f64
            %12 = affine.load %arg1[%arg2, %arg3 - 1, %arg4] : memref<200x200x200xf64>
            %13 = arith.addf %11, %12 : f64
            %14 = arith.mulf %cst_1, %13 : f64
            %15 = arith.addf %7, %14 : f64
            %cst_3 = arith.constant 1.250000e-01 : f64
            %16 = affine.load %arg1[%arg2, %arg3, %arg4 + 1] : memref<200x200x200xf64>
            %cst_4 = arith.constant 2.000000e+00 : f64
            %17 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %18 = arith.mulf %cst_4, %17 : f64
            %19 = arith.subf %16, %18 : f64
            %20 = affine.load %arg1[%arg2, %arg3, %arg4 - 1] : memref<200x200x200xf64>
            %21 = arith.addf %19, %20 : f64
            %22 = arith.mulf %cst_3, %21 : f64
            %23 = arith.addf %15, %22 : f64
            %24 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
            %25 = arith.addf %23, %24 : f64
            affine.store %25, %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
          }
        }
      }
    }
    return
  }
  func @heat_3d_iteration(%arg0: memref<200x200x200xf64>, %arg1: memref<200x200x200xf64>) attributes {llvm.emit_c_interface} {
    %cst_200 = arith.constant 200 : i32
    call @heat_3d(%arg0, %arg1, %cst_200) : (memref<200x200x200xf64>, memref<200x200x200xf64>, i32) -> ()
    return
  }
}
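For reference, this kernel is the PolyBench heat-3d stencil. In plain C, the two loop nests compute roughly the following (a hand-written sketch assuming N = 200 and 1000 time steps, matching the IR above; it is not the code the IR was generated from):

#define N 200
#define TSTEPS 1000

/* Reference heat-3d stencil corresponding to the MLIR above:
   B is updated from A, then A is updated from B, TSTEPS times. */
static void heat_3d_ref(double A[N][N][N], double B[N][N][N]) {
  for (int t = 0; t < TSTEPS; t++) {
    for (int i = 1; i < N - 1; i++)
      for (int j = 1; j < N - 1; j++)
        for (int k = 1; k < N - 1; k++)
          B[i][j][k] = 0.125 * (A[i + 1][j][k] - 2.0 * A[i][j][k] + A[i - 1][j][k])
                     + 0.125 * (A[i][j + 1][k] - 2.0 * A[i][j][k] + A[i][j - 1][k])
                     + 0.125 * (A[i][j][k + 1] - 2.0 * A[i][j][k] + A[i][j][k - 1])
                     + A[i][j][k];
    for (int i = 1; i < N - 1; i++)
      for (int j = 1; j < N - 1; j++)
        for (int k = 1; k < N - 1; k++)
          A[i][j][k] = 0.125 * (B[i + 1][j][k] - 2.0 * B[i][j][k] + B[i - 1][j][k])
                     + 0.125 * (B[i][j + 1][k] - 2.0 * B[i][j][k] + B[i][j - 1][k])
                     + 0.125 * (B[i][j][k + 1] - 2.0 * B[i][j][k] + B[i][j][k - 1])
                     + B[i][j][k];
  }
}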
If I adapted this MLIR file to Polygeist-Script's LLVM version (e.g. changing arith.addf back to addf) and ran the instructions above, I got a good optimization effect. But I want to reproduce the optimization effect you get in Polygeist-Script with a newer Polymer version, to fit my project.
The whole compilation process is as follows:
./bin/polymer-opt --demote-loop-reduction --extract-scop-stmt --pluto-opt='parallelize=1' --inline --canonicalize in.mlir 2>/dev/null > out.mlir
mlir-opt -affine-parallelize -lower-affine -convert-scf-to-openmp -convert-scf-to-std -convert-openmp-to-llvm out.mlir | mlir-translate -mlir-to-llvmir > out.ll
clang main.c -O3 out.ll -o out.exe -lm -fopenmp
numactl --physcpubind=1-8 ./out.exe
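One sanity check when the OpenMP-lowered binary runs slower than the serial one is whether it is actually running multi-threaded under the numactl binding. A minimal, hypothetical addition to the driver (report_threads is my name, not from the original code) could be:

#include <omp.h>
#include <stdio.h>

/* Hypothetical helper: print the number of threads the OpenMP
   runtime will use, to verify that numactl --physcpubind=1-8
   really leaves 8 threads available to the parallel loops. */
void report_threads(void) {
  printf("OpenMP max threads: %d\n", omp_get_max_threads());
}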
And here is the main.c driver:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

/* Mirrors MLIR's ranked memref descriptor for a rank-3 memref:
   allocated pointer, aligned pointer, offset, sizes, strides. */
struct ThreeDMemrefF64 {
  double *ptrToData;
  double *alignedPtrToData;
  long offset;
  long shape[3];
  long stride[3];
};

#define M 200
#define N 200
#define P 200

struct timeval begin, end;

void tic()
{
  gettimeofday(&begin, NULL);
}

double tok()
{
  gettimeofday(&end, NULL);
  double elapsedTime = (end.tv_sec - begin.tv_sec) * 1e3 +
                       (end.tv_usec - begin.tv_usec) * 1e-3;
  return elapsedTime;
}

extern void _mlir_ciface_heat_3d_iteration(struct ThreeDMemrefF64 *,
                                           struct ThreeDMemrefF64 *);

int main(int argc, char *argv[]) {
  int i, j, k;
  double (*A)[200][200] = calloc(8000000, sizeof(double));
  double (*B)[200][200] = calloc(8000000, sizeof(double));
  double sumtime = 0;
  /* Initialize A with the usual PolyBench-style pattern, B with zeros. */
  for (i = 0; i < M; i++) {
    for (j = 0; j < N; j++) {
      for (k = 0; k < P; k++) {
        A[i][j][k] = ((double)i + j + k) / (i + j + k + 1);
        B[i][j][k] = (double)0;
      }
    }
  }
  struct ThreeDMemrefF64 A_mem = {&A[0][0][0], &A[0][0][0], 0, {M, N, P}, {N * P, P, 1}};
  struct ThreeDMemrefF64 B_mem = {&B[0][0][0], &B[0][0][0], 0, {M, N, P}, {N * P, P, 1}};
  tic();
  _mlir_ciface_heat_3d_iteration(&A_mem, &B_mem);
  double elapsedTime = tok();
  sumtime += elapsedTime;
  printf("Time: %lf (ms)\n", elapsedTime);
  return 0;
}
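Note that this driver times a single kernel invocation (sumtime accumulates only one measurement and is never printed). To make the roughly 8x comparison more robust, one could average several runs; a sketch reusing the names from main.c above:

/* Sketch: time NRUNS invocations and report the mean. NRUNS is an
   assumed parameter, not part of the original driver. Each run
   advances the stencil further, but the amount of work per run is
   identical, so the mean remains a fair timing comparison. */
#define NRUNS 5
for (int run = 0; run < NRUNS; run++) {
  tic();
  _mlir_ciface_heat_3d_iteration(&A_mem, &B_mem);
  sumtime += tok();
}
printf("Mean time: %lf (ms)\n", sumtime / NRUNS);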
Can you please check whether you get the same schedule/IR from Polymer using the provided Docker image and your newer version? If the schedules are different, that could be an explanation. Also, which version of mlir-opt are you using? Do you also see the slowdown if you run the same Polymer version that is available in the Docker image?
I got a different schedule from the provided Docker image and my newer version (updated Polymer). This is the point of my confusion: I used the same commands and almost the same input (I only modified some IR expressions to match the newer LLVM, e.g. addf -> arith.addf, since some std ops moved to the arith dialect). I also think this may be why I got a slowdown (assuming I am not using the wrong commands), but the result is even slower than not optimizing at all. I think that is abnormal.
Or can you update your Polygeist-Script project?
Polymer depends on Polygeist and MLIR. These tools change daily, so having the same inputs and command-line options is sometimes not sufficient to obtain the same performance. We probably have some performance regression from updating Polymer to newer Polygeist and MLIR versions.