NEGEMM needs `configure` to be called for each `run`
It seems that `configure` needs to be called before each `run` for NEGEMM.
Because of this issue we carry a workaround in the oneDNN fork: https://github.com/openvinotoolkit/oneDNN/commit/a5ca47f7860b3cc44b24b4d5841c628cce28be48
The issue can be reproduced on ACL 24.08.
Reproducer:
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo;
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1, 4), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
NEGEMM gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensor, &weiTensor, nullptr, &dstTensorSingleConfigure, 1.0f, 0.0f, gemmInfo);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
The dst tensors `dstTensorSingleConfigure` and `dstTensorConfigurePerRun` are equal after the 1st run, but they differ after the 2nd run.
Are we required to call `configure` before each NEGEMM `run`?
How ACL was built:
```
scons arch=armv8.6-a neon=1 os=linux opencl=0 build=native -j 32 debug=1 Werror=false validation_tests=1 fixed_format_kernels=1 multi_isa=1 openmp=0 cppthreads=1 --silent
```
How to build the reproducer:
```
g++ -O2 -g -I./ComputeLibrary -I./ComputeLibrary/include ~/avoron/acl_gemm.cpp -o bug ./ComputeLibrary/build/tests/AssetsLibrary.o ./ComputeLibrary/build/tests/RawTensor.o -L./ComputeLibrary/build/tests/ -L./ComputeLibrary/build/tests/framework/ -L./ComputeLibrary/build/ -larm_compute_validation_framework -larm_compute_test_framework -larm_compute -std=c++17
```
Hi @alvoron
There is no need to call `configure` again if you just update the weights and input.
In your test, make sure you set the flag `reshape_b_only_on_first_run` to `false` in `GEMMInfo`:
```cpp
GEMMInfo gemmInfo
(
    false, // is_a_reshaped
    false, // is_b_reshaped
    false  // reshape_b_only_on_first_run
);
```
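For reference, the effect of these constructor arguments can be checked through the `GEMMInfo` accessors. A minimal sketch, assuming the ACL headers are on the include path (the header path matches the one referenced later in this thread):

```cpp
// Minimal sketch: a default-constructed GEMMInfo enables the one-time
// reshape of B, while the explicit constructor below disables it.
#include "arm_compute/function_info/GEMMInfo.h"
#include <iostream>

int main() {
    arm_compute::GEMMInfo byDefault;                          // reshape_b_only_on_first_run defaults to true
    arm_compute::GEMMInfo explicitInfo(false, false, false);  // is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run
    std::cout << byDefault.reshape_b_only_on_first_run() << std::endl;    // prints 1
    std::cout << explicitInfo.reshape_b_only_on_first_run() << std::endl; // prints 0
    return 0;
}
```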
See the result below:
```
[gemm]$ LD_LIBRARY_PATH=../ComputeLibrary/build/:$LD_LIBRARY_PATH ./test2
PASSED VALIDATION
[ComputeLibrary][12-09-2024 02:16:06][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
[ComputeLibrary][12-09-2024 02:16:06][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED CONFIGURATION
PASSED RUN #1
dstTensorSingleConfigure
138.176 119.903 134.778
dstTensorConfigurePerRun
138.176 119.903 134.778
[ComputeLibrary][12-09-2024 02:16:06][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED RUN #2
dstTensorSingleConfigure
423.563 522.41 432.69
dstTensorConfigurePerRun
423.563 522.41 432.69
```
Hope this helps.
@morgolock
Did you modify my reproducer in any way other than the `gemmInfo` initialization?
I get the error `Maximum number of dimensions expected 3 but dimension 3 is not empty` if I initialize `gemmInfo` this way and skip the 2nd configuration in the reproducer.
I also noticed that `GEMMInfo` is not initialized in oneDNN: https://github.com/oneapi-src/oneDNN/blob/main/src/cpu/aarch64/matmul/acl_matmul_utils.hpp#L66
Does this mean we need to apply a patch to oneDNN to fix this issue with the 2nd configuration?
Hi @alvoron
> Did you modify my reproducer in any way other than the `gemmInfo` initialization? I get the error `Maximum number of dimensions expected 3 but dimension 3 is not empty` if I initialize `gemmInfo` this way and skip the 2nd configuration in the reproducer.
Yes, see below: I dropped the 4th dimension from the src and dst shapes.
```cpp
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
    std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
    exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
```
@morgolock in theory, a `configure` call per inference should not break accuracy, right?
I compared the dst tensors for 2 cases:
1. `configure` per inference + `reshape_b_only_on_first_run=true`
2. a single `configure` + `reshape_b_only_on_first_run=false`

In these 2 cases I got different results for both the 1st and the 2nd run:
```
PASSED RUN #1
dstTensorSingleConfigure
178.938 183.26 180.34
dstTensorConfigurePerRun
157.27 192.243 178.135
PASSED RUN #2
dstTensorSingleConfigure
516.653 679.692 532.997
dstTensorConfigurePerRun
571.231 567.867 578.185
```
Modified reproducer:
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
//#include "arm_compute/function_info/GEMMInfo.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo_reshape_always(false, false, false);
GEMMInfo gemmInfo_reshape_1st_run_only;
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo_reshape_1st_run_only);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION #1" << std::endl;
status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo_reshape_always);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION #2" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
NEGEMM gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensor, &weiTensor, nullptr, &dstTensorSingleConfigure, 1.0f, 0.0f, gemmInfo_reshape_always);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo_reshape_1st_run_only);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo_reshape_1st_run_only);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
May we ask somebody to review the oneDNN integration as well?
As I mentioned, `GEMMInfo` is not initialized there, so `reshape_b_only_on_first_run` is `true`, and that should impact accuracy, since `configure` is not called per inference in oneDNN.
Hi @alvoron
Please see the modified example below, where the results match exactly.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo
(false, // is_a_reshaped
false, // is_b_reshaped,
false // reshape_b_only_on_first_run,
);
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
NEGEMM gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensor, &weiTensor, nullptr, &dstTensorSingleConfigure, 1.0f, 0.0f, gemmInfo);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
And the output:
```
g++ -O2 -g -I../ComputeLibrary -I../ComputeLibrary/include ./test.cpp -o test2 ../ComputeLibrary/build/tests/AssetsLibrary.o ../ComputeLibrary/build/tests/RawTensor.o -L../ComputeLibrary/build/tests/ -L../ComputeLibrary/build/tests/framework/ -L../ComputeLibrary/build/ -larm_compute_validation_framework -larm_compute_test_framework -larm_compute -std=c++17
[pabtel01@ampere-man-01 openvino_gemm]$ LD_LIBRARY_PATH=../ComputeLibrary/build/:$LD_LIBRARY_PATH ./test2
PASSED VALIDATION
[ComputeLibrary][24-09-2024 02:32:46][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
[ComputeLibrary][24-09-2024 02:32:46][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED CONFIGURATION
PASSED RUN #1
dstTensorSingleConfigure
142.591 118.595 120.105
dstTensorConfigurePerRun
142.591 118.595 120.105
[ComputeLibrary][24-09-2024 02:32:46][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED RUN #2
dstTensorSingleConfigure
471.933 592.889 518.01
dstTensorConfigurePerRun
471.933 592.889 518.01
```
> May we ask somebody to review the oneDNN integration as well? As I mentioned, `GEMMInfo` is not initialized there, so `reshape_b_only_on_first_run` is `true`, and that should impact accuracy, since `configure` is not called per inference in oneDNN.
We'll have a look at the code in oneDNN.
Hi @alvoron,
> We'll have a look at the code in oneDNN.
Since oneDNN 3.6 we have been using stateless operators from ACL, so the oneDNN matmul primitive implementation no longer calls NEGEMM but `experimental::op::ll::CpuGemmAssemblyDispatch`, as you can see here: https://github.com/oneapi-src/oneDNN/blob/42b0d5cc48e0d826792762a0cb53b4c8b9b9caa6/src/cpu/aarch64/matmul/acl_matmul_utils.hpp#L43. Below is an updated version of the code you shared where this operator is called instead of NEGEMM: `configure()` is called only once, and before each `run()` the tensors are packed into an `ITensorPack` so that their updates are passed to the operator.
cc: @theComputeKid
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo;
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 4), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
experimental::op::ll::CpuGemmAssemblyDispatch gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, gemmInfo);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
ITensorPack matmul_pack_1;
matmul_pack_1.add_const_tensor(TensorType::ACL_SRC_0, &srcTensor);
matmul_pack_1.add_const_tensor(TensorType::ACL_SRC_1, &weiTensor);
matmul_pack_1.add_tensor(TensorType::ACL_DST, &dstTensorSingleConfigure);
gemmSingleConfigure.run(matmul_pack_1);
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
ITensorPack matmul_pack_2;
matmul_pack_2.add_const_tensor(TensorType::ACL_SRC_0, &srcTensor);
matmul_pack_2.add_const_tensor(TensorType::ACL_SRC_1, &weiTensor);
matmul_pack_2.add_tensor(TensorType::ACL_DST, &dstTensorSingleConfigure);
//gemmSingleConfigure.run(matmul_pack_2);
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
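Distilled from the example above, the stateless pattern is: call `configure()` once against the `TensorInfo` descriptors, then build a fresh `ITensorPack` for each `run()`, so that updated tensor contents are picked up without reconfiguring. A condensed sketch, reusing the tensors, infos, `gemmInfo`, and `fillTensor()` declared in the full example above:

```cpp
// Condensed configure-once / run-many sketch (assumes the declarations
// from the full example above).
experimental::op::ll::CpuGemmAssemblyDispatch gemm;
gemm.configure(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, gemmInfo); // configure once

for (int run = 0; run < 2; ++run) {
    // Fresh input and weight data on every iteration; no reconfigure needed.
    fillTensor(srcTensor, 0.5f, 5.0f + 5.0f * run);
    fillTensor(weiTensor, 5.0f, 10.0f + 10.0f * run);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &srcTensor);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &weiTensor);
    pack.add_tensor(TensorType::ACL_DST, &dstTensorSingleConfigure);
    gemm.run(pack); // the operator reads the updated tensors from the pack
}
```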
@milpuz01 thank you for the explanation. Let's wait for the oneDNN upgrade in OpenVINO then.