NEGEMM needs `configure` to be called for each `run`
It seems that `configure` needs to be called before each `run` for NEGEMM.
Because of this issue we carry a workaround in the oneDNN fork: https://github.com/openvinotoolkit/oneDNN/commit/a5ca47f7860b3cc44b24b4d5841c628cce28be48
The issue can be reproduced on ACL 24.08.
Reproducer:
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo;
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1, 4), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
NEGEMM gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensor, &weiTensor, nullptr, &dstTensorSingleConfigure, 1.0f, 0.0f, gemmInfo);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
The dst tensors `dstTensorSingleConfigure` and `dstTensorConfigurePerRun` are equal after the 1st run, but they differ after the 2nd run.
Are we required to call `configure` before each NEGEMM `run`?
How ACL was built:
```
scons arch=armv8.6-a neon=1 os=linux opencl=0 build=native -j 32 debug=1 Werror=false validation_tests=1 fixed_format_kernels=1 multi_isa=1 openmp=0 cppthreads=1 --silent
```
How to build the reproducer:
```
g++ -O2 -g -I./ComputeLibrary -I./ComputeLibrary/include ~/avoron/acl_gemm.cpp -o bug ./ComputeLibrary/build/tests/AssetsLibrary.o ./ComputeLibrary/build/tests/RawTensor.o -L./ComputeLibrary/build/tests/ -L./ComputeLibrary/build/tests/framework/ -L./ComputeLibrary/build/ -larm_compute_validation_framework -larm_compute_test_framework -larm_compute -std=c++17
```
Hi @alvoron
There is no need to call `configure` again if you just update the weights and input.
In your test, make sure you set the flag `reshape_b_only_on_first_run` to `false` in `GEMMInfo`:
```cpp
GEMMInfo gemmInfo
(
    false, // is_a_reshaped
    false, // is_b_reshaped
    false  // reshape_b_only_on_first_run
);
```
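For reference, the effect of these constructor arguments can be checked through the `GEMMInfo` accessors. A minimal sketch, assuming the ACL headers are on the include path (the header path matches the one referenced later in this thread):

```cpp
// Minimal sketch: a default-constructed GEMMInfo enables the one-time
// reshape of B, while the explicit constructor below disables it.
#include "arm_compute/function_info/GEMMInfo.h"
#include <iostream>

int main() {
    arm_compute::GEMMInfo byDefault;                          // reshape_b_only_on_first_run defaults to true
    arm_compute::GEMMInfo explicitInfo(false, false, false);  // is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run
    std::cout << byDefault.reshape_b_only_on_first_run() << std::endl;    // prints 1
    std::cout << explicitInfo.reshape_b_only_on_first_run() << std::endl; // prints 0
    return 0;
}
```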
See the result below:
```
[gemm]$ LD_LIBRARY_PATH=../ComputeLibrary/build/:$LD_LIBRARY_PATH ./test2
PASSED VALIDATION
[ComputeLibrary][12-09-2024 02:16:06][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
[ComputeLibrary][12-09-2024 02:16:06][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED CONFIGURATION
PASSED RUN #1
dstTensorSingleConfigure
138.176 119.903 134.778
dstTensorConfigurePerRun
138.176 119.903 134.778
[ComputeLibrary][12-09-2024 02:16:06][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED RUN #2
dstTensorSingleConfigure
423.563 522.41 432.69
dstTensorConfigurePerRun
423.563 522.41 432.69
```
Hope this helps.
@morgolock
Did you modify my reproducer in any way other than the `gemmInfo` initialization?
I get the error `Maximum number of dimensions expected 3 but dimension 3 is not empty` if I initialize `gemmInfo` this way and skip the 2nd configuration in the reproducer.
I also noticed that `GEMMInfo` is not initialized in oneDNN: https://github.com/oneapi-src/oneDNN/blob/main/src/cpu/aarch64/matmul/acl_matmul_utils.hpp#L66
Does this mean we need to apply a patch to oneDNN to fix this issue with the 2nd configuration?
Hi @alvoron
> Did you modify my reproducer in any way other than the `gemmInfo` initialization? I get the error `Maximum number of dimensions expected 3 but dimension 3 is not empty` if I initialize `gemmInfo` this way and skip the 2nd configuration in the reproducer.
Yes, see below: I dropped the 4th dimension from the src and dst shapes.
```cpp
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
    std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
    exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
```
@morgolock in theory, a `configure` call per inference should not break accuracy, right?
I compared the dst tensors for 2 cases:
1. `configure` per inference + `reshape_b_only_on_first_run=true`
2. a single `configure` + `reshape_b_only_on_first_run=false`

In these 2 cases I got different results for both the 1st and the 2nd run:
```
PASSED RUN #1
dstTensorSingleConfigure
178.938 183.26 180.34
dstTensorConfigurePerRun
157.27 192.243 178.135
PASSED RUN #2
dstTensorSingleConfigure
516.653 679.692 532.997
dstTensorConfigurePerRun
571.231 567.867 578.185
```
Modified reproducer:
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
//#include "arm_compute/function_info/GEMMInfo.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo_reshape_always(false, false, false);
GEMMInfo gemmInfo_reshape_1st_run_only;
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo_reshape_1st_run_only);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION #1" << std::endl;
status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo_reshape_always);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION #2" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
NEGEMM gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensor, &weiTensor, nullptr, &dstTensorSingleConfigure, 1.0f, 0.0f, gemmInfo_reshape_always);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo_reshape_1st_run_only);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo_reshape_1st_run_only);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
May we ask somebody to review the oneDNN integration as well?
As I mentioned, `GEMMInfo` is not initialized there, so `reshape_b_only_on_first_run` is `true`, and that should impact accuracy, since `configure` is not called per inference in oneDNN.
Hi @alvoron
Please see the modified example below, where the results match exactly.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo
(false, // is_a_reshaped
false, // is_b_reshaped,
false // reshape_b_only_on_first_run,
);
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 1, 1), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 1, 1), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
NEGEMM gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensor, &weiTensor, nullptr, &dstTensorSingleConfigure, 1.0f, 0.0f, gemmInfo);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
gemmSingleConfigure.run();
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
And the output:
```
g++ -O2 -g -I../ComputeLibrary -I../ComputeLibrary/include ./test.cpp -o test2 ../ComputeLibrary/build/tests/AssetsLibrary.o ../ComputeLibrary/build/tests/RawTensor.o -L../ComputeLibrary/build/tests/ -L../ComputeLibrary/build/tests/framework/ -L../ComputeLibrary/build/ -larm_compute_validation_framework -larm_compute_test_framework -larm_compute -std=c++17
[pabtel01@ampere-man-01 openvino_gemm]$ LD_LIBRARY_PATH=../ComputeLibrary/build/:$LD_LIBRARY_PATH ./test2
PASSED VALIDATION
[ComputeLibrary][24-09-2024 02:32:46][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
[ComputeLibrary][24-09-2024 02:32:46][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED CONFIGURATION
PASSED RUN #1
dstTensorSingleConfigure
142.591 118.595 120.105
dstTensorConfigurePerRun
142.591 118.595 120.105
[ComputeLibrary][24-09-2024 02:32:46][INFO] arm_compute::cpu::CpuGemm::configure() :
 a: Shape=8,DataLayout=NCHW,DataType=F32
 b: Shape=3,8,4,DataLayout=NCHW,DataType=F32
 c: nullptr
 d: Shape=3,DataLayout=NCHW,DataType=F32
 alpha: 1.000000
 beta: 0.000000
 gemm_info: {is_a_reshaped=0,is_b_reshaped=0,reshape_b_only_on_first_run=0,depth_output_gemm3d=0,reinterpret_input_as_3d=0,retain_internal_weights=0,fp_mixed_precision=0,broadcast_bias=0,pretranspose_B=0,}
PASSED RUN #2
dstTensorSingleConfigure
471.933 592.889 518.01
dstTensorConfigurePerRun
471.933 592.889 518.01
```
> May we ask somebody to review the oneDNN integration as well? As I mentioned, `GEMMInfo` is not initialized there, so `reshape_b_only_on_first_run` is `true`, and that should impact accuracy, since `configure` is not called per inference in oneDNN.
We'll have a look at the code in oneDNN.
Hi @alvoron,
> We'll have a look at the code in oneDNN.
Since oneDNN 3.6 we have been using stateless operators from ACL, so the oneDNN matmul primitive implementation no longer calls NEGEMM but `experimental::op::ll::CpuGemmAssemblyDispatch`, as you can see here: https://github.com/oneapi-src/oneDNN/blob/42b0d5cc48e0d826792762a0cb53b4c8b9b9caa6/src/cpu/aarch64/matmul/acl_matmul_utils.hpp#L43. Below is an updated version of the code you shared where this operator is called instead of NEGEMM: `configure()` is called only once, and before each `run()` the tensors are packed into an `ITensorPack` so that their updates are passed to the operator.
cc: @theComputeKid
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/MMappedFile.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h"
#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/validation/Validation.h"
#include <iostream>
#include <vector>
using namespace arm_compute;
using namespace arm_compute::test;
using namespace arm_compute::test::validation;
void fillTensor(Tensor& t, float min, float max) {
AssetsLibrary library(".", std::random_device()());
std::uniform_real_distribution<> dist{ min, max };
library.fill(Accessor(t), dist, 0);
}
int main(int argc, char *argv[]) {
GEMMInfo gemmInfo;
TensorInfo srcTensorInfo = TensorInfo(TensorShape(8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo weiTensorInfo = TensorInfo(TensorShape(3, 8, 4), 1, DataType::F32, DataLayout::NCHW);
TensorInfo dstTensorInfo = TensorInfo(TensorShape(3, 4), 1, DataType::F32, DataLayout::NCHW);
auto status = NEGEMM::validate(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, 1.0f, 0.0f, gemmInfo);
if(status.error_code() != ErrorCode::OK) {
std::cout << "ERROR: " << status.error_description().c_str() << std::endl;
exit(1);
}
std::cout << "PASSED VALIDATION" << std::endl;
Tensor srcTensor;
Tensor weiTensor;
Tensor dstTensorSingleConfigure;
Tensor dstTensorConfigurePerRun;
srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensorSingleConfigure.allocator()->init(dstTensorInfo);
dstTensorConfigurePerRun.allocator()->init(dstTensorInfo);
experimental::op::ll::CpuGemmAssemblyDispatch gemmSingleConfigure;
NEGEMM gemmConfigurePerRun;
gemmSingleConfigure.configure(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, gemmInfo);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
std::cout << "PASSED CONFIGURATION" << std::endl;
srcTensor.allocator()->allocate();
weiTensor.allocator()->allocate();
dstTensorSingleConfigure.allocator()->allocate();
dstTensorConfigurePerRun.allocator()->allocate();
fillTensor(srcTensor, 0.5f, 5.0f);
fillTensor(weiTensor, 5.0f, 10.0f);
ITensorPack matmul_pack_1;
matmul_pack_1.add_const_tensor(TensorType::ACL_SRC_0, &srcTensor);
matmul_pack_1.add_const_tensor(TensorType::ACL_SRC_1, &weiTensor);
matmul_pack_1.add_tensor(TensorType::ACL_DST, &dstTensorSingleConfigure);
gemmSingleConfigure.run(matmul_pack_1);
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #1" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
fillTensor(srcTensor, 0.5f, 10.0f);
fillTensor(weiTensor, 5.0f, 20.0f);
gemmConfigurePerRun.configure(&srcTensor, &weiTensor, nullptr, &dstTensorConfigurePerRun, 1.0f, 0.0f, gemmInfo);
ITensorPack matmul_pack_2;
matmul_pack_2.add_const_tensor(TensorType::ACL_SRC_0, &srcTensor);
matmul_pack_2.add_const_tensor(TensorType::ACL_SRC_1, &weiTensor);
matmul_pack_2.add_tensor(TensorType::ACL_DST, &dstTensorSingleConfigure);
//gemmSingleConfigure.run(matmul_pack_2);
gemmConfigurePerRun.run();
std::cout << "PASSED RUN #2" << std::endl;
std::cout << "dstTensorSingleConfigure" << std::endl;
dstTensorSingleConfigure.print(std::cout);
std::cout << "dstTensorConfigurePerRun" << std::endl;
dstTensorConfigurePerRun.print(std::cout);
srcTensor.allocator()->free();
weiTensor.allocator()->free();
dstTensorSingleConfigure.allocator()->free();
dstTensorConfigurePerRun.allocator()->free();
return 0;
}
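Distilled from the example above, the stateless pattern is: call `configure()` once against the `TensorInfo` descriptors, then build a fresh `ITensorPack` for each `run()`, so that updated tensor contents are picked up without reconfiguring. A condensed sketch, reusing the tensors, infos, `gemmInfo`, and `fillTensor()` declared in the full example above:

```cpp
// Condensed configure-once / run-many sketch (assumes the declarations
// from the full example above).
experimental::op::ll::CpuGemmAssemblyDispatch gemm;
gemm.configure(&srcTensorInfo, &weiTensorInfo, nullptr, &dstTensorInfo, gemmInfo); // configure once

for (int run = 0; run < 2; ++run) {
    // Fresh input and weight data on every iteration; no reconfigure needed.
    fillTensor(srcTensor, 0.5f, 5.0f + 5.0f * run);
    fillTensor(weiTensor, 5.0f, 10.0f + 10.0f * run);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &srcTensor);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &weiTensor);
    pack.add_tensor(TensorType::ACL_DST, &dstTensorSingleConfigure);
    gemm.run(pack); // the operator reads the updated tensors from the pack
}
```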
@milpuz01 thank you for the explanation. Let's wait for the oneDNN upgrade in OpenVINO then.