cudnn-frontend
cudnn-frontend copied to clipboard
Wrong result of tensor addition with broadcasting
Describe the bug: I run the following code:
#include "cudnn-frontend/include/cudnn_frontend.h"

#include <cuda_runtime.h>
#include <cudnn.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <unordered_map>
#include <vector>

namespace fe = cudnn_frontend;

// Abort on failure even when NDEBUG is defined. A plain assert() would be
// compiled out in release builds, silently removing the side-effecting
// CUDA/cuDNN calls wrapped inside it and breaking the reproduction.
#define CHECK(cond)                                                       \
    do {                                                                  \
        if (!(cond)) {                                                    \
            std::fprintf(stderr, "CHECK failed: %s (%s:%d)\n", #cond,     \
                         __FILE__, __LINE__);                             \
            std::abort();                                                 \
        }                                                                 \
    } while (0)

// Minimal reproduction: element-wise ADD of a 1x2x2 tensor A and a 1x2x1
// tensor B. B should broadcast along the innermost dimension, giving
// [[[11, 12], [23, 24]]]; the observed output is [[[13, 14], [23, 24]]].
int main()
{
    cudnnHandle_t handle;
    CHECK(cudnnCreate(&handle) == CUDNN_STATUS_SUCCESS);

    std::vector<float> A = {1.0, 2.0, 3.0, 4.0}; // |1.0 2.0|
                                                 // |3.0 4.0|
    std::vector<float> B = {10.0, 20.0};         // |10.0|
                                                 // |20.0|

    // Build the graph: out = A + B (pointwise ADD with broadcasting on B).
    auto graph = std::make_shared<fe::graph::Graph>();
    auto inA = graph->tensor(
        fe::graph::Tensor_attributes()
            .set_name("A")
            .set_dim({1, 2, 2})
            .set_stride({4, 2, 1}) // fully packed 1x2x2
            .set_data_type(fe::DataType_t::FLOAT));
    auto inB = graph->tensor(
        fe::graph::Tensor_attributes()
            .set_name("B")
            .set_dim({1, 2, 1})
            .set_stride({2, 1, 1}) // packed 1x2x1; innermost dim broadcasts
            .set_data_type(fe::DataType_t::FLOAT));
    auto out = graph->pointwise(
        inA,
        inB,
        fe::graph::Pointwise_attributes()
            .set_mode(fe::PointwiseMode_t::ADD)
            .set_compute_data_type(fe::DataType_t::FLOAT));
    out->set_data_type(fe::DataType_t::FLOAT);
    out->set_output(true);

    // Validate, lower, and compile an execution plan.
    CHECK(graph->validate().is_good());
    CHECK(graph->build_operation_graph(handle).is_good());
    CHECK(graph->create_execution_plans({fe::HeurMode_t::A}).is_good());
    CHECK(graph->build_plans(handle, fe::BuildPlanPolicy_t::ALL).is_good());

    // Device buffers for the two inputs, the output, and the workspace.
    void* inAPtr;
    CHECK(cudaMalloc(&inAPtr, 4 * sizeof(float)) == cudaSuccess);
    CHECK(cudaMemcpy(inAPtr, A.data(), 4 * sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
    void* inBPtr;
    CHECK(cudaMalloc(&inBPtr, 2 * sizeof(float)) == cudaSuccess);
    CHECK(cudaMemcpy(inBPtr, B.data(), 2 * sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
    void* outPtr;
    CHECK(cudaMalloc(&outPtr, 4 * sizeof(float)) == cudaSuccess);
    void* workspacePtr;
    CHECK(cudaMalloc(&workspacePtr, graph->get_workspace_size()) == cudaSuccess);

    // Bind device pointers to graph tensors and run.
    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> tensorMap;
    tensorMap[inA] = inAPtr;
    tensorMap[inB] = inBPtr;
    tensorMap[out] = outPtr;
    auto r = graph->execute(handle, tensorMap, workspacePtr);
    if (!r.is_good()) {
        std::cerr << r.get_message() << std::endl;
    }
    CHECK(cudaDeviceSynchronize() == cudaSuccess);

    // Copy back and print the 2x2 result.
    std::vector<float> outData(4);
    CHECK(cudaMemcpy(outData.data(), outPtr, 4 * sizeof(float), cudaMemcpyDeviceToHost) == cudaSuccess);
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            std::cout << outData[i * 2 + j] << " ";
        }
        std::cout << std::endl;
    }

    CHECK(cudaFree(inAPtr) == cudaSuccess);
    CHECK(cudaFree(inBPtr) == cudaSuccess);
    CHECK(cudaFree(outPtr) == cudaSuccess);
    CHECK(cudaFree(workspacePtr) == cudaSuccess);
    CHECK(cudnnDestroy(handle) == CUDNN_STATUS_SUCCESS);
}
The program is compiled as nvcc example.cpp -lcudnn -o example
It sums a tensor of shape 1 × 2 × 2 and a tensor of shape 1 × 2 × 1. I expect broadcasting, so the expected result is [[[1, 2], [3, 4]]] + [[[10], [20]]] = [[[11, 12], [23, 24]]].
. However the program prints
13 14
23 24
so it appears to broadcast only the second row of the first matrix (3, 4) against the second tensor, instead of broadcasting row-wise as expected.
Expected behavior
I expect program to print
11 12
23 24
System Environment (please complete the following information):
- cudnn_frontend version: 9f8cc9ae049f099b55c558e4e4d8e75b38ac6b94 commit, technically v1.6.1
- cudnn_backend version: v9.4.0
- GPU arch: H100
- cuda runtime version: 12.1
- cuda driver version: 535.161.08
- host compiler: nvcc
cuda_12.1.r12.1/compiler.32688072_0
- OS: ubuntu22.04
API logs
To Reproduce
nvcc example.cpp -lcudnn -o example && ./example
Additional context
I looked briefly at the backend logs and did not see anything suspicious there, so the problem is probably not on the frontend side but in cuDNN itself.