cudnn-frontend
cudnn-frontend copied to clipboard
Wrong result of tensor addition with broadcasting
Describe the bug: I run the following code:
#include "cudnn-frontend/include/cudnn_frontend.h"

#include <cuda_runtime.h>
#include <cudnn.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <unordered_map>
#include <vector>

namespace fe = cudnn_frontend;

// Abort on failure even when NDEBUG is defined. A plain assert() would be
// compiled out in release builds, silently removing the side-effecting
// CUDA/cuDNN calls wrapped inside it and breaking the reproduction.
#define CHECK(cond)                                                       \
    do {                                                                  \
        if (!(cond)) {                                                    \
            std::fprintf(stderr, "CHECK failed: %s (%s:%d)\n", #cond,     \
                         __FILE__, __LINE__);                             \
            std::abort();                                                 \
        }                                                                 \
    } while (0)

// Minimal reproduction: element-wise ADD of a 1x2x2 tensor A and a 1x2x1
// tensor B. B should broadcast along the innermost dimension, giving
// [[[11, 12], [23, 24]]]; the observed output is [[[13, 14], [23, 24]]].
int main()
{
    cudnnHandle_t handle;
    CHECK(cudnnCreate(&handle) == CUDNN_STATUS_SUCCESS);

    std::vector<float> A = {1.0, 2.0, 3.0, 4.0}; // |1.0 2.0|
                                                 // |3.0 4.0|
    std::vector<float> B = {10.0, 20.0};         // |10.0|
                                                 // |20.0|

    // Build the graph: out = A + B (pointwise ADD with broadcasting on B).
    auto graph = std::make_shared<fe::graph::Graph>();
    auto inA = graph->tensor(
        fe::graph::Tensor_attributes()
            .set_name("A")
            .set_dim({1, 2, 2})
            .set_stride({4, 2, 1}) // fully packed 1x2x2
            .set_data_type(fe::DataType_t::FLOAT));
    auto inB = graph->tensor(
        fe::graph::Tensor_attributes()
            .set_name("B")
            .set_dim({1, 2, 1})
            .set_stride({2, 1, 1}) // packed 1x2x1; innermost dim broadcasts
            .set_data_type(fe::DataType_t::FLOAT));
    auto out = graph->pointwise(
        inA,
        inB,
        fe::graph::Pointwise_attributes()
            .set_mode(fe::PointwiseMode_t::ADD)
            .set_compute_data_type(fe::DataType_t::FLOAT));
    out->set_data_type(fe::DataType_t::FLOAT);
    out->set_output(true);

    // Validate, lower, and compile an execution plan.
    CHECK(graph->validate().is_good());
    CHECK(graph->build_operation_graph(handle).is_good());
    CHECK(graph->create_execution_plans({fe::HeurMode_t::A}).is_good());
    CHECK(graph->build_plans(handle, fe::BuildPlanPolicy_t::ALL).is_good());

    // Device buffers for the two inputs, the output, and the workspace.
    void* inAPtr;
    CHECK(cudaMalloc(&inAPtr, 4 * sizeof(float)) == cudaSuccess);
    CHECK(cudaMemcpy(inAPtr, A.data(), 4 * sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
    void* inBPtr;
    CHECK(cudaMalloc(&inBPtr, 2 * sizeof(float)) == cudaSuccess);
    CHECK(cudaMemcpy(inBPtr, B.data(), 2 * sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
    void* outPtr;
    CHECK(cudaMalloc(&outPtr, 4 * sizeof(float)) == cudaSuccess);
    void* workspacePtr;
    CHECK(cudaMalloc(&workspacePtr, graph->get_workspace_size()) == cudaSuccess);

    // Bind device pointers to graph tensors and run.
    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> tensorMap;
    tensorMap[inA] = inAPtr;
    tensorMap[inB] = inBPtr;
    tensorMap[out] = outPtr;
    auto r = graph->execute(handle, tensorMap, workspacePtr);
    if (!r.is_good()) {
        std::cerr << r.get_message() << std::endl;
    }
    CHECK(cudaDeviceSynchronize() == cudaSuccess);

    // Copy back and print the 2x2 result.
    std::vector<float> outData(4);
    CHECK(cudaMemcpy(outData.data(), outPtr, 4 * sizeof(float), cudaMemcpyDeviceToHost) == cudaSuccess);
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            std::cout << outData[i * 2 + j] << " ";
        }
        std::cout << std::endl;
    }

    CHECK(cudaFree(inAPtr) == cudaSuccess);
    CHECK(cudaFree(inBPtr) == cudaSuccess);
    CHECK(cudaFree(outPtr) == cudaSuccess);
    CHECK(cudaFree(workspacePtr) == cudaSuccess);
    CHECK(cudnnDestroy(handle) == CUDNN_STATUS_SUCCESS);
}
The program is compiled as nvcc example.cpp -lcudnn -o example
It sums a tensor of shape 1 × 2 × 2 and a tensor of shape 1 × 2 × 1. I expect broadcasting, so the expected result is [[[1, 2], [3, 4]]] + [[[10], [20]]] = [[[11, 12], [23, 24]]].
. However the program prints
13 14
23 24
so it appears to broadcast only the second row of the first matrix (3, 4) against the second tensor, instead of broadcasting row-wise as expected.
Expected behavior
I expect program to print
11 12
23 24
System Environment (please complete the following information):
- cudnn_frontend version: 9f8cc9ae049f099b55c558e4e4d8e75b38ac6b94 commit, technically v1.6.1
- cudnn_backend version: v9.4.0
- GPU arch: H100
- cuda runtime version: 12.1
- cuda driver version: 535.161.08
- host compiler: nvcc
cuda_12.1.r12.1/compiler.32688072_0
- OS: ubuntu22.04
API logs
To Reproduce
nvcc example.cpp -lcudnn -o example && ./example
Additional context
I looked briefly at the backend logs and did not see anything suspicious there, so the problem is probably not on the frontend side but in cuDNN itself.