oneMKL icon indicating copy to clipboard operation
oneMKL copied to clipboard

[DFT][cuFFT] Invalid strides error on CUDA

Open vlad-perevezentsev opened this issue 8 months ago • 1 comments

Summary

When performing DPF transform with oneapi::mkl::dft on NVIDIA GPU for multidimensional arrays >=2D where the last dimension is 1 [(2,1), (5,1), (2,3,1)....] The computation fails with oneMKL: dft/backends/cufft/commit: Invalid strides

This problem does not occur when running on Intel.

Version

I am using the latest release version v0.6

Environment

  • HW you use : NVIDIA A100 80GB PCIe
  • Backend library version: Build cuda_12.4.r12.4/compiler.33961263_0
  • OS name and version : Linux Ubuntu 24.04 LTS 6.8.0-51-generic
  • Compiler version : Intel(R) oneAPI DPC++/C++ Compiler 2025.0.4 (2025.0.4.20241205)
$ sycl-ls
[opencl:cpu][opencl:0] Intel(R) OpenCL, Intel(R) Xeon(R) Platinum 8480+ OpenCL 3.0 (Build 0) [2024.18.12.0.05_160000]
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100 80GB PCIe 8.0 [CUDA 12.7]

Steps to reproduce

Building:

cmake .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DMKL_ROOT=${MKLROOT} -DBUILD_FUNCTIONAL_TESTS=OFF              \
    -DENABLE_CUBLAS_BACKEND=True  -DENABLE_CUSOLVER_BACKEND=True -DENABLE_CURAND_BACKEND=True -DENABLE_CUFFT_BACKEND=True

# build
cmake --build .

# install
export MKL_INTERFACE_ROOT=$(pwd)/../install
cmake --install . --prefix=${MKL_INTERFACE_ROOT}

Compiling:

icpx -fsycl fft_repro.cpp -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown -I${MKL_INTERFACE_ROOT}/include
\
-L${MKL_INTERFACE_ROOT}/lib -lonemkl -Wl,-rpath,${MKL_INTERFACE_ROOT}/lib -o run_repro

Running:

ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro
ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro 
Reproducer here
#include <iostream>
#include <vector>
#include <sycl/sycl.hpp>
#include <oneapi/mkl/dft.hpp>
#include <complex>

int main() {
    namespace dft = oneapi::mkl::dft;
    sycl::queue q(sycl::default_selector{});

    auto device = q.get_device();
    std::cout << "Device name: " << device.get_info<sycl::info::device::name>() << "\n";

    std::vector<std::int64_t> shape = {3, 1};
    const std::int64_t size = shape[0] * shape[1];

    std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});
    std::vector<std::complex<float>> h_output(size);

    std::complex<float>* d_data = sycl::malloc_device<std::complex<float>>(size, q);
    std::complex<float>* d_output = sycl::malloc_device<std::complex<float>>(size, q);

    q.memcpy(d_data, h_data.data(), size * sizeof(std::complex<float>)).wait();

    dft::descriptor<dft::precision::SINGLE, dft::domain::COMPLEX> desc(shape);

    std::vector<std::int64_t> fwd_strides = {0, 1, 1};
    std::vector<std::int64_t> bwd_strides = {0, 1, 1};

    desc.set_value(dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE);
    desc.set_value(dft::config_param::NUMBER_OF_TRANSFORMS, 1);
    // desc.set_value(dft::config_param::FWD_STRIDES, fwd_strides.data());
    // desc.set_value(dft::config_param::BWD_STRIDES, bwd_strides.data());

    desc.commit(q);

    sycl::event fft_event = dft::compute_forward(desc, d_data, d_output);
    fft_event.wait();

    q.memcpy(h_output.data(), d_output, size * sizeof(std::complex<float>)).wait();

    std::cout << "FFT Result:\n";
    for (std::int64_t i = 0; i < size; i++) {
        std::cout << h_output[i] << "\n";
    }

    sycl::free(d_data, q);
    sycl::free(d_output, q);

    return 0;
}

Observed behavior

When executing the attached code on NVIDIA the program fails with an error

Example output

$ ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro 

Device name: NVIDIA A100 80GB PCIe
terminate called after throwing an instance of 'oneapi::mkl::exception'
  what():  oneMKL: dft/backends/cufft/commit: Invalid strides.
Aborted (core dumped)

However, running the same code on Intel works as expected

$ ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro 

Device name: Intel(R) Xeon(R) Platinum 8480+
FFT Result:
(3,0)
(0,0)
(0,0)

Expected behavior

Similar implementation that uses cuFFT directly works correctly

$ ./fft_cuda 
FFT Result:
(3, 0)
(0, 0)
(0, 0)
Reproducer here
nvcc -o fft_cuda fft_cuda.cpp -lcufft
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cufft.h>
#include <complex>


int main() {
    const int NX = 1;
    const int NY = 3;
    const int size = NX * NY;

    std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});

    cufftComplex* d_data;
    cudaMalloc(reinterpret_cast<void**>(&d_data), size * sizeof(cufftComplex));

    std::vector<cufftComplex> h_cufft_data(size);
    for (int i = 0; i < size; i++) {
        h_cufft_data[i].x = std::real(h_data[i]);
        h_cufft_data[i].y = std::imag(h_data[i]);
    }

    cudaMemcpy(d_data, h_cufft_data.data(), size * sizeof(cufftComplex), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan2d(&plan, NY, NX, CUFFT_C2C);

    cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);

    cudaMemcpy(h_cufft_data.data(), d_data, size * sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    std::cout << "FFT Result:\n";
    for (int i = 0; i < size; i++) {
        std::cout << "(" << h_cufft_data[i].x << ", " << h_cufft_data[i].y << ")\n";
    }

    cufftDestroy(plan);
    cudaFree(d_data);

    return 0;
}

vlad-perevezentsev avatar Feb 18 '25 13:02 vlad-perevezentsev