opencv_contrib icon indicating copy to clipboard operation
opencv_contrib copied to clipboard

cv::cuda::StereoSGM giving different result than cv::StereoSGBM

Open Jerry-Master opened this issue 8 months ago • 0 comments

I compiled OpenCV 4.11 with CUDA support on Windows and tried the following code to compare the GPU and CPU versions of the stereo SGBM matcher:

#include <opencv2/opencv.hpp>
#include <opencv2/cudastereo.hpp>
#include <iostream>
#include <chrono>

int main() {
    // Load stereo images
    cv::Mat left = cv::imread("C:/atp9_scripted/renders/v7/pipeline_v2_recon03_12_2025_10_16_32/3_aligned/shadowless/sample.1/imL.png");
    cv::Mat right = cv::imread("C:/atp9_scripted/renders/v7/pipeline_v2_recon03_12_2025_10_16_32/3_aligned/shadowless/sample.1/imR.png");
    cv::cvtColor(left, left, cv::COLOR_BGR2GRAY);
    cv::cvtColor(right, right, cv::COLOR_BGR2GRAY);

    if (left.empty() || right.empty()) {
        std::cerr << "Error loading images" << std::endl;
        return -1;
    }

    // Upload to GPU

    // Create CUDA StereoSGBM
    int minDisparity = 0;
    int blockSize = 8;
    int uniquenessRatio = 5;
    int speckleRange = 1;
    int spleckleWindowSize = 50;
    int preFilterCap = 25;

    auto stereo = cv::cuda::createStereoSGM(
        minDisparity,
        64,
        8 * 3 * (blockSize / 2) * (blockSize / 2), // P1
        32 * 3 * (blockSize / 2) * (blockSize / 2), // P2
        uniquenessRatio,
        cv::cuda::StereoSGM::MODE_HH4
    );
    stereo->setBlockSize(blockSize);
    stereo->setDisp12MaxDiff(1);
    stereo->setPreFilterCap(preFilterCap);
    stereo->setSpeckleRange(speckleRange);
    stereo->setSpeckleWindowSize(spleckleWindowSize);
    stereo->setPreFilterCap(preFilterCap);

    // Compute disparity
    auto start = std::chrono::high_resolution_clock::now();
    cv::cuda::GpuMat d_left, d_right, d_disp;
    d_left.upload(left);
    d_right.upload(right);
    stereo->compute(d_left, d_right, d_disp);

    // Download result
    cv::Mat disparity;
    d_disp.download(disparity);
    auto end = std::chrono::high_resolution_clock::now();
    double elapsed = (double)(end - start).count();
    std::cout << elapsed / 1000000 << " ms" << std::endl;
    disparity.convertTo(disparity, CV_32F);
    disparity /= 16.0;
    disparity.setTo(0, disparity < 0);
    cv::patchNaNs(disparity, 0);

    cv::Ptr<cv::StereoSGBM> stereoLeft = cv::StereoSGBM::create(
        0, // minDisparity
        64,
        blockSize,
        8 * 3 * (blockSize / 2) * (blockSize / 2), // P1
        32 * 3 * (blockSize / 2) * (blockSize / 2), // P2
        1, // disp12MaxDiff
        preFilterCap,
        uniquenessRatio,
        spleckleWindowSize,
        speckleRange,
        cv::StereoSGBM::MODE_HH4 // mode
    );
    // Compute disparity for left image
    cv::Mat dispL;
    start = std::chrono::high_resolution_clock::now();
    stereoLeft->compute(left, right, dispL);
    end = std::chrono::high_resolution_clock::now();
    elapsed = (double)(end - start).count();
    std::cout << elapsed / 1000000 << " ms" << std::endl;
    dispL.convertTo(dispL, CV_32F);
    dispL /= 16.0;
    dispL.setTo(0, dispL < 0);
    cv::patchNaNs(dispL, 0);


    cv::Mat diff;
    cv::absdiff(disparity, dispL, diff);
    diff.convertTo(diff, CV_8U, 255 / 40);
    cv::imshow("diff", diff);

    dispL.convertTo(dispL, CV_8U, 255 / 40);
    cv::imshow("dispL", dispL);

    // Normalize and show
    cv::Mat disp8;
    disparity.convertTo(disp8, CV_8U, 255 / 40);
    cv::imshow("Disparity", disp8);
    cv::waitKey(0);

    cv::imwrite("../disp_gpu.png", disp8);
    cv::imwrite("../disp_cpu.png", dispL);
    cv::imwrite("../disp_diff.png", diff);

    return 0;
}

The result is below:

Image Image Image

The first image is the GPU result, the second is the CPU result, and the last is their absolute difference. The speedup is noticeable: the GPU version is up to 10 times faster. If I use the default penalties for the GPU version, the result is reasonable, as you can see below:

Image

However, my concern is that the exact same parameters give very different results. Is the CUDA version supposed to produce the same output as the CPU version? Do the P1 and P2 penalties mean the same thing in both, or something different? The input images are included below in case you want to test it.

Image Image

Jerry-Master avatar Mar 14 '25 11:03 Jerry-Master