opencv_contrib
opencv_contrib copied to clipboard
cv::cuda::StereoSGM giving different result than cv::StereoSGBM
I compiled OpenCV 4.11 with CUDA support on Windows. I used the following code to compare the GPU and CPU versions of the semi-global stereo matcher (cv::cuda::StereoSGM vs. cv::StereoSGBM):
#include <opencv2/opencv.hpp>
#include <opencv2/cudastereo.hpp>
#include <iostream>
#include <chrono>
// Runs cv::cuda::StereoSGM and cv::StereoSGBM on the same stereo pair with the
// same nominal parameters, prints the elapsed time of each in milliseconds,
// then shows and saves both disparity maps and their absolute difference.
// Returns -1 if either input image cannot be loaded, 0 otherwise.
int main() {
    // Load stereo images
    cv::Mat left = cv::imread("C:/atp9_scripted/renders/v7/pipeline_v2_recon03_12_2025_10_16_32/3_aligned/shadowless/sample.1/imL.png");
    cv::Mat right = cv::imread("C:/atp9_scripted/renders/v7/pipeline_v2_recon03_12_2025_10_16_32/3_aligned/shadowless/sample.1/imR.png");

    // Validate BEFORE any processing: cvtColor on an empty Mat throws, so the
    // check must come first for this error path to be reachable.
    if (left.empty() || right.empty()) {
        std::cerr << "Error loading images" << std::endl;
        return -1;
    }
    cv::cvtColor(left, left, cv::COLOR_BGR2GRAY);
    cv::cvtColor(right, right, cv::COLOR_BGR2GRAY);

    // Parameters shared by both matchers.
    const int minDisparity = 0;
    const int numDisparities = 64;
    // NOTE(review): the StereoSGBM documentation asks for an odd blockSize;
    // 8 is kept here to preserve the original experiment -- confirm.
    const int blockSize = 8;
    const int uniquenessRatio = 5;
    const int speckleRange = 1;
    const int speckleWindowSize = 50;
    const int preFilterCap = 25;
    const int disp12MaxDiff = 1;
    const int p1 = 8 * 3 * (blockSize / 2) * (blockSize / 2);
    const int p2 = 32 * 3 * (blockSize / 2) * (blockSize / 2);

    // Floating-point scale so that a disparity of 40 maps to 255. The integer
    // expression 255 / 40 truncates to 6 and would map 40 to 240 instead.
    const double displayScale = 255.0 / 40.0;

    // Convert a pair of time points to milliseconds independently of the
    // clock's native tick period (which is implementation-defined).
    using Clock = std::chrono::high_resolution_clock;
    const auto elapsedMs = [](Clock::time_point from, Clock::time_point to) {
        return std::chrono::duration<double, std::milli>(to - from).count();
    };

    // Create CUDA StereoSGM
    auto stereo = cv::cuda::createStereoSGM(
        minDisparity,
        numDisparities,
        p1,
        p2,
        uniquenessRatio,
        cv::cuda::StereoSGM::MODE_HH4
    );
    // NOTE(review): whether the CUDA matcher honours these StereoSGBM-style
    // setters is not visible here -- confirm against the cv::cuda::StereoSGM
    // documentation before relying on them for a like-for-like comparison.
    stereo->setBlockSize(blockSize);
    stereo->setDisp12MaxDiff(disp12MaxDiff);
    stereo->setPreFilterCap(preFilterCap);
    stereo->setSpeckleRange(speckleRange);
    stereo->setSpeckleWindowSize(speckleWindowSize);

    // Compute disparity on the GPU. The timed span deliberately covers the
    // host->device upload and the device->host download as well.
    const auto gpuStart = Clock::now();
    cv::cuda::GpuMat d_left, d_right, d_disp;
    d_left.upload(left);
    d_right.upload(right);
    stereo->compute(d_left, d_right, d_disp);
    // Download result
    cv::Mat disparity;
    d_disp.download(disparity);
    const auto gpuEnd = Clock::now();
    std::cout << elapsedMs(gpuStart, gpuEnd) << " ms" << std::endl;

    // Scale to float by 1/16, then clear negative and NaN entries.
    disparity.convertTo(disparity, CV_32F);
    disparity /= 16.0;
    disparity.setTo(0, disparity < 0);
    cv::patchNaNs(disparity, 0);

    // CPU reference matcher with the same nominal parameters.
    cv::Ptr<cv::StereoSGBM> stereoLeft = cv::StereoSGBM::create(
        minDisparity,
        numDisparities,
        blockSize,
        p1,
        p2,
        disp12MaxDiff,
        preFilterCap,
        uniquenessRatio,
        speckleWindowSize,
        speckleRange,
        cv::StereoSGBM::MODE_HH4 // mode
    );

    // Compute disparity for left image on the CPU
    cv::Mat dispL;
    const auto cpuStart = Clock::now();
    stereoLeft->compute(left, right, dispL);
    const auto cpuEnd = Clock::now();
    std::cout << elapsedMs(cpuStart, cpuEnd) << " ms" << std::endl;

    dispL.convertTo(dispL, CV_32F);
    dispL /= 16.0;
    dispL.setTo(0, dispL < 0);
    cv::patchNaNs(dispL, 0);

    // Absolute difference must be taken while both maps are still CV_32F.
    cv::Mat diff;
    cv::absdiff(disparity, dispL, diff);
    diff.convertTo(diff, CV_8U, displayScale);
    cv::imshow("diff", diff);

    dispL.convertTo(dispL, CV_8U, displayScale);
    cv::imshow("dispL", dispL);

    // Scale the GPU result for display
    cv::Mat disp8;
    disparity.convertTo(disp8, CV_8U, displayScale);
    cv::imshow("Disparity", disp8);
    cv::waitKey(0);

    cv::imwrite("../disp_gpu.png", disp8);
    cv::imwrite("../disp_cpu.png", dispL);
    cv::imwrite("../disp_diff.png", diff);
    return 0;
}
The result is below:
The first image is the GPU result, the second is the CPU result, and the last is their absolute difference. The speedup is noticeable: the GPU version is up to 10 times faster. If I use the default penalties for the GPU version, the result is reasonable, as you can see below:
However, my concern is that with exactly the same parameters we get very different results. Is the CUDA version supposed to produce the same output as the CPU version? Are the P1 and P2 penalties the same, or do they mean different things? The input images are below in case you want to test it.