CV-CUDA [BUG] pillowresize does not match transforms.Resize with BILINEAR, and resize does not match cv2.resize

Describe the bug: The CV-CUDA results cannot be aligned with the reference results.

Steps/Code to reproduce bug

I use the same image for both pipelines and compare the results with MSE.


def np_to_cuda_buffer(host_data, dtype=None) -> torch.Tensor:
    """Copy host data into a ``torch.Tensor`` on the CUDA device.

    Args:
        host_data: Host data accepted by ``torch.as_tensor`` (for example a
            numpy array or a tuple of numbers).
        dtype: Optional dtype, converted with ``to_torch_dtype``. ``None``
            keeps the dtype that ``torch.as_tensor`` infers.

    Returns:
        torch.Tensor: The data placed on the ``"cuda"`` device.
    """
    # Compare against None instead of relying on truthiness: non-structured
    # ``np.dtype`` instances (e.g. ``np.dtype("float32")``) are falsy, so
    # ``if dtype`` would silently drop an explicitly requested dtype.
    torch_dtype = to_torch_dtype(dtype) if dtype is not None else None
    # ``device="cuda"`` already places the tensor on the GPU, so no extra
    # ``.cuda()`` call is needed.
    return torch.as_tensor(host_data, dtype=torch_dtype, device="cuda")


class CVCudaUtils:
    """Image preprocessing helpers (resize + normalize) built on CV-CUDA."""

    # Stream on which every CV-CUDA operator call in this class is enqueued.
    stream = cvcuda.Stream()
    # torchvision interpolation modes -> CV-CUDA interpolation enum.
    PIL_INTERPOLATION_MODE = {
        InterpolationMode.NEAREST: cvcuda.Interp.NEAREST,
        InterpolationMode.BILINEAR: cvcuda.Interp.LINEAR,
        InterpolationMode.BICUBIC: cvcuda.Interp.CUBIC,
        InterpolationMode.BOX: cvcuda.Interp.BOX,
        InterpolationMode.HAMMING: cvcuda.Interp.HAMMING,
        InterpolationMode.LANCZOS: cvcuda.Interp.LANCZOS,
    }

    @staticmethod
    def _target_size(h, w, resize):
        """Return the ``(new_h, new_w)`` output size for a resize request.

        Args:
            h: Source height in pixels.
            w: Source width in pixels.
            resize: Either an ``int`` (target length of the shorter edge,
                aspect ratio preserved) or a 2-element tuple/list
                ``(new_h, new_w)``.

        Raises:
            ValueError: If ``resize`` is neither of the supported forms.
        """
        if isinstance(resize, int):
            # Same rule as torchvision's Resize: the shorter edge becomes
            # exactly ``resize`` and the longer edge is
            # ``int(resize * long / short)``. Scaling both edges by a float
            # factor and truncating can leave the short edge at ``resize - 1``.
            short, long = (h, w) if h <= w else (w, h)
            new_long = int(resize * long / short)
            return (resize, new_long) if h <= w else (new_long, resize)
        if isinstance(resize, (tuple, list)) and len(resize) == 2:
            new_h, new_w = resize
            return new_h, new_w
        raise ValueError(f"Invalid resize: {resize}")

    @classmethod
    def _normalize(cls, cv_tensor, mean_params=(0.485, 0.456, 0.406), std_params=(0.229, 0.224, 0.225),
                   dtype=np.float32):
        """Scale, normalize and re-layout an HWC image tensor.

        Args:
            cv_tensor: CV-CUDA tensor in HWC layout.
            mean_params: Three per-channel means subtracted after scaling
                by ``1 / 255``.
            std_params: Three per-channel standard deviations.
            dtype: dtype of the returned array.

        Returns:
            A CuPy array in CHW layout with a leading batch axis, cast to
            ``dtype``.
        """
        # https://github.com/CVCUDA/CV-CUDA/issues/260 Only use float32 work
        mean_cp = np_to_cuda_buffer(mean_params, dtype=np.float32).reshape(1, 1, 3)
        std_cp = np_to_cuda_buffer(std_params, dtype=np.float32).reshape(1, 1, 3)
        mean_tensor = cvcuda.as_tensor(mean_cp, nvcv.TensorLayout.HWC)
        std_tensor = cvcuda.as_tensor(std_cp, nvcv.TensorLayout.HWC)

        # Convert to float32 while scaling the values by 1/255.
        cv_tensor = cvcuda.convertto(cv_tensor, nvcv.Type.F32, scale=1.0 / 255.0, stream=cls.stream)

        # Normalize with the per-channel mean and std (shape 1x1x3).
        cv_tensor = cvcuda.normalize(cv_tensor,
                                     base=mean_tensor,
                                     scale=std_tensor,
                                     flags=cvcuda.NormalizeFlags.SCALE_IS_STDDEV, stream=cls.stream)

        # Rearrange dimensions from HWC to CHW.
        cv_tensor = cvcuda.reformat(cv_tensor, nvcv.TensorLayout.CHW, stream=cls.stream)

        # The operators above were enqueued on ``cls.stream``; wait for them
        # before CuPy reads the buffer on its own current stream.
        cls.stream.sync()

        # Add the batch axis and cast to the requested dtype.
        return cp.asarray(cv_tensor.cuda())[None].astype(np.dtype(dtype))

    @classmethod
    def resize_normalize(cls, img, resize, interp=InterpolationMode.BILINEAR,
                         mean_params=(0.485, 0.456, 0.406),
                         std_params=(0.229, 0.224, 0.225),
                         dtype=np.float32):
        """Resize an HWC image with ``cvcuda.pillowresize`` and normalize it.

        Args:
            img: Host image data in HWC layout; it is uploaded with
                ``np_to_cuda_buffer`` and resized as ``cvcuda.Format.RGB8``.
            resize: ``int`` (shorter edge is scaled to this length, aspect
                ratio preserved) or a 2-element ``(new_h, new_w)``.
            interp: Key of ``PIL_INTERPOLATION_MODE``.
            mean_params: Per-channel means forwarded to ``_normalize``.
            std_params: Per-channel stds forwarded to ``_normalize``.
            dtype: dtype of the returned array.

        Returns:
            The array produced by ``_normalize``.

        Raises:
            ValueError: If ``resize`` has an unsupported form.
        """
        cv_tensor = cvcuda.as_tensor(np_to_cuda_buffer(img), nvcv.TensorLayout.HWC)
        h, w, _ = cv_tensor.shape
        new_h, new_w = cls._target_size(h, w, resize)

        cv_tensor = cvcuda.pillowresize(cv_tensor, (new_h, new_w, 3), format=cvcuda.Format.RGB8,
                                        interp=cls.PIL_INTERPOLATION_MODE[interp], stream=cls.stream)
        return cls._normalize(cv_tensor, mean_params, std_params, dtype)







def mse(a, b):
    """Print comparison statistics for two arrays and check their MSE.

    Prints both shapes, the mean squared error, the maximum absolute
    difference, the ``np.allclose`` result and the ``TestUtils.equal``
    result.

    Args:
        a: First numpy array.
        b: Second numpy array, compared element-wise against ``a``.

    Returns:
        ``True`` when the mean squared error is below ``1e-6``.
    """
    print(a.shape)
    print(b.shape)
    atol = 1e-6
    delta = a.astype(np.float32) - b.astype(np.float32)
    diff = np.mean(delta ** 2)
    # Take the maximum on the float difference. Casting to int32 first
    # truncates the fractional part, so float inputs only ever report
    # whole numbers.
    max_diff = np.abs(delta).max()
    all_close = np.allclose(a, b, atol=atol)
    print(
        f'MSE: {diff}, | Max diff: {max_diff}| abs_diff {atol}: {all_close} | abs_equal| {TestUtils.equal(a, b, logger=logger)}')
    return diff < atol


def test_resize_normalize_cvcuda():
    """Compare torchvision preprocessing with ``CVCudaUtils.resize_normalize``.

    The same RGB image goes through a torchvision Resize/ToTensor/Normalize
    pipeline and through the CV-CUDA helper; the value returned by ``mse``
    for the two outputs is printed.
    """
    image_url = 'xxxxx'
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    resize = (768, 512)
    # resize = 320
    dtype = np.float16

    rgb = cv2.cvtColor(get_image(image_url), cv2.COLOR_BGR2RGB)

    def torch_preprocess(img):
        """Reference path: torchvision Resize -> ToTensor -> Normalize."""
        pipeline = transforms.Compose([
            transforms.Resize(size=resize),
            transforms.ToTensor(),
            transforms.Normalize(mean, std),
        ])
        batched = pipeline(pil_utils.exif_image(img)).unsqueeze(0)
        return batched.numpy().astype(dtype)

    def cvcuda_preprocess(img):
        """Path under test: CV-CUDA resize + normalize."""
        return CVCudaUtils.resize_normalize(img, resize=resize, dtype=dtype)

    expected = torch_preprocess(rgb)
    actual = cvcuda_preprocess(rgb)
    print(mse(expected, actual.get()))


torch_preprocess()

When I use resize=320, the results align, but I have to calculate new_h and new_w myself, which is ridiculous.

(1, 3, 320, 320)
(1, 3, 320, 320)
MSE: 2.348394367857054e-08, | Max diff: 0| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-01 03:50:11.150 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
True

But with resize = (768, 512), the MSE becomes larger than 1e-6, and this affects my model results.


(1, 3, 768, 512)
(1, 3, 768, 512)
MSE: 2.0567511000990635e-06, | Max diff: 1| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-01 03:51:06.764 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
False

Expected behavior https://developer.nvidia.com/zh-cn/blog/cv-cuda-high-performance-image-processing/

I read this blog, which says that CV-CUDA can replace libraries such as OpenCV and torchvision and that its results are aligned with theirs, but that is not what I observe.

Environment overview (please complete the following information)

Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
Method of cuDF install: [Docker, pip, or from source]
- If method of install is [Docker], provide docker pull & docker run commands used pip install cvcuda-cu12==0.15.0 Environment details Please run and paste the output of the cvcuda/print_env.sh script here, to gather any other relevant environment details

Additional context Add any other context about the problem here.

Aug 01 '25 03:08 631068264

The same mismatch occurs when using cvcuda.resize:

# Maps OpenCV interpolation flags to the corresponding cvcuda.Interp member,
# so one flag can select the interpolation for both cv2.resize and cvcuda.resize.
CV_INTERPOLATION_MODE = {
    cv2.INTER_NEAREST: cvcuda.Interp.NEAREST,
    cv2.INTER_LINEAR: cvcuda.Interp.LINEAR,
    cv2.INTER_CUBIC: cvcuda.Interp.CUBIC,
    cv2.INTER_AREA: cvcuda.Interp.AREA,
}

def test_cv_resize_normalize_cvcuda():
    """Compare ``cv2.resize`` with ``cvcuda.resize`` on the same RGB image.

    Prints the NVCV cache limit, then the value returned by ``mse`` for
    the two resized outputs.
    """
    image_url = 'xxxxx'
    interp = cv2.INTER_LINEAR
    # interp = cv2.INTER_CUBIC
    # resize = (768, 512)
    resize = (320, 320)

    rgb = cv2.cvtColor(get_image(image_url), cv2.COLOR_BGR2RGB)
    print(nvcv.get_cache_limit_inbytes())

    def opencv_preprocess(img):
        """Reference path: OpenCV resize on the host."""
        return cv2.resize(img, resize, interpolation=interp)

    def cvcuda_preprocess(img):
        """Path under test: CV-CUDA resize, returned as a CuPy array."""
        gpu_tensor = cvcuda.as_tensor(np_to_cuda_buffer(img), nvcv.TensorLayout.HWC)
        # cv2.resize takes (width, height); the HWC tensor shape is
        # (height, width, channels), hence the swapped indices.
        out_shape = (resize[1], resize[0], 3)
        resized = cvcuda.resize(gpu_tensor, out_shape,
                                interp=CV_INTERPOLATION_MODE[interp])
        return cp.asarray(resized.cuda())

    expected = opencv_preprocess(rgb)
    actual = cvcuda_preprocess(rgb)
    print(mse(expected, actual.get()))

The results do not align at all:

root@9833b2254c4b:/workspace# python3 test_utils.py 
25448022016
(512, 768, 3)
(512, 768, 3)
MSE: 0.10085296630859375, | Max diff: 1| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-02 05:34:42.503 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
False

root@9833b2254c4b:/workspace# python3 test_utils.py 
25448022016
(320, 320, 3)
(320, 320, 3)
MSE: 0.06836914271116257, | Max diff: 1| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-02 05:35:25.278 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
False

Aug 02 '25 05:08 631068264

Hi @631068264

Do you have more information about your environment such as the hardware/OS? You can use the cvcuda/print_env.sh script for this.

Looking at the output you report, I notice that there is a maximum difference of 1 at a per-pixel level between the resize operations you test. We expect that, due to precision differences, there may be small differences in output.

Do you have more context on your application which requires 1:1 correspondence?

Nov 14 '25 18:11 justincdavis