[BUG] `pillowresize` does not match `transforms.Resize` in BILINEAR mode, and `resize` does not match `cv2.resize`
Describe the bug: The CV-CUDA outputs cannot be aligned with the reference implementations.
Steps/Code to reproduce bug
I use the same image for both pipelines and compare the results with MSE.
def np_to_cuda_buffer(host_data, dtype=None) -> torch.Tensor:
    """Copy host data into a torch tensor on the CUDA device.

    Args:
        host_data: Host data accepted by ``torch.as_tensor`` (callers in this
            file pass NumPy arrays and tuples of floats).
        dtype: Optional element type, translated with ``to_torch_dtype``.
            ``None`` keeps the dtype inferred from ``host_data``.

    Returns:
        torch.Tensor: The data as a tensor on the ``"cuda"`` device.
    """
    # Compare against None explicitly: a plain ``np.dtype`` instance can be
    # falsy (it defines ``__len__`` as its number of fields), so a truthiness
    # check could silently drop the requested dtype.
    torch_dtype = to_torch_dtype(dtype) if dtype is not None else None
    # ``device="cuda"`` already places the tensor on the GPU, so a trailing
    # ``.cuda()`` call would be a no-op.
    return torch.as_tensor(host_data, dtype=torch_dtype, device="cuda")
class CVCudaUtils:
    """GPU image preprocessing helpers built on CV-CUDA.

    Every CV-CUDA operator issued by this class runs on the shared
    class-level ``stream``.
    """

    # Shared CV-CUDA stream used by all operator calls below.
    stream = cvcuda.Stream()

    # Maps the ``InterpolationMode`` members accepted by ``resize_normalize``
    # to the corresponding CV-CUDA interpolation enum members.
    PIL_INTERPOLATION_MODE = {
        InterpolationMode.NEAREST: cvcuda.Interp.NEAREST,
        InterpolationMode.BILINEAR: cvcuda.Interp.LINEAR,
        InterpolationMode.BICUBIC: cvcuda.Interp.CUBIC,
        InterpolationMode.BOX: cvcuda.Interp.BOX,
        InterpolationMode.HAMMING: cvcuda.Interp.HAMMING,
        InterpolationMode.LANCZOS: cvcuda.Interp.LANCZOS,
    }

    @staticmethod
    def _target_size(h, w, resize):
        """Return the output ``(height, width)`` for a resize request.

        Args:
            h: Input height in pixels.
            w: Input width in pixels.
            resize: Either an ``int`` (target length of the shorter side,
                aspect ratio preserved) or a 2-item tuple/list
                ``(height, width)``.

        Raises:
            ValueError: If ``resize`` is neither of the accepted forms.
        """
        if isinstance(resize, int):
            # Same arithmetic as torchvision's ``Resize``: the shorter side
            # becomes exactly ``resize`` and the longer side is
            # ``int(resize * long / short)``. Multiplying each side by a
            # pre-computed float ratio can truncate to one pixel less.
            short, long = (h, w) if h <= w else (w, h)
            new_long = int(resize * long / short)
            return (resize, new_long) if h <= w else (new_long, resize)
        if isinstance(resize, (tuple, list)) and len(resize) == 2:
            new_h, new_w = resize
            return new_h, new_w
        raise ValueError(f"Invalid resize: {resize}")

    @classmethod
    def _normalize(cls, cv_tensor, mean_params=(0.485, 0.456, 0.406), std_params=(0.229, 0.224, 0.225),
                   dtype=np.float32):
        """Scale, normalize and re-lay-out an image tensor.

        The tensor is converted to float32 and multiplied by ``1/255``,
        normalized per channel with ``mean_params`` / ``std_params``,
        reformatted from HWC to CHW, and given a leading batch axis.

        Args:
            cv_tensor: CV-CUDA tensor in HWC layout (presumably 8-bit values
                in [0, 255], given the 1/255 scaling -- confirm with callers).
            mean_params: Per-channel mean.
            std_params: Per-channel standard deviation.
            dtype: NumPy dtype of the returned array.

        Returns:
            CuPy array of shape ``(1, C, H, W)`` with element type ``dtype``.
        """
        # https://github.com/CVCUDA/CV-CUDA/issues/260 -- only float32
        # mean/std parameters work with the normalize operator.
        mean_buf = np_to_cuda_buffer(mean_params, dtype=np.float32).reshape(1, 1, -1)
        std_buf = np_to_cuda_buffer(std_params, dtype=np.float32).reshape(1, 1, -1)
        mean_tensor = cvcuda.as_tensor(mean_buf, nvcv.TensorLayout.HWC)
        std_tensor = cvcuda.as_tensor(std_buf, nvcv.TensorLayout.HWC)

        # Convert to float32 and scale the values by 1/255.
        cv_tensor = cvcuda.convertto(cv_tensor, nvcv.Type.F32, scale=1.0 / 255.0, stream=cls.stream)

        # Normalize with mean and std (broadcast across height and width);
        # SCALE_IS_STDDEV marks ``scale`` as a standard deviation.
        cv_tensor = cvcuda.normalize(cv_tensor,
                                     base=mean_tensor,
                                     scale=std_tensor,
                                     flags=cvcuda.NormalizeFlags.SCALE_IS_STDDEV,
                                     stream=cls.stream)

        # Rearrange dimensions from HWC to CHW.
        cv_tensor = cvcuda.reformat(cv_tensor, nvcv.TensorLayout.CHW, stream=cls.stream)

        # The operators above were queued on ``cls.stream``; wait for them to
        # finish before CuPy reads the buffer from its own stream.
        cls.stream.sync()

        # Add the batch axis and cast to the requested dtype.
        return cp.asarray(cv_tensor.cuda())[None].astype(cp.dtype(str(np.dtype(dtype))))

    @classmethod
    def resize_normalize(cls, img, resize, interp=InterpolationMode.BILINEAR,
                         mean_params=(0.485, 0.456, 0.406),
                         std_params=(0.229, 0.224, 0.225),
                         dtype=np.float32):
        """Resize an RGB image with ``cvcuda.pillowresize`` and normalize it.

        Args:
            img: Host image in HWC layout; it is uploaded with
                ``np_to_cuda_buffer`` and resized as 3-channel ``RGB8``.
            resize: ``int`` to scale the shorter side to that length while
                keeping the aspect ratio, or ``(height, width)`` for an
                explicit output size.
            interp: Key of ``PIL_INTERPOLATION_MODE`` selecting the filter.
            mean_params: Per-channel mean forwarded to ``_normalize``.
            std_params: Per-channel std forwarded to ``_normalize``.
            dtype: NumPy dtype of the returned array.

        Returns:
            CuPy array of shape ``(1, 3, new_h, new_w)``.

        Raises:
            ValueError: If ``resize`` is not an int or a 2-item tuple/list.
        """
        cv_tensor = cvcuda.as_tensor(np_to_cuda_buffer(img), nvcv.TensorLayout.HWC)
        h, w, _ = cv_tensor.shape
        new_h, new_w = cls._target_size(h, w, resize)
        cv_tensor = cvcuda.pillowresize(cv_tensor, (new_h, new_w, 3), format=cvcuda.Format.RGB8,
                                        interp=cls.PIL_INTERPOLATION_MODE[interp], stream=cls.stream)
        return cls._normalize(cv_tensor, mean_params, std_params, dtype)
def mse(a, b):
    """Print comparison statistics for two arrays and test their MSE.

    Prints both shapes, then the mean squared error, the largest absolute
    element difference, the ``np.allclose`` result and the value returned by
    ``TestUtils.equal``.

    Args:
        a: First array.
        b: Second array (presumably the same shape as ``a`` -- the
            element-wise subtraction relies on it).

    Returns:
        Whether the mean squared error is below ``1e-6``.
    """
    print(a.shape)
    print(b.shape)
    atol = 1e-6
    # Take the difference once, in float32. Casting to an integer type first
    # would truncate float inputs (reporting a max diff of 0 for sub-unit
    # errors), while subtracting raw uint8 arrays would wrap around.
    delta = a.astype(np.float32) - b.astype(np.float32)
    diff = np.mean(delta ** 2)
    max_diff = np.abs(delta).max()
    abs_diff = np.allclose(a, b, atol=atol)
    print(
        f'MSE: {diff}, | Max diff: {max_diff}| abs_diff {atol}: {abs_diff} '
        f'| abs_equal| {TestUtils.equal(a, b, logger=logger)}')
    return diff < atol
def test_resize_normalize_cvcuda():
    """Compare torchvision and CV-CUDA resize + normalize on one image.

    Both pipelines receive the same RGB image and the same resize target,
    mean, std and output dtype; the two results are then passed to ``mse``
    and its verdict is printed.
    """
    image_url = 'xxxxx'
    img = get_image(image_url)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    # Explicit target size; use ``resize = 320`` to scale the shorter side
    # instead.
    resize = (768, 512)
    dtype = np.float16

    def torch_preprocess(img):
        """Reference path: torchvision Resize -> ToTensor -> Normalize."""
        img = pil_utils.exif_image(img)
        torch_transform = transforms.Compose([
            transforms.Resize(size=resize),
            transforms.ToTensor(),
            transforms.Normalize(mean, std),
        ])
        img = torch_transform(img)
        img = img.unsqueeze(0)
        return img.numpy().astype(dtype)

    def cvcuda_preprocess(img):
        """Path under test: CV-CUDA pillowresize + normalize."""
        # Pass mean/std explicitly so both paths are guaranteed to use the
        # same statistics rather than relying on matching defaults.
        return CVCudaUtils.resize_normalize(img, resize=resize,
                                            mean_params=mean, std_params=std,
                                            dtype=dtype)

    a = torch_preprocess(img)
    b = cvcuda_preprocess(img)
    print(mse(a, b.get()))
torch_preprocess()
When I use
resize=320
the outputs align, but I have to calculate new_h and new_w myself, which is inconvenient.
(1, 3, 320, 320)
(1, 3, 320, 320)
MSE: 2.348394367857054e-08, | Max diff: 0| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-01 03:50:11.150 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
True
But when I use resize = (768, 512), the MSE becomes larger than 1e-6, and this affects my model's results.
(1, 3, 768, 512)
(1, 3, 768, 512)
MSE: 2.0567511000990635e-06, | Max diff: 1| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-01 03:51:06.764 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
False
Expected behavior https://developer.nvidia.com/zh-cn/blog/cv-cuda-high-performance-image-processing/
I read this blog post, which says that CV-CUDA can replace libraries such as OpenCV and torchvision and that its results have been aligned with theirs, but that does not match what I observe.
Environment overview (please complete the following information)
- Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
- Method of cuDF install: [Docker, pip, or from source]
- If method of install is [Docker], provide
docker pull & docker run commands used: pip install cvcuda-cu12==0.15.0. Environment details: Please run and paste the output of the cvcuda/print_env.sh script here, to gather any other relevant environment details
- If method of install is [Docker], provide
Additional context Add any other context about the problem here.
The same mismatch occurs when using `resize`:
# Lookup table translating OpenCV interpolation flags into the CV-CUDA
# ``cvcuda.Interp`` members passed to ``cvcuda.resize``.
CV_INTERPOLATION_MODE = {
    cv2.INTER_NEAREST: cvcuda.Interp.NEAREST,
    cv2.INTER_LINEAR: cvcuda.Interp.LINEAR,
    cv2.INTER_CUBIC: cvcuda.Interp.CUBIC,
    cv2.INTER_AREA: cvcuda.Interp.AREA,
}
def test_cv_resize_normalize_cvcuda():
    """Compare ``cv2.resize`` with ``cvcuda.resize`` on the same image.

    Both paths resize the same RGB image to the same target size with the
    same interpolation mode; the two results are passed to ``mse`` and its
    verdict is printed. No normalization is applied on either side.
    """
    image_url = 'xxxxx'
    img = get_image(image_url)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    interp = cv2.INTER_LINEAR
    # Target size in cv2's (width, height) order; (768, 512) is the other
    # size that was tried.
    resize = (320, 320)
    current_cache_limit = nvcv.get_cache_limit_inbytes()
    print(current_cache_limit)

    def cv2_preprocess(img):
        """Reference path: plain OpenCV resize on the host."""
        return cv2.resize(img, resize, interpolation=interp)

    def cvcuda_preprocess(img):
        """Path under test: CV-CUDA resize on the GPU."""
        cv_tensor = cvcuda.as_tensor(np_to_cuda_buffer(img), nvcv.TensorLayout.HWC)
        # Keep the input's channel count instead of hard-coding 3.
        _, _, channels = cv_tensor.shape
        # cvcuda.resize takes the output shape as (H, W, C), so swap the
        # cv2-style (width, height) pair.
        out_w, out_h = resize
        cv_tensor = cvcuda.resize(cv_tensor,
                                  (out_h, out_w, channels),
                                  interp=CV_INTERPOLATION_MODE[interp])
        return cp.asarray(cv_tensor.cuda())

    a = cv2_preprocess(img)
    b = cvcuda_preprocess(img)
    print(mse(a, b.get()))
The outputs do not align for either size:
root@9833b2254c4b:/workspace# python3 test_utils.py
25448022016
(512, 768, 3)
(512, 768, 3)
MSE: 0.10085296630859375, | Max diff: 1| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-02 05:34:42.503 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
False
root@9833b2254c4b:/workspace# python3 test_utils.py
25448022016
(320, 320, 3)
(320, 320, 3)
MSE: 0.06836914271116257, | Max diff: 1| abs_diff 1e-06: False | abs_equal| None
SDK 2025-08-02 05:35:25.278 triton_sdk.framework.util - equal - line:68 INFO TestUtils equal False
False
Hi @631068264
Do you have more information about your environment such as the hardware/OS? You can use the cvcuda/print_env.sh script for this.
Looking at the output you report, I notice that there is a maximum difference of 1 at a per-pixel level between the resize operations you test. We expect that, due to precision differences, there may be small differences in output.
Do you have more context on your application which requires 1:1 correspondence?