Failure converting `torch.nn.functional.interpolate` from PyTorch to Core ML
I'm trying to convert my PyTorch model, which takes a flexible-size image and bounding-box coordinates as input, to a Core ML model, but the conversion fails. I believe the problem is the line below, which uses torch.nn.functional.interpolate (I have marked it with "----- Problem Line ---------" in the code further down):

bbox_img = FU.interpolate(bbox_img, size=(self.resize_h - pad_top - pad_bottom, self.resize_w - pad_left - pad_right))

From the traceback, the converter lowers this call to upsample_nearest_neighbor and seems to require the resize scale factor to be a constant, which it cannot be here because the target size depends on padding values computed at runtime. Does Core ML support interpolate with a size that is computed dynamically like this?
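For what it's worth, the smallest pattern I can reduce this to is the sketch below: a flexible-size image resized with F.interpolate (nearest, the default, which matches the upsample_nearest2d in the traceback) to a fixed size, so the scale factor is only known at runtime. This is only a sketch of my own, it drops the bounding-box input entirely, and I have not verified that it raises the identical error:

import torch
import torch.nn.functional as FU
import coremltools as ct

class Repro(torch.nn.Module):
    def forward(self, img):
        # The input height/width are flexible (RangeDim below) while the target
        # size is fixed, so the nearest-neighbor scale is not a compile-time constant.
        return FU.interpolate(img, size=(224, 224))

traced = torch.jit.trace(Repro().eval(), torch.rand(1, 3, 256, 256))
image_shape = ct.Shape(shape=(1, 3, ct.RangeDim(64, 1024), ct.RangeDim(64, 1024)))
mlmodel = ct.convert(
    traced,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
    inputs=[ct.TensorType(name="colorImage", shape=image_shape)],
)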
Error Msg:
Traceback (most recent call last):
  File "model_flex.py", line 350, in <module>
    main()
  File "model_flex.py", line 324, in main
    image_encoder_model = ct.convert(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/_converters_entry.py", line 492, in convert
    mlmodel = mil_convert(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 188, in mil_convert
    return _mil_convert(model, convert_from, convert_to, ConverterRegistry, MLModel, compute_units, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 212, in _mil_convert
    proto, mil_program = mil_convert_to_proto(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 285, in mil_convert_to_proto
    prog = frontend_converter(model, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 108, in __call__
    return load(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 63, in load
    return _perform_torch_convert(converter, debug)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 102, in _perform_torch_convert
    prog = converter.convert()
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 284, in convert
    convert_nodes(self.context, self.graph)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 88, in convert_nodes
    add_op(context, node)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 2953, in upsample_nearest2d
    upsample_nearest2d = mb.upsample_nearest_neighbor(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/ops/registry.py", line 182, in add_op
    return cls._add_op(op_cls_to_add, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/builder.py", line 162, in _add_op
    kwargs.update(cls._create_vars(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/builder.py", line 145, in _create_vars
    var = cls._add_const(val, new_var_name, before_op)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/builder.py", line 76, in _add_const
    raise ValueError("Cannot add const {}".format(val))
ValueError: Cannot add const 224.0001/is10
Code to Reproduce:
Model Definition:
class ClipDemographics(torch.nn.Module):
    def __init__(self, checkpoint_path, image_size):
        super(ClipDemographics, self).__init__()
        labels = [
            ['female','gender_other','male'],
            ['age-ok','age-other'],
            ['primary','secondary'],
            [0]*100]
        # create model
        model = clip_utils.DemographicsModel(labels)
        model.visual.ln_pre = convert_ln(model.visual.ln_pre)
        for block in model.visual.transformer.resblocks:
            block.ln_1 = convert_ln(block.ln_1)
            block.ln_2 = convert_ln(block.ln_2)
        model.visual.ln_post = convert_ln(model.visual.ln_post)
        saved_model = torch.load(args.checkpoint_path)
        if type(saved_model) == dict:
            saved_model = saved_model['model']
        model.load_state_dict(saved_model.state_dict())
        self.model = model.cuda().eval()
        self.class_dim = torch.tensor([0]).int() # this is not important...
        self.resize_h = image_size[0]
        self.resize_w = image_size[0]
        if len(image_size) > 1:
            self.resize_w = image_size[1]

    # bbox [0,0,1,1] is entire image
    # bbox [-.1,0,1,1] is entire image with 10% padding on the left
    # bbox [0,0.5,1,1.5] pads the bottom of the image by 50%, and then takes a crop of the bottom half of the image
    def forward(self, img, bboxes):
        print(f"image input start shape: {img.shape}") # (B, C, H, W): (1, 3, 224, 224)
        img = img.squeeze(0).permute(1, 2, 0) # (H, W, C): (224, 224, 3)
        print(f"image model start shape: {img.shape}")
        height, width = img.size(0), img.size(1)
        bbox_list = []
        for bbox in bboxes:
            y0, x0, y1, x1 = bbox[0], bbox[1], bbox[2], bbox[3]
            # compute 'extend_square_80' bounding box adjustment:
            ix0, iy0, ix1, iy1 = (torch.round(x0 * width)).int(), (torch.round(y0 * height)).int(), (torch.round(x1 * width)).int(), (torch.round(y1 * height)).int()
            bbox_w = ix1 - ix0
            bbox_h = iy1 - iy0
            # first, extend the bbox to a square if necessary:
            if bbox_w > bbox_h:
                new_iy1 = iy1 + torch.floor((bbox_w - bbox_h) / 2)
                new_iy0 = iy0 - torch.ceil((bbox_w - bbox_h) / 2)
                iy0, iy1 = new_iy0, new_iy1
            else:
                new_ix1 = ix1 + torch.floor((bbox_h - bbox_w) / 2)
                new_ix0 = ix0 - torch.ceil((bbox_h - bbox_w) / 2)
                ix0, ix1 = new_ix0, new_ix1
            # Then, add 80% to each side
            escrop = 80 / 100 / 2
            new_ix0 = (ix0 - escrop * (ix1 - ix0)).int()
            new_iy0 = (iy0 - escrop * (iy1 - iy0)).int()
            new_ix1 = (ix1 + escrop * (ix1 - ix0)).int()
            new_iy1 = (iy1 + escrop * (iy1 - iy0)).int()
            x0, y0, x1, y1 = new_ix0 / width, new_iy0 / height, new_ix1 / width, new_iy1 / height
            new_bbox = torch.tensor([y0, x0, y1, x1])
            # Current bbox width and height
            bbox_w = ((x1 - x0) * width).int()
            bbox_h = ((y1 - y0) * height).int()
            pad_left, pad_top, pad_right, pad_bottom = 0, 0, 0, 0
            # bbox width and height that we are gonna get after the resize
            resize_w, resize_h = self.resize_w, self.resize_h
            paddings = []
            # check if the bbox we're gonna get is degenerate and if so fix it
            # this matters because of how we calculate scale_w below. if bbox_w
            # is 0, then scale_w will be infinite and the inference will fail
            bbox_w = max(1, bbox_w)
            bbox_h = max(1, bbox_h)
            # calc how much we have to scale the image so that the bbox is scaled to
            # the correct size
            scale_w = self.resize_w / bbox_w
            scale_h = self.resize_h / bbox_h
            # First, calculate how much we have to pad the image after it is resized
            # here, x0 is the coord after extendsquare AND 80 crop,
            # in image space
            # left_crop is just .40
            # width is the width of the image
            # scale_w is the scaling ratio
            if x0 < 0:
                pad_left = int((-x0) * width * scale_w)
            if y0 < 0:
                pad_top = int((-y0) * height * scale_h)
            if x1 > 1:
                pad_right = int((x1 - 1) * width * scale_w)
            if y1 > 1:
                pad_bottom = int((y1 - 1) * height * scale_h)
            # If the bbox is invalid (resulting in incorrect padding) just make
            # sure each dimension >= 1. (since dimensions for resize are calculated
            # below using these padding values)
            if pad_left + pad_right >= self.resize_w:
                pad_left = self.resize_w // 2 - 1
                pad_right = self.resize_w // 2 - 1
            if pad_top + pad_bottom >= self.resize_h:
                pad_top = self.resize_h // 2 - 1
                pad_bottom = self.resize_h // 2 - 1
            # Now that we've calculated padding, clamp bbox so it is a valid crop
            new_bbox = torch.clamp(new_bbox, min=0.0, max=1.0)
            y0, x0, y1, x1 = new_bbox[0], new_bbox[1], new_bbox[2], new_bbox[3] # get new clamped vals
            top = (torch.floor(y0 * height)).int()
            left = (torch.floor(x0 * width)).int()
            right = (torch.ceil(x1 * width)).int()
            bottom = (torch.ceil(y1 * height)).int()
            bbox_w = (torch.round((x1 - x0) * width)).int()
            bbox_h = (torch.round((y1 - y0) * height)).int()
            bbox_img = torch.clone(img[top:bottom, left:right, :])
            bbox_img = bbox_img.permute(2, 0, 1).unsqueeze(0)
            # ------------------------------ Problem Line -----------------------------
            # Resize the image
            bbox_img = FU.interpolate(bbox_img, size=(self.resize_h - pad_top - pad_bottom, self.resize_w - pad_left - pad_right))
            # ------------------------------ Problem Line -----------------------------
            # Convert the image back to (height, width, channels) format
            bbox_img = bbox_img.squeeze(0).permute(1, 2, 0)
            end_img = torch.zeros((self.resize_w, self.resize_h, 3))
            end_img[pad_top:self.resize_h - pad_bottom, pad_left:self.resize_w - pad_right, :] = bbox_img
            bbox_list.append(end_img)
        imgs = torch.stack(bbox_list)
        imgs = imgs.cuda().float()
        imgs /= 255.0
        imgs = imgs.permute(0, 3, 1, 2) # (Batch, channel, height, width): (1, 3, 224, 224)
        clip_mean = (0.48145466, 0.4578275, 0.40821073)
        clip_std = (0.26862954, 0.26130258, 0.27577711)
        imgs[:, 0] = (imgs[:, 0] - clip_mean[0]) / clip_std[0]
        imgs[:, 1] = (imgs[:, 1] - clip_mean[1]) / clip_std[1]
        imgs[:, 2] = (imgs[:, 2] - clip_mean[2]) / clip_std[2]
        outputs = self.model(imgs)
        final_vals = {}
        # gender head is head 0
        gender_probs = torch.nn.functional.softmax(outputs[0], dim=1)
        final_vals['female'] = gender_probs[:, 0:1]
        final_vals['other_gender'] = gender_probs[:, 1:2]
        final_vals['male'] = gender_probs[:, 2:3]
        # age-other head is head 1
        age_ok_probs = torch.nn.functional.softmax(outputs[1], dim=1)
        final_vals['other_age'] = age_ok_probs[:, 1:2]
        # secondary/primary head is head 2
        primary_probs = torch.nn.functional.softmax(outputs[2], dim=1)
        final_vals['primary'] = primary_probs[:, 0:1]
        final_vals['secondary'] = primary_probs[:, 1:2]
        # 0-100 age probs is head 3
        age_probs = torch.nn.functional.softmax(outputs[3], dim=1)
        final_vals['baby'] = torch.sum(age_probs[:, 0:2], dim=1, keepdim=True)
        final_vals['toddler'] = torch.sum(age_probs[:, 2:5], dim=1, keepdim=True)
        final_vals['pre_teen'] = torch.sum(age_probs[:, 5:13], dim=1, keepdim=True)
        final_vals['teenager'] = torch.sum(age_probs[:, 13:18], dim=1, keepdim=True)
        final_vals['adult'] = torch.sum(age_probs[:, 18:45], dim=1, keepdim=True)
        final_vals['middle_aged'] = torch.sum(age_probs[:, 45:65], dim=1, keepdim=True)
        final_vals['senior'] = torch.sum(age_probs[:, 65:], dim=1, keepdim=True)
        age_midpoints = torch.unsqueeze(torch.arange(100, dtype=torch.int32).cuda(), dim=0) + 0.5
        final_vals['age_regression'] = torch.sum(age_probs * age_midpoints, dim=1, keepdim=True)
        classes = list(final_vals.keys())
        scores = list(final_vals.values())
        scores = torch.cat(scores, 1)
        return scores
Model Conversion:
args.image_size = [int(x) for x in args.image_size.split(',')]
model = ClipDemographics(args.checkpoint_path,
                         args.image_size,
                         )
traced_script_module = torch.jit.trace(model, (test_img, test_bboxes))
traced_script_module.save("traced_model.pt")
image_shape = ct.Shape(shape=(1, 3, ct.RangeDim(64, 1024), ct.RangeDim(64, 1024)))
image_input_scale = ct.TensorType(name="colorImage", shape=image_shape)
bbox_shape = ct.Shape(shape=(ct.RangeDim(lower_bound=0, upper_bound=100), 4))
input_bbox = ct.TensorType(name="input_bbox", dtype=np.float32, shape=bbox_shape)
image_encoder_model = ct.convert(
    traced_script_module,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
    inputs=[image_input_scale, input_bbox],
    outputs=[ct.TensorType(name="output", dtype=np.float32)],
)
image_encoder_model.save("demographics.mlpackage")
print("\n Finish Model Conversion \n")
# Load the MLModel
mlmodel = ct.models.MLModel('demographics.mlpackage')
print("\n Finish Model Loading \n")
@Gianluigi121 - This is a lot of code. Please give us a simpler example to reproduce your issue. Also please include all the code we need in order to run it (ex: import statements).
@Gianluigi121 it's a lot of code, but I'm afraid there aren't many lines that can be removed. What is that const error anyway? Is it possible to get into a state where you try to modify a const value in your app? Or is the val parameter somehow damaged? I'm new to Python, so sorry for the noob questions :)