Failure converting `torch.nn.functional.interpolate` from PyTorch to Core ML
I'm trying to convert my PyTorch model, which takes a flexible-size image and bounding-box coordinates as input, to a Core ML model, but the conversion fails. I believe the problem is the line below, which uses torch.nn.functional.interpolate (I have marked it with "----- Problem Line ---------" in the code further down):

bbox_img = FU.interpolate(bbox_img, size=(self.resize_h - pad_top - pad_bottom, self.resize_w - pad_left - pad_right))

From the traceback, the converter lowers this call to upsample_nearest_neighbor and seems to require the resize scale factor to be a constant, which it cannot be here because the target size depends on padding values computed at runtime. Does Core ML support interpolate with a size that is computed dynamically like this?
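For what it's worth, the smallest pattern I can reduce this to is the sketch below: a flexible-size image resized with F.interpolate (nearest, the default, which matches the upsample_nearest2d in the traceback) to a fixed size, so the scale factor is only known at runtime. This is only a sketch of my own, it drops the bounding-box input entirely, and I have not verified that it raises the identical error:

import torch
import torch.nn.functional as FU
import coremltools as ct

class Repro(torch.nn.Module):
    def forward(self, img):
        # The input height/width are flexible (RangeDim below) while the target
        # size is fixed, so the nearest-neighbor scale is not a compile-time constant.
        return FU.interpolate(img, size=(224, 224))

traced = torch.jit.trace(Repro().eval(), torch.rand(1, 3, 256, 256))
image_shape = ct.Shape(shape=(1, 3, ct.RangeDim(64, 1024), ct.RangeDim(64, 1024)))
mlmodel = ct.convert(
    traced,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
    inputs=[ct.TensorType(name="colorImage", shape=image_shape)],
)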
Error Msg:
Traceback (most recent call last):
  File "model_flex.py", line 350, in <module>
    main()
  File "model_flex.py", line 324, in main
    image_encoder_model = ct.convert(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/_converters_entry.py", line 492, in convert
    mlmodel = mil_convert(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 188, in mil_convert
    return _mil_convert(model, convert_from, convert_to, ConverterRegistry, MLModel, compute_units, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 212, in _mil_convert
    proto, mil_program = mil_convert_to_proto(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 285, in mil_convert_to_proto
    prog = frontend_converter(model, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/converter.py", line 108, in __call__
    return load(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 63, in load
    return _perform_torch_convert(converter, debug)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 102, in _perform_torch_convert
    prog = converter.convert()
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 284, in convert
    convert_nodes(self.context, self.graph)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 88, in convert_nodes
    add_op(context, node)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 2953, in upsample_nearest2d
    upsample_nearest2d = mb.upsample_nearest_neighbor(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/ops/registry.py", line 182, in add_op
    return cls._add_op(op_cls_to_add, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/builder.py", line 162, in _add_op
    kwargs.update(cls._create_vars(
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/builder.py", line 145, in _create_vars
    var = cls._add_const(val, new_var_name, before_op)
  File "/opt/conda/lib/python3.8/site-packages/coremltools/converters/mil/mil/builder.py", line 76, in _add_const
    raise ValueError("Cannot add const {}".format(val))
ValueError: Cannot add const 224.0001/is10
Code to Reproduce:
Model Definition:
class ClipDemographics(torch.nn.Module):
    def __init__(self, checkpoint_path, image_size):
        super(ClipDemographics, self).__init__()
        labels = [
            ['female','gender_other','male'],
            ['age-ok','age-other'],
            ['primary','secondary'],
            [0]*100]
        # create model
        model = clip_utils.DemographicsModel(labels)
        model.visual.ln_pre = convert_ln(model.visual.ln_pre)
        for block in model.visual.transformer.resblocks:
            block.ln_1 = convert_ln(block.ln_1)
            block.ln_2 = convert_ln(block.ln_2)
        model.visual.ln_post = convert_ln(model.visual.ln_post)
        saved_model = torch.load(args.checkpoint_path)
        if type(saved_model) == dict:
            saved_model = saved_model['model']
        model.load_state_dict(saved_model.state_dict())
        self.model = model.cuda().eval()
        self.class_dim = torch.tensor([0]).int() # this is not important...
        self.resize_h = image_size[0]
        self.resize_w = image_size[0]
        if len(image_size) > 1:
            self.resize_w = image_size[1]

    # bbox [0,0,1,1] is entire image
    # bbox [-.1,0,1,1] is entire image with 10% padding on the left
    # bbox [0,0.5,1,1.5] pads the bottom of the image by 50%, and then takes a crop of the bottom half of the image
    def forward(self, img, bboxes):
        print(f"image input start shape: {img.shape}") # (B, C, H, W): (1, 3, 224, 224)
        img = img.squeeze(0).permute(1, 2, 0) # (H, W, C): (224, 224, 3)
        print(f"image model start shape: {img.shape}")
        height, width = img.size(0), img.size(1)
        bbox_list = []
        for bbox in bboxes:
            y0, x0, y1, x1 = bbox[0], bbox[1], bbox[2], bbox[3]
            # compute 'extend_square_80' bounding box adjustment:
            ix0, iy0, ix1, iy1 = (torch.round(x0 * width)).int(), (torch.round(y0 * height)).int(), (torch.round(x1 * width)).int(), (torch.round(y1 * height)).int()
            bbox_w = ix1 - ix0
            bbox_h = iy1 - iy0
            # first, extend the bbox to a square if necessary:
            if bbox_w > bbox_h:
                new_iy1 = iy1 + torch.floor((bbox_w - bbox_h) / 2)
                new_iy0 = iy0 - torch.ceil((bbox_w - bbox_h) / 2)
                iy0, iy1 = new_iy0, new_iy1
            else:
                new_ix1 = ix1 + torch.floor((bbox_h - bbox_w) / 2)
                new_ix0 = ix0 - torch.ceil((bbox_h - bbox_w) / 2)
                ix0, ix1 = new_ix0, new_ix1
            # Then, add 80% to each side
            escrop = 80 / 100 / 2
            new_ix0 = (ix0 - escrop * (ix1 - ix0)).int()
            new_iy0 = (iy0 - escrop * (iy1 - iy0)).int()
            new_ix1 = (ix1 + escrop * (ix1 - ix0)).int()
            new_iy1 = (iy1 + escrop * (iy1 - iy0)).int()
            x0, y0, x1, y1 = new_ix0 / width, new_iy0 / height, new_ix1 / width, new_iy1 / height
            new_bbox = torch.tensor([y0, x0, y1, x1])
            # Current bbox width and height
            bbox_w = ((x1 - x0) * width).int()
            bbox_h = ((y1 - y0) * height).int()
            pad_left, pad_top, pad_right, pad_bottom = 0, 0, 0, 0
            # bbox width and height that we are gonna get after the resize
            resize_w, resize_h = self.resize_w, self.resize_h
            paddings = []
            # check if the bbox we're gonna get is degenerate and if so fix it
            # this matters because of how we calculate scale_w below. if bbox_w
            # is 0, then scale_w will be infinite and the inference will fail
            bbox_w = max(1, bbox_w)
            bbox_h = max(1, bbox_h)
            # calc how much we have to scale the image so that the bbox is scaled to
            # the correct size
            scale_w = self.resize_w / bbox_w
            scale_h = self.resize_h / bbox_h
            # First, calculate how much we have to pad the image after it is resized
            # here, x0 is the coord after extendsquare AND 80 crop,
            # in image space
            # left_crop is just .40
            # width is the width of the image
            # scale_w is the scaling ratio
            if x0 < 0:
                pad_left = int((-x0) * width * scale_w)
            if y0 < 0:
                pad_top = int((-y0) * height * scale_h)
            if x1 > 1:
                pad_right = int((x1 - 1) * width * scale_w)
            if y1 > 1:
                pad_bottom = int((y1 - 1) * height * scale_h)
            # If the bbox is invalid (resulting in incorrect padding) just make
            # sure each dimension >= 1. (since dimensions for resize are calculated
            # below using these padding values)
            if pad_left + pad_right >= self.resize_w:
                pad_left = self.resize_w // 2 - 1
                pad_right = self.resize_w // 2 - 1
            if pad_top + pad_bottom >= self.resize_h:
                pad_top = self.resize_h // 2 - 1
                pad_bottom = self.resize_h // 2 - 1
            # Now that we've calculated padding, clamp bbox so it is a valid crop
            new_bbox = torch.clamp(new_bbox, min=0.0, max=1.0)
            y0, x0, y1, x1 = new_bbox[0], new_bbox[1], new_bbox[2], new_bbox[3] # get new clamped vals
            top = (torch.floor(y0 * height)).int()
            left = (torch.floor(x0 * width)).int()
            right = (torch.ceil(x1 * width)).int()
            bottom = (torch.ceil(y1 * height)).int()
            bbox_w = (torch.round((x1 - x0) * width)).int()
            bbox_h = (torch.round((y1 - y0) * height)).int()
            bbox_img = torch.clone(img[top:bottom, left:right, :])
            bbox_img = bbox_img.permute(2, 0, 1).unsqueeze(0)
            # ------------------------------ Problem Line -----------------------------
            # Resize the image
            bbox_img = FU.interpolate(bbox_img, size=(self.resize_h - pad_top - pad_bottom, self.resize_w - pad_left - pad_right))
            # ------------------------------ Problem Line -----------------------------
            # Convert the image back to (height, width, channels) format
            bbox_img = bbox_img.squeeze(0).permute(1, 2, 0)
            end_img = torch.zeros((self.resize_w, self.resize_h, 3))
            end_img[pad_top:self.resize_h - pad_bottom, pad_left:self.resize_w - pad_right, :] = bbox_img
            bbox_list.append(end_img)
        imgs = torch.stack(bbox_list)
        imgs = imgs.cuda().float()
        imgs /= 255.0
        imgs = imgs.permute(0, 3, 1, 2) # (Batch, channel, height, width): (1, 3, 224, 224)
        clip_mean = (0.48145466, 0.4578275, 0.40821073)
        clip_std = (0.26862954, 0.26130258, 0.27577711)
        imgs[:, 0] = (imgs[:, 0] - clip_mean[0]) / clip_std[0]
        imgs[:, 1] = (imgs[:, 1] - clip_mean[1]) / clip_std[1]
        imgs[:, 2] = (imgs[:, 2] - clip_mean[2]) / clip_std[2]
        outputs = self.model(imgs)
        final_vals = {}
        # gender head is head 0
        gender_probs = torch.nn.functional.softmax(outputs[0], dim=1)
        final_vals['female'] = gender_probs[:, 0:1]
        final_vals['other_gender'] = gender_probs[:, 1:2]
        final_vals['male'] = gender_probs[:, 2:3]
        # age-other head is head 1
        age_ok_probs = torch.nn.functional.softmax(outputs[1], dim=1)
        final_vals['other_age'] = age_ok_probs[:, 1:2]
        # secondary/primary head is head 2
        primary_probs = torch.nn.functional.softmax(outputs[2], dim=1)
        final_vals['primary'] = primary_probs[:, 0:1]
        final_vals['secondary'] = primary_probs[:, 1:2]
        # 0-100 age probs is head 3
        age_probs = torch.nn.functional.softmax(outputs[3], dim=1)
        final_vals['baby'] = torch.sum(age_probs[:, 0:2], dim=1, keepdim=True)
        final_vals['toddler'] = torch.sum(age_probs[:, 2:5], dim=1, keepdim=True)
        final_vals['pre_teen'] = torch.sum(age_probs[:, 5:13], dim=1, keepdim=True)
        final_vals['teenager'] = torch.sum(age_probs[:, 13:18], dim=1, keepdim=True)
        final_vals['adult'] = torch.sum(age_probs[:, 18:45], dim=1, keepdim=True)
        final_vals['middle_aged'] = torch.sum(age_probs[:, 45:65], dim=1, keepdim=True)
        final_vals['senior'] = torch.sum(age_probs[:, 65:], dim=1, keepdim=True)
        age_midpoints = torch.unsqueeze(torch.arange(100, dtype=torch.int32).cuda(), dim=0) + 0.5
        final_vals['age_regression'] = torch.sum(age_probs * age_midpoints, dim=1, keepdim=True)
        classes = list(final_vals.keys())
        scores = list(final_vals.values())
        scores = torch.cat(scores, 1)
        return scores
Model Conversion:
args.image_size = [int(x) for x in args.image_size.split(',')]
model = ClipDemographics(args.checkpoint_path,
                         args.image_size,
                         )
traced_script_module = torch.jit.trace(model, (test_img, test_bboxes))
traced_script_module.save("traced_model.pt")
image_shape = ct.Shape(shape=(1, 3, ct.RangeDim(64, 1024), ct.RangeDim(64, 1024)))
image_input_scale = ct.TensorType(name="colorImage", shape=image_shape)
bbox_shape = ct.Shape(shape=(ct.RangeDim(lower_bound=0, upper_bound=100), 4))
input_bbox = ct.TensorType(name="input_bbox", dtype=np.float32, shape=bbox_shape)
image_encoder_model = ct.convert(
    traced_script_module,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
    inputs=[image_input_scale, input_bbox],
    outputs=[ct.TensorType(name="output", dtype=np.float32)],
)
image_encoder_model.save("demographics.mlpackage")
print("\n Finish Model Conversion \n")
# Load the MLModel
mlmodel = ct.models.MLModel('demographics.mlpackage')
print("\n Finish Model Loading \n")
@Gianluigi121 - This is a lot of code. Please give us a simpler example to reproduce your issue. Also please include all the code we need in order to run it (ex: import statements).
@Gianluigi121 it's a lot of code, but I'm afraid there aren't many lines that can be removed. What is that const error anyway? Is it possible to get into a state where you try to modify a const value in your app? Or is the val parameter somehow damaged? I'm new to Python, so sorry for the noob questions :)