coremltools GroupNormalization torch op definition is incorrect

🐞Describing the bug

I think the definition of Group Normalization as a supported PyTorch operation in coremltools/converters/mil/frontend/torch/ops.py (function: group_norm) is incorrect. The PyTorch docs for torch.nn.GroupNorm state that the input can be of shape (N, C, *), so Group Norm should also work on 3D (or even 2D) tensors, for example of shape (N, C, L), where L is the time dimension for sequences. However, the coremltools function group_norm is implemented to process only 4D inputs of shape (N, C, H, W), e.g. images.

I tried converting a pre-trained Huggingface transformers model for Speech-to-text Wav2Vec2 into a coreml NeuralNetwork Model and got the following error.

Stack Trace

WARNING:root:Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops:   3%|▊                        | 23/735 [00:00<00:00, 14807.21 ops/s]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Input In [42], in <cell line: 1>()
----> 1 coreml_stt_model = ct.convert(scripted_stt_model,
      2                               source="pytorch", 
      3                               inputs=[ct.TensorType(name="context",
      4                                                     shape=(1, 64000),
      5                                                     dtype=np.float32)])

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/_converters_entry.py:451, in convert(model, source, inputs, outputs, classifier_config, minimum_deployment_target, convert_to, compute_precision, skip_model_load, compute_units, package_dir, debug)
    448 if specification_version is None:
    449     specification_version = _set_default_specification_version(exact_target)
--> 451 mlmodel = mil_convert(
    452     model,
    453     convert_from=exact_source,
    454     convert_to=exact_target,
    455     inputs=inputs,
    456     outputs=outputs_as_tensor_or_image_types, # None or list[ct.ImageType/ct.TensorType]
    457     classifier_config=classifier_config,
    458     transforms=tuple(transforms),
    459     skip_model_load=skip_model_load,
    460     compute_units=compute_units,
    461     package_dir=package_dir,
    462     debug=debug,
    463     specification_version=specification_version,
    464 )
    466 if exact_target == 'milinternal':
    467     return mlmodel # Returns the MIL program

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:193, in mil_convert(model, convert_from, convert_to, compute_units, **kwargs)
    154 @_profile
    155 def mil_convert(
    156     model,
   (...)
    160     **kwargs
    161 ):
    162     """
    163     Convert model from a specified frontend `convert_from` to a specified
    164     converter backend `convert_to`.
   (...)
    191         See `coremltools.converters.convert`
    192     """
--> 193     return _mil_convert(model, convert_from, convert_to, ConverterRegistry, MLModel, compute_units, **kwargs)

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:220, in _mil_convert(model, convert_from, convert_to, registry, modelClass, compute_units, **kwargs)
    217     # To make sure everyone can read and write to this directory (on par with os.mkdir())
    218     _os.chmod(weights_dir, _stat.S_IRWXU | _stat.S_IRWXG | _stat.S_IRWXO)
--> 220 proto, mil_program = mil_convert_to_proto(
    221                         model,
    222                         convert_from,
    223                         convert_to,
    224                         registry,
    225                         **kwargs
    226                      )
    228 _reset_conversion_state()
    230 if convert_to == 'milinternal':

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:283, in mil_convert_to_proto(model, convert_from, convert_to, converter_registry, **kwargs)
    280 kwargs.setdefault("convert_to", convert_to)
    281 frontend_converter = frontend_converter_type()
--> 283 prog = frontend_converter(model, **kwargs)
    285 if convert_to.lower() != "neuralnetwork":
    286     passes = kwargs.get("transforms", list())

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:115, in TorchFrontend.__call__(self, *args, **kwargs)
    112 def __call__(self, *args, **kwargs):
    113     from .frontend.torch import load
--> 115     return load(*args, **kwargs)

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/load.py:53, in load(model_spec, debug, **kwargs)
     51 opset_version = kwargs["specification_version"]
     52 converter = TorchConverter(torchscript, inputs, outputs, cut_at_symbols, opset_version)
---> 53 return _perform_torch_convert(converter, debug)

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/load.py:92, in _perform_torch_convert(converter, debug)
     90 def _perform_torch_convert(converter, debug):
     91     try:
---> 92         prog = converter.convert()
     93     except RuntimeError as e:
     94         if debug and "convert function" in str(e):

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/converter.py:269, in TorchConverter.convert(self)
    266 self.convert_const()
    268 # Add the rest of the operations
--> 269 convert_nodes(self.context, self.graph)
    271 graph_outputs = [self.context[name] for name in self.graph.outputs]
    273 # An output can be None when it's a None constant, which happens
    274 # in Fairseq MT.

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/ops.py:92, in convert_nodes(context, graph)
     88 if add_op is None:
     89     raise RuntimeError(
     90         "PyTorch convert function for op '{}' not implemented.".format(node.kind)
     91     )
---> 92 add_op(context, node)
     94 # We've generated all the outputs the graph needs, terminate conversion.
     95 if _all_outputs_present(context, graph):

File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/ops.py:1603, in group_norm(context, node)
   1601 bias = inputs[3]
   1602 eps = inputs[4] 
-> 1603 n,c,h,w = x.shape[0],x.shape[1],x.shape[2],x.shape[3]
   1604 num_groups = builtins.min(num_groups,c)
   1605 x = mb.reshape(x=x, shape=[n,num_groups,c//num_groups,h,w])

IndexError: tuple index out of range

To Reproduce

# Minimal reproduction: converting a TorchScript Wav2Vec2 model whose
# group_norm op receives a 3D input.
import numpy as np  # needed for the dtype passed to ct.TensorType below
import torch
import coremltools as ct
from transformers import AutoModelForCTC, AutoTokenizer

model_name = "facebook/wav2vec2-base-960h"
stt_tokenizer = AutoTokenizer.from_pretrained(model_name)
stt_model = AutoModelForCTC.from_pretrained(model_name, torchscript=True, return_dict=False)
stt_model = stt_model.eval()

# trace and script model
x = torch.randn(1, 64000)
traced_stt_model = torch.jit.trace(stt_model, x)
scripted_stt_model = torch.jit.script(traced_stt_model)

# convert (the input shape matches the example tensor used for tracing)
coreml_stt_model = ct.convert(
    scripted_stt_model,
    source="pytorch",
    inputs=[ct.TensorType(name="context", shape=(1, 64000), dtype=np.float32)],
)

System environment (please complete the following information):

coremltools version: 6.0b2
OS: macOS Monterey (M1 architecture)
torch version 1.11.0
python 3.9
numpy 1.22.4
transformers 4.22.0

Additional context

Fix applied to group_norm to handle input shapes like (N, C, *) as in pytorch documentation.

@register_torch_op
def group_norm(context, node):
    """Convert a torch ``group_norm`` node for inputs shaped ``(N, C, *)``.

    The input is reshaped to ``(N, G, C // G, *)`` with
    ``G = min(num_groups, C)``, normalized over every axis from index 2
    onward, reshaped back to its original shape, and finally scaled and
    shifted by the optional ``weight`` and ``bias`` (each reshaped to
    ``(1, C, 1, ...)`` so it broadcasts over the channel axis).

    The result is registered in ``context`` under ``node.name``.
    """
    args = _get_inputs(context, node, expected=6)
    data, weight, bias, eps = args[0], args[2], args[3], args[4]
    group_count = args[1].val

    # Index the first two dims explicitly: (N, C) is the minimum rank needed.
    batch = data.shape[0]
    channels = data.shape[1]
    original_shape = list(data.shape)
    trailing_dims = list(data.shape[2:])
    n_trailing = len(trailing_dims)

    group_count = builtins.min(group_count, channels)
    grouped_shape = [batch, group_count, channels // group_count] + trailing_dims
    # After grouping, the per-group channel axis sits at index 2 and is
    # followed by the trailing dims; statistics are reduced over all of them.
    reduce_axes = list(range(2, n_trailing + 3))

    data = mb.reshape(x=data, shape=grouped_shape)
    group_mean = mb.reduce_mean(x=data, axes=reduce_axes, keep_dims=True)
    # NOTE(review): `_std` presumably returns the eps-stabilized standard
    # deviation (keep_dims=True, biased) -- confirm against its definition.
    group_std = _std(data, reduce_axes, True, False, eps.val)
    centered = mb.sub(x=data, y=group_mean)
    normalized = mb.real_div(x=centered, y=group_std)
    result = mb.reshape(x=normalized, shape=original_shape)

    if weight is not None:
        scale = mb.reshape(x=weight, shape=[1, channels] + [1] * n_trailing)
        result = mb.mul(x=result, y=scale)
    if bias is not None:
        shift = mb.reshape(x=bias, shape=[1, channels] + [1] * n_trailing)
        result = mb.add(x=result, y=shift)
    context.add(result, node.name)

With this fix, and after additionally removing any weight_normalization layers in the model, I was able to successfully convert this Huggingface Wav2Vec2 model to the Core ML mlmodel format. The text transcription from the converted model, after passing through the tokenizer, looks fine, though the floating-point tensor output of the model does deviate significantly from the torchscript version. Of course, this could happen even if the logic of my fix to group_norm is correct, so I am not 100% sure whether what I am doing is correct. Please let me know.

Sep 21 '22 14:09 GaganNarula

@GaganNarula - please open a pull request with this fix.

Sep 21 '22 19:09 TobyRoseman

Fixed via #1609

Sep 27 '22 16:09 TobyRoseman