GroupNormalization torch op definition is incorrect
🐞Describing the bug
I think the definition of Group Normalization as a supported PyTorch operation in coremltools/converters/mil/frontend/torch/ops.py (function: group_norm) is incorrect. The PyTorch docs for torch.nn.GroupNorm state that the input can be of shape (N, C, *), so Group Norm should also work on 3D (or even 2D) tensors, for example of shape (N, C, L), where L is the time dimension for sequences. However, the coremltools function group_norm is implemented to process only 4D inputs of shape (N, C, H, W), e.g. images.
I tried converting a pre-trained Huggingface transformers model for Speech-to-text Wav2Vec2 into a coreml NeuralNetwork Model and got the following error.
Stack Trace
WARNING:root:Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops: 3%|▊ | 23/735 [00:00<00:00, 14807.21 ops/s]
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Input In [42], in <cell line: 1>()
----> 1 coreml_stt_model = ct.convert(scripted_stt_model,
2 source="pytorch",
3 inputs=[ct.TensorType(name="context",
4 shape=(1, 64000),
5 dtype=np.float32)])
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/_converters_entry.py:451, in convert(model, source, inputs, outputs, classifier_config, minimum_deployment_target, convert_to, compute_precision, skip_model_load, compute_units, package_dir, debug)
448 if specification_version is None:
449 specification_version = _set_default_specification_version(exact_target)
--> 451 mlmodel = mil_convert(
452 model,
453 convert_from=exact_source,
454 convert_to=exact_target,
455 inputs=inputs,
456 outputs=outputs_as_tensor_or_image_types, # None or list[ct.ImageType/ct.TensorType]
457 classifier_config=classifier_config,
458 transforms=tuple(transforms),
459 skip_model_load=skip_model_load,
460 compute_units=compute_units,
461 package_dir=package_dir,
462 debug=debug,
463 specification_version=specification_version,
464 )
466 if exact_target == 'milinternal':
467 return mlmodel # Returns the MIL program
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:193, in mil_convert(model, convert_from, convert_to, compute_units, **kwargs)
154 @_profile
155 def mil_convert(
156 model,
(...)
160 **kwargs
161 ):
162 """
163 Convert model from a specified frontend `convert_from` to a specified
164 converter backend `convert_to`.
(...)
191 See `coremltools.converters.convert`
192 """
--> 193 return _mil_convert(model, convert_from, convert_to, ConverterRegistry, MLModel, compute_units, **kwargs)
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:220, in _mil_convert(model, convert_from, convert_to, registry, modelClass, compute_units, **kwargs)
217 # To make sure everyone can read and write to this directory (on par with os.mkdir())
218 _os.chmod(weights_dir, _stat.S_IRWXU | _stat.S_IRWXG | _stat.S_IRWXO)
--> 220 proto, mil_program = mil_convert_to_proto(
221 model,
222 convert_from,
223 convert_to,
224 registry,
225 **kwargs
226 )
228 _reset_conversion_state()
230 if convert_to == 'milinternal':
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:283, in mil_convert_to_proto(model, convert_from, convert_to, converter_registry, **kwargs)
280 kwargs.setdefault("convert_to", convert_to)
281 frontend_converter = frontend_converter_type()
--> 283 prog = frontend_converter(model, **kwargs)
285 if convert_to.lower() != "neuralnetwork":
286 passes = kwargs.get("transforms", list())
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/converter.py:115, in TorchFrontend.__call__(self, *args, **kwargs)
112 def __call__(self, *args, **kwargs):
113 from .frontend.torch import load
--> 115 return load(*args, **kwargs)
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/load.py:53, in load(model_spec, debug, **kwargs)
51 opset_version = kwargs["specification_version"]
52 converter = TorchConverter(torchscript, inputs, outputs, cut_at_symbols, opset_version)
---> 53 return _perform_torch_convert(converter, debug)
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/load.py:92, in _perform_torch_convert(converter, debug)
90 def _perform_torch_convert(converter, debug):
91 try:
---> 92 prog = converter.convert()
93 except RuntimeError as e:
94 if debug and "convert function" in str(e):
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/converter.py:269, in TorchConverter.convert(self)
266 self.convert_const()
268 # Add the rest of the operations
--> 269 convert_nodes(self.context, self.graph)
271 graph_outputs = [self.context[name] for name in self.graph.outputs]
273 # An output can be None when it's a None constant, which happens
274 # in Fairseq MT.
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/ops.py:92, in convert_nodes(context, graph)
88 if add_op is None:
89 raise RuntimeError(
90 "PyTorch convert function for op '{}' not implemented.".format(node.kind)
91 )
---> 92 add_op(context, node)
94 # We've generated all the outputs the graph needs, terminate conversion.
95 if _all_outputs_present(context, graph):
File ~/miniconda3/envs/coreml/lib/python3.9/site-packages/coremltools/converters/mil/frontend/torch/ops.py:1603, in group_norm(context, node)
1601 bias = inputs[3]
1602 eps = inputs[4]
-> 1603 n,c,h,w = x.shape[0],x.shape[1],x.shape[2],x.shape[3]
1604 num_groups = builtins.min(num_groups,c)
1605 x = mb.reshape(x=x, shape=[n,num_groups,c//num_groups,h,w])
IndexError: tuple index out of range
To Reproduce
import numpy as np  # required for dtype=np.float32 in the ct.convert call below
import torch
import coremltools as ct
from transformers import AutoModelForCTC, AutoTokenizer

# Load the pre-trained Wav2Vec2 checkpoint in TorchScript-compatible mode.
model_name = "facebook/wav2vec2-base-960h"
stt_tokenizer = AutoTokenizer.from_pretrained(model_name)
stt_model = AutoModelForCTC.from_pretrained(model_name, torchscript=True, return_dict=False)
stt_model = stt_model.eval()

# trace and script model
# The example input has the same shape as the TensorType given to ct.convert.
x = torch.randn(1, 64000)
traced_stt_model = torch.jit.trace(stt_model, x)
scripted_stt_model = torch.jit.script(traced_stt_model)

# convert
coreml_stt_model = ct.convert(
    scripted_stt_model,
    source="pytorch",
    inputs=[
        ct.TensorType(
            name="context",
            shape=(1, 64000),
            dtype=np.float32,
        )
    ],
)
System environment (please complete the following information):
- coremltools version: 6.0b2
- OS: macOS Monterey (M1 architecture)
- torch version 1.11.0
- python 3.9
- numpy 1.22.4
- transformers 4.22.0
Additional context
Fix applied to group_norm to handle input shapes like (N, C, *) as in pytorch documentation.
@register_torch_op
def group_norm(context, node):
    """Lower a torch group_norm node to MIL ops for inputs shaped (N, C, *).

    The input is reshaped to (N, G, C // G, *), normalized over every axis
    from index 2 onward, reshaped back to its original shape, and then
    scaled/shifted by ``weight``/``bias`` when they are not None. The result
    is registered in ``context`` under ``node.name``.

    Indexing ``x.shape[0]`` and ``x.shape[1]`` means an input with fewer than
    two dimensions raises ``IndexError``.
    """
    inputs = _get_inputs(context, node, expected=6)
    x, weight, bias, eps = inputs[0], inputs[2], inputs[3], inputs[4]
    num_groups = inputs[1].val

    original_shape = list(x.shape)  # n, c, *
    batch, channels = original_shape[0], original_shape[1]
    trailing_dims = original_shape[2:]
    trailing_rank = len(trailing_dims)

    # Never use more groups than there are channels.
    num_groups = builtins.min(num_groups, channels)

    # (N, C, *) -> (N, G, C // G, *)
    grouped_shape = [batch, num_groups, channels // num_groups] + trailing_dims
    # Reduce over the per-group channel axis plus all trailing axes.
    reduce_axes = list(range(2, trailing_rank + 3))
    # (1, C, 1, ..., 1): lets the per-channel affine terms broadcast over x.
    affine_shape = [1, channels] + [1] * trailing_rank

    x = mb.reshape(x=x, shape=grouped_shape)
    mean = mb.reduce_mean(x=x, axes=reduce_axes, keep_dims=True)
    # NOTE(review): _std is defined elsewhere; presumably it returns the
    # eps-stabilized standard deviation over reduce_axes -- confirm.
    denom = _std(x, reduce_axes, True, False, eps.val)
    x = mb.sub(x=x, y=mean)
    x = mb.real_div(x=x, y=denom)
    x = mb.reshape(x=x, shape=original_shape)

    if weight is not None:
        weight = mb.reshape(x=weight, shape=affine_shape)
        x = mb.mul(x=x, y=weight)
    if bias is not None:
        bias = mb.reshape(x=bias, shape=affine_shape)
        x = mb.add(x=x, y=bias)

    context.add(x, node.name)
This fix, followed by removing all weight_normalization layers from the model, allowed me to successfully convert this Hugging Face Wav2Vec2 model to the Core ML mlmodel format. The text transcription produced by the converted model, after passing its output through the tokenizer, looks fine, although the floating-point tensor output of the model deviates significantly from that of the TorchScript version. Of course, this could happen even if the logic of my fix to group_norm is correct, so I am not 100% sure that what I am doing is correct. Please let me know.
@GaganNarula - please open a pull request with this fix.
Fixed via #1609