Export Quantized Model to ONNX: `NotImplementedError: Could not run 'quantized::conv2d.new' with arguments from the 'CPU' backend.`
Dear all,
To make Intel Neural Compressor easy to use in our team, and because we use PyTorch Lightning, I am building Lightning callbacks that call your hooks at the right points in Lightning's training loop. Here is the current state of the project: https://github.com/clementpoiret/lightning-nc
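Roughly, the callbacks bridge INC's compression hooks to Lightning's fit loop like this (a simplified sketch of the idea, not the actual lightning-nc code, which may differ):

import lightning as L
from neural_compressor.training import prepare_compression


class QATCallback(L.Callback):
    """Simplified sketch: call INC's QAT hooks from Lightning's fit loop.

    Assumes the LightningModule exposes its network as `pl_module.model`
    (as in the Classifier below); the real lightning-nc callbacks may differ.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.compression_manager = None

    def on_fit_start(self, trainer, pl_module):
        # Prepare the wrapped nn.Module for QAT before training starts
        self.compression_manager = prepare_compression(pl_module.model, self.config)
        self.compression_manager.callbacks.on_train_begin()

    def on_fit_end(self, trainer, pl_module):
        # Finalize/convert the fake-quantized model once training is done
        self.compression_manager.callbacks.on_train_end()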
Unfortunately, I run into issues when trying to export the quantized models to ONNX. Exporting an fp32 model works fine.
Here is a toy example you can play with (requires torch 2.1 and lightning 2.1):
import os
import lightning as L
import timm
import torch
import torch.nn.functional as F
from neural_compressor import QuantizationAwareTrainingConfig
from neural_compressor.config import Torch2ONNXConfig
from neural_compressor.training import WeightPruningConfig
from lightning_nc import QATCallback, WeightPruningCallback
from torch import Tensor, nn, optim, utils
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
# Define your main model here
class VeryComplexModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.backbone = timm.create_model("vit_small_patch14_dinov2.lvd142m",
                                          pretrained=True)
        self.mlp = nn.Sequential(nn.Linear(self.backbone.num_features, 128),
                                 nn.ReLU(), nn.Linear(128, 10))

    def forward(self, x):
        return self.mlp(self.backbone(x))


# Then, define your LightningModule as usual
class Classifier(L.LightningModule):

    def __init__(self):
        super().__init__()
        # This is mandatory for the callbacks
        self.model = VeryComplexModel()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch

        # This is just to use MNIST images on a pretrained timm model, you can skip that
        x = x.repeat(1, 3, 1, 1)
        x = F.interpolate(x, size=(518, 518))

        y_hat = self.forward(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return [optimizer]
# setup data
dataset = MNIST(os.getcwd(), download=True, transform=ToTensor())
train_loader = utils.data.DataLoader(dataset)

clf = Classifier()

# Define the configs for Pruning and Quantization
q_config = QuantizationAwareTrainingConfig()
p_config = WeightPruningConfig([{
    "op_names": ["backbone.*"],
    "start_step": 1,
    "end_step": 100,
    "target_sparsity": 0.5,
    "pruning_frequency": 1,
    "pattern": "4x1",
    "min_sparsity_ratio_per_op": 0.,
    "pruning_scope": "global",
}])

callbacks = [
    QATCallback(config=q_config),
    WeightPruningCallback(config=p_config),
]

trainer = L.Trainer(accelerator="gpu",
                    strategy="auto",
                    limit_train_batches=100,
                    max_epochs=1,
                    callbacks=callbacks)
trainer.fit(model=clf, train_dataloaders=train_loader)
# Save the fp32 model if we remove the QATCallback: OK
# clf.model.export(
#     "fp32_model.onnx",
#     Torch2ONNXConfig(
#         dtype="fp32",
#         opset_version=17,
#         example_inputs=torch.randn(1, 3, 518, 518),
#         input_names=["input"],
#         output_names=["output"],
#         dynamic_axes={
#             "input": {0: "batch_size"},
#             "output": {0: "batch_size"},
#         },
#     ))
# Save the quantized model: BROKEN
clf.model.export(
    "int8_model.onnx",
    Torch2ONNXConfig(
        dtype="int8",
        opset_version=17,
        quant_format="QLinear",  # or QDQ
        example_inputs=torch.randn(1, 3, 518, 518),
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch_size"},
            "output": {0: "batch_size"},
        },
    ))
When calling the export(...) function, I end up with the following error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/neural_compressor/model/torch_model.py", line 395, in export
torch_to_int8_onnx(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/neural_compressor/experimental/export/torch2onnx.py", line 410, in torch_to_int8_onnx
static_quant_export(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/neural_compressor/experimental/export/torch2onnx.py", line 283, in static_quant_export
raise e
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/neural_compressor/experimental/export/torch2onnx.py", line 257, in static_quant_export
torch.onnx.export(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/onnx/utils.py", line 516, in export
_export(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/onnx/utils.py", line 1596, in _export
graph, params_dict, torch_out = _model_to_graph(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/onnx/utils.py", line 1134, in _model_to_graph
model = _pre_trace_quant_model(model, args)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/onnx/utils.py", line 1089, in _pre_trace_quant_model
return torch.jit.trace(model, args)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/jit/_trace.py", line 798, in trace
return trace_module(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/jit/_trace.py", line 1065, in trace_module
module._c._create_method_from_trace(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1508, in _slow_forward
result = self.forward(*input, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/neural_compressor/experimental/export/torch2onnx.py", line 245, in wrapper
output = model_fn(*args, **kwargs)
File "<stdin>", line 9, in forward
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1508, in _slow_forward
result = self.forward(*input, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/timm/models/vision_transformer.py", line 685, in forward
x = self.forward_features(x)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/timm/models/vision_transformer.py", line 662, in forward_features
x = self.patch_embed(x)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1508, in _slow_forward
result = self.forward(*input, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/timm/layers/patch_embed.py", line 87, in forward
x = self.proj(x)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1508, in _slow_forward
result = self.forward(*input, **kwargs)
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/ao/nn/quantized/modules/conv.py", line 468, in forward
return ops.quantized.conv2d(
File "/home/clementpoiret/micromamba/envs/torch/lib/python3.10/site-packages/torch/_ops.py", line 692, in __call__
return self._op(*args, **kwargs or {})
NotImplementedError: Could not run 'quantized::conv2d.new' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::conv2d.new' is only available for these backends: [QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMeta, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].
Do you have any clue?
I also tried training entirely on CPU by setting accelerator="cpu" on the Trainer; same issue.
Thanks a lot, Clément.
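For reference, I can reproduce the same error outside Lightning with a plain eager-mode QAT flow when the converted model is fed a float tensor directly, so my guess is that a missing QuantStub is the culprit (a minimal sketch, no INC involved):

import torch
import torch.ao.quantization as tq
import torch.nn as nn

# Minimal sketch (plain PyTorch eager-mode QAT): a converted quantized Conv2d
# expects a quantized input tensor, so feeding it a float tensor raises the
# same "Could not run 'quantized::conv2d.new' with arguments from the 'CPU'
# backend" error. A QuantStub in front of the backbone would quantize the
# input first.
m = nn.Sequential(nn.Conv2d(1, 1, 3))
m.qconfig = tq.get_default_qat_qconfig("fbgemm")
m_prepared = tq.prepare_qat(m.train())
m_int8 = tq.convert(m_prepared.eval())
m_int8(torch.randn(1, 1, 8, 8))  # NotImplementedError: quantized::conv2d.new ...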
Some more details:
If I wrap x in a QuantStub and DeQuantStub as in PyTorch's docs, such as:
from torch.ao.quantization import DeQuantStub, QuantStub


# Define your main model here
class VeryComplexModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.quant = QuantStub()
        self.dequant = DeQuantStub()
        self.backbone = timm.create_model("vit_small_patch14_dinov2.lvd142m",
                                          pretrained=True)
        self.mlp = nn.Sequential(nn.Linear(self.backbone.num_features, 128),
                                 nn.ReLU(), nn.Linear(128, 10))

    def forward(self, x):
        x = self.quant(x)
        x = self.mlp(self.backbone(x))
        x = self.dequant(x)
        return x
I still get an error, no longer about 'quantized::conv2d.new', but:
NotImplementedError: Could not run 'aten::qscheme' with arguments from the 'CPU' backend.
I made some progress: a very simple model that does not trigger this aten::qscheme issue works, such as:
# Define your main model here
class VeryComplexModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()
        self.backbone = nn.Sequential(
            nn.Conv2d(1, 2, 3),
            nn.ReLU(),
        )
        self.mlp = nn.Linear(1352, 10)

    def forward(self, x):
        x = self.quant(x)
        x = self.backbone(x)
        x = x.flatten(1)
        x = self.mlp(x)
        x = self.dequant(x)
        return x


# Then, define your LightningModule as usual
class Classifier(L.LightningModule):

    def __init__(self):
        super().__init__()
        # This is mandatory for the callbacks
        self.model = VeryComplexModel()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return [optimizer]
The issue seems to occur in the forward method of timm's VisionTransformer class. Do you have any clue? Maybe a config to exclude the _pos_embed function from quantization?
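What I have in mind is something along these lines (hypothetical: the op name below is a guess based on timm's module naming and would need to be checked against model.named_modules(), and I'm not sure INC can exclude this part of the graph at all):

from neural_compressor import QuantizationAwareTrainingConfig
from neural_compressor.utils.constant import FP32

# Hypothetical: keep the ViT patch/position-embedding part in fp32 via INC's
# op_name_dict fallback. "backbone.patch_embed.*" is a guessed name pattern.
q_config = QuantizationAwareTrainingConfig(
    op_name_dict={"backbone.patch_embed.*": FP32},
)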
Hi, any news on the issue? The only workaround I have found is to train normally first, export to ONNX, then run PTQ directly on the ONNX model to avoid QAT, but that's not a real fix :/
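For reference, the workaround looks roughly like this (a sketch assuming a calibration dataloader `calib_dataloader` is available; argument names follow INC's post-training quantization API):

from neural_compressor import PostTrainingQuantConfig, quantization

# Workaround sketch: quantize the fp32 ONNX export with post-training
# quantization instead of exporting a QAT model.
# `calib_dataloader` is hypothetical: any dataloader yielding (input, label).
q_model = quantization.fit(
    model="fp32_model.onnx",
    conf=PostTrainingQuantConfig(),
    calib_dataloader=calib_dataloader,
)
q_model.save("int8_model_ptq.onnx")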
Hi @clementpoiret, sorry for the late response. This is not an export issue: the QAT PyTorch model you generated is invalid. Please refer to this document for the usage of QAT.
Dear @yuwenzho, thanks for your answer. You're right, I certainly have a bug in my callbacks. But even following the doc, I can't export dinov2 to ONNX:
torch.onnx.errors.SymbolicValueError: ONNX symbolic expected the output of `%permute : Tensor(*, *, *, *, *) = onnx::Transpose[perm=[2, 0, 3, 1, 4]](%reshape), scope: torch.fx.graph_module.GraphModule:: # <eval_with_key>.31:48:0
` to be a quantized tensor. Is this likely due to missing support for quantized `onnx::Transpose`.
Here is a complete code snippet:
import os
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
from neural_compressor import QuantizationAwareTrainingConfig
from neural_compressor.config import Torch2ONNXConfig
from neural_compressor.training import prepare_compression
from torch import optim, utils
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from tqdm import tqdm
# Define your main model here
class VeryComplexModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.backbone = timm.create_model("vit_small_patch14_dinov2.lvd142m",
                                          pretrained=True)
        self.clf_layers = nn.Sequential(
            nn.Linear(self.backbone.num_features, 128), nn.ReLU(),
            nn.Linear(128, 10))

    def forward(self, x):
        # x = x.repeat(1, 3, 1, 1)
        # x = F.interpolate(x, size=(518, 518))
        x = self.backbone(x)
        x = self.clf_layers(x)
        return x


criterion = nn.CrossEntropyLoss()
model = VeryComplexModel()

dataset = MNIST(os.getcwd(), download=True, transform=ToTensor())
train_loader = utils.data.DataLoader(dataset)


def train(model, steps=10):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(2):
        for i, (data, target) in enumerate(tqdm(train_loader)):
            if i > steps:
                break

            # repeat and interpolate to match the input shape
            data = data.repeat(1, 3, 1, 1)
            data = F.interpolate(data, size=(518, 518))

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()


conf = QuantizationAwareTrainingConfig()
compression_manager = prepare_compression(model, conf)
compression_manager.callbacks.on_train_begin()
model = compression_manager.model
train(model)
compression_manager.callbacks.on_train_end()
compression_manager.save("./output")

# Export as ONNX
model.export(
    "int8_model.onnx",
    Torch2ONNXConfig(
        dtype="int8",
        opset_version=17,
        quant_format="QDQ",
        example_inputs=torch.randn(1, 3, 518, 518),
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch_size"},
            "output": {0: "batch_size"},
        },
    ))
This error means that quantized transpose is not supported.
@PenghuiCheng I tried setting transpose to FP32 but it doesn't work; could you please help check?
from neural_compressor.utils.constant import FP32

conf = QuantizationAwareTrainingConfig(
    op_type_dict={"transpose": FP32},
)
Hi @clementpoiret, as @yuwenzho mentioned above, quantized transpose is not supported by the ONNX exporter. For the full list of supported/unsupported TorchScript operators for ONNX export, please refer to ONNX SUPPORTED TORCHSCRIPT OPERATORS. While INC can fall back PyTorch modules that perform quantized operations to fp32 (typically defined for weighted operations like Linear and Conv), operators like transpose are traced and quantized by PyTorch itself and thus cannot be set to fp32 during quantization in INC.
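To illustrate the distinction, here is a small sketch (plain torch.fx, independent of INC internals): Linear appears as a call_module node that a module-level fp32 fallback can target, while transpose only shows up as a traced call inside the graph:

import torch
import torch.nn as nn
from torch import fx


class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 4)

    def forward(self, x):
        return self.fc(x).transpose(0, 1)


# Print the traced graph nodes: `fc` is a call_module node (a module that a
# module-level fp32 fallback can target), while `transpose` is a call_method
# node that only exists in the traced graph.
for node in fx.symbolic_trace(Tiny()).graph.nodes:
    print(node.op, node.target)
# placeholder x
# call_module fc
# call_method transpose
# output output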
For your case, you can either create a symbolic function to convert the operator and register it as a custom symbolic function, or contribute to PyTorch to add the same symbolic function to torch.onnx. For more details, please refer to adding quantized ops.
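A very rough sketch of the first option (registration mechanism only; the op name and opset below are assumptions, and a complete implementation would still have to handle the packed quantized tensor representation used by the QDQ export, as described in the linked doc):

import torch
from torch.onnx import register_custom_op_symbolic, symbolic_helper


# Sketch only: map aten::permute to an ONNX Transpose node. A complete
# quantized implementation would need to dequantize, transpose, and
# requantize around this node to satisfy the QDQ export path.
@symbolic_helper.parse_args("v", "is")
def permute_symbolic(g, self, dims):
    return g.op("Transpose", self, perm_i=dims)


register_custom_op_symbolic("aten::permute", permute_symbolic, opset_version=17)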