
[FixBug][QNN][ONNX-Frontend] Error reading zero_point parameter in per-channel quantization.

MPolaris opened this issue on Feb 27, 2024 · 0 comments

In the QNN part of the ONNX frontend, the way the zero_point parameter is read appears to be incorrect. In the '_qnn_conv2d_legalize_cuda' function, uint8 data is shifted by the zero_point, but only the case where zero_point is a scalar is handled, i.e. only per-tensor quantization is considered; with per-channel quantization, zero_point is a 1-D array with one entry per channel. I have submitted a PR to fix this issue.
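
To see why the scalar-only handling breaks, here is a minimal NumPy sketch (illustrative only, not TVM's actual code): a scalar zero_point broadcasts over the whole tensor, while a 1-D per-channel zero_point has to be reshaped so it lines up with the channel axis.

import numpy as np

# NCHW activations, matching the repro model below.
data = np.random.randint(0, 255, (1, 3, 4, 4)).astype(np.int32)

# Per-tensor: a scalar zero_point broadcasts over every element.
shifted = data - 2

# Per-channel: subtracting the 1-D array directly makes NumPy align it
# with the last axis (which fails here, since 3 != 4); reshaping to
# (1, C, 1, 1) broadcasts it along the channel axis instead.
zp = np.array([1, 2, 3], dtype=np.int32)
shifted_per_channel = data - zp.reshape(1, -1, 1, 1)

The bug can be reproduced with the following code: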

import onnx
import numpy as np

input_tensor = onnx.helper.make_tensor_value_info('input', onnx.TensorProto.FLOAT, [1,3,224,224])
output_tensor = onnx.helper.make_tensor_value_info('output', onnx.TensorProto.FLOAT, [1,3,112,112])
input_q_info = onnx.helper.make_tensor_value_info('input_q', onnx.TensorProto.UINT8, [1,3,224,224])
conv_q_info = onnx.helper.make_tensor_value_info('conv_q', onnx.TensorProto.UINT8, [1,3,112,112])

q1_scale = onnx.helper.make_tensor('q1_scale', onnx.TensorProto.FLOAT, [], [1])
q1_zero_point = onnx.helper.make_tensor('q1_zero_point', onnx.TensorProto.UINT8, [], [0])
q2_scale = onnx.helper.make_tensor('q2_scale', onnx.TensorProto.FLOAT, [], [1])
q2_zero_point = onnx.helper.make_tensor('q2_zero_point', onnx.TensorProto.UINT8, [], [0])
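
# Per-channel weight quantization: w_scale and w_zero_point are 1-D
# (one entry per output channel); the 1-D w_zero_point is what triggers the bug.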
weight = onnx.helper.make_tensor('weight', onnx.TensorProto.UINT8, [3,3,3,3], np.random.randint(0, 255, (3,3,3,3)).astype(np.uint8))
bias = onnx.helper.make_tensor('bias', onnx.TensorProto.INT32, [3], np.random.randint(-128, 128, 3).astype(np.int32))
w_scale = onnx.helper.make_tensor('w_scale', onnx.TensorProto.FLOAT, [3], [1,2,3])
w_zero_point = onnx.helper.make_tensor('w_zero_point', onnx.TensorProto.UINT8, [3], [1,2,3])

input_q = onnx.helper.make_node('QuantizeLinear', ['input', 'q1_scale', 'q1_zero_point'], ['input_q'], name='input_quantize')
attrs = {
    "dilations":[1, 1],
    "group":1,
    "kernel_shape":[3, 3],
    "pads":[1, 1, 1, 1],
    "strides":[2, 2]
}
conv = onnx.helper.make_node('QLinearConv', ['input_q', 'q1_scale', 'q1_zero_point',
                                             'weight', 'w_scale', 'w_zero_point',
                                             'q2_scale', 'q2_zero_point', 'bias'],
                             ['conv_q'], name='conv', **attrs)
output = onnx.helper.make_node('DequantizeLinear', ['conv_q', 'q2_scale', 'q2_zero_point'], ['output'], name='output_dequantize')

graph = onnx.helper.make_graph(
    [input_q, conv, output],
    'quantized_graph',
    [input_tensor],
    [output_tensor],
    initializer=[q1_scale, q1_zero_point, q2_scale, q2_zero_point, weight, bias, w_scale, w_zero_point],
    value_info=[input_q_info, conv_q_info],
)

model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("com.microsoft", 1), onnx.helper.make_opsetid("", 11)])

model_name = "./quantized.onnx"
onnx.save_model(model, model_name)

import tvm
from tvm import relay
onnx_model = onnx.load("./quantized.onnx")
mod, params = relay.frontend.from_onnx(onnx_model)
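# Compiling for CUDA exercises '_qnn_conv2d_legalize_cuda', where the 1-D zero_point is mishandled.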
target = "cuda"
with tvm.transform.PassContext(opt_level=3):
    executor = relay.build_module.create_executor(
        "graph", mod, tvm.cuda(0), target, params
    ).evaluate()
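
For reference, a minimal sketch of the shape handling a fix needs, assuming the zero_point arrives as a relay.Constant (the helper name is illustrative, not the actual PR):

def read_zero_point(zp_const):
    # Hypothetical helper, not TVM's actual code: accept both per-tensor
    # (0-D scalar) and per-channel (1-D array) zero_points.
    arr = zp_const.data.numpy()
    if arr.ndim == 0:
        return int(arr)  # per-tensor: a single scalar shift
    return arr           # per-channel: one shift per channel

With the model above, w_zero_point takes the 1-D branch, while the current legalization only handles the scalar case.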

cc @ibsidorenko
