
Issues with fine-tuning a quantsim model in Keras

Open · btuan opened this issue on Jul 22, 2021 · 2 comments

Using AIMET version 1.16.2 with tf-cpu.

Issue: when following the AIMET tutorial for using Keras, I'm running into errors re-loading the graph to fine-tune a quantsim model. Minimal repro below:

import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import preprocess_input

from aimet_common.defs import QuantScheme
from aimet_tensorflow import quantsim
from aimet_tensorflow.common import graph_eval
from aimet_tensorflow.utils import graph_saver
from aimet_tensorflow.utils.convert_tf_sess_to_keras import (
    save_tf_session_single_gpu,
    load_tf_sess_variables_to_keras_single_gpu
)

# instantiating a resnet50 model, wrapping it in a quantsim model, and saving the session
model = tf.keras.applications.resnet50.ResNet50()
sess = tf.keras.backend.get_session()
input_op, output_op = model.inputs[0].name, model.outputs[0].name
sim = quantsim.QuantizationSimModel(
    sess,
    starting_op_names=[input_op[:input_op.index(":")]],
    output_op_names=[output_op[:output_op.index(":")]],
    quant_scheme=QuantScheme.post_training_tf_enhanced,
    config_file='/usr/local/lib/python3.6/dist-packages/aimet_common/quantsim_config/default_config.json'
)
save_tf_session_single_gpu(sim.session, "/tmp/aimet", input_op, output_op)

# loading the resnet50 + quantsim model into a subclassed keras model and compiling it
tf.keras.backend.clear_session()
tf.keras.backend.set_learning_phase(1)
model = load_tf_sess_variables_to_keras_single_gpu("/tmp/aimet", [])
model.compile("adam", "categorical_crossentropy")

# generating random training data and attempting to train
x_train = np.random.rand(32, 224, 224, 3)
y_train = np.random.randint(0, 1000, size=(32,))  # integer class labels for to_categorical
x_train = preprocess_input(x_train)
y_train = tf.keras.utils.to_categorical(y_train, 1000)
history = model.fit(x_train, y_train, epochs=1, batch_size=1, shuffle=False)
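
For completeness, the tutorial's flow also calls sim.compute_encodings with a calibration callback before saving and fine-tuning. A minimal sketch of that step, with eval_fn as a stand-in calibration callback and assuming aimet_tensorflow's compute_encodings(forward_pass_callback, forward_pass_callback_args) signature:

def eval_fn(session, num_batches):
    # stand-in calibration: run a few batches through the graph so the
    # quantizers can observe activation ranges (use real data in practice)
    in_t = session.graph.get_tensor_by_name(input_op)
    out_t = session.graph.get_tensor_by_name(output_op)
    for _ in range(num_batches):
        session.run(out_t, feed_dict={in_t: np.random.rand(1, 224, 224, 3)})

sim.compute_encodings(eval_fn, forward_pass_callback_args=4)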
Full exception stack trace:
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
   2379       with c_api_util.tf_buffer() as buf:
-> 2380         c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
   2381         data = c_api.TF_GetBuffer(buf)

InvalidArgumentError: Operation 'model/StatefulPartitionedCall' has no attr named '_XlaCompile'.

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
    344     try:
--> 345       xla_compile = op.get_attr("_XlaCompile")
    346       xla_separate_compiled_gradients = op.get_attr(

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
   2383       # Convert to ValueError for backwards compatibility.
-> 2384       raise ValueError(str(e))
   2385     x = attr_value_pb2.AttrValue()

ValueError: Operation 'model/StatefulPartitionedCall' has no attr named '_XlaCompile'.

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
   2379       with c_api_util.tf_buffer() as buf:
-> 2380         c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
   2381         data = c_api.TF_GetBuffer(buf)

InvalidArgumentError: Operation 'conv5_block3_3_bn_1/cond/ReadVariableOp_2/Switch' has no attr named '_XlaCompile'.

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
    344     try:
--> 345       xla_compile = op.get_attr("_XlaCompile")
    346       xla_separate_compiled_gradients = op.get_attr(

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py in get_attr(self, name)
   2383       # Convert to ValueError for backwards compatibility.
-> 2384       raise ValueError(str(e))
   2385     x = attr_value_pb2.AttrValue()

ValueError: Operation 'conv5_block3_3_bn_1/cond/ReadVariableOp_2/Switch' has no attr named '_XlaCompile'.

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py in _apply_op_helper(self, op_type_name, name, **keywords)
    471                 preferred_dtype=default_dtype,
--> 472                 as_ref=input_arg.is_ref)
    473             if input_arg.number_attr and len(

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py in internal_convert_n_to_tensor(values, dtype, name, as_ref, preferred_dtype, ctx)
   1361             preferred_dtype=preferred_dtype,
-> 1362             ctx=ctx))
   1363   return ret

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py in internal_convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, ctx, accepted_result_types)
   1272           "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" %
-> 1273           (dtype.name, value.dtype.name, value))
   1274     return value

ValueError: Tensor conversion requested dtype float32 for Tensor with dtype resource: <tf.Tensor 'gradients/conv5_block3_3_bn_1/cond/ReadVariableOp_2/Switch_grad/Switch_1:1' shape=() dtype=resource>

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-15-7cb75defa0d7> in <module>
      8 y_train = tf.keras.utils.to_categorical(y_train, 1000)
      9 
---> 10 history = model.fit(x_train, y_train, epochs=1, batch_size=1, shuffle=False)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    725         max_queue_size=max_queue_size,
    726         workers=workers,
--> 727         use_multiprocessing=use_multiprocessing)
    728 
    729   def evaluate(self,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    673         validation_steps=validation_steps,
    674         validation_freq=validation_freq,
--> 675         steps_name='steps_per_epoch')
    676 
    677   def evaluate(self,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py in model_iteration(model, inputs, targets, sample_weights, batch_size, epochs, verbose, callbacks, val_inputs, val_targets, val_sample_weights, shuffle, initial_epoch, steps_per_epoch, validation_steps, validation_freq, mode, validation_in_fit, prepared_feed_values_from_dataset, steps_name, **kwargs)
    187   # function we recompile the metrics based on the updated
    188   # sample_weight_mode value.
--> 189   f = _make_execution_function(model, mode)
    190 
    191   # Prepare validation data. Hold references to the iterator and the input list

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py in _make_execution_function(model, mode)
    564   if model._distribution_strategy:
    565     return distributed_training_utils._make_execution_function(model, mode)
--> 566   return model._make_execution_function(mode)
    567 
    568 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in _make_execution_function(self, mode)
   2181   def _make_execution_function(self, mode):
   2182     if mode == ModeKeys.TRAIN:
-> 2183       self._make_train_function()
   2184       return self.train_function
   2185     if mode == ModeKeys.TEST:

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in _make_train_function(self)
   2113           # Training updates
   2114           updates = self.optimizer.get_updates(
-> 2115               params=self._collected_trainable_weights, loss=self.total_loss)
   2116           # Unconditional updates
   2117           updates += self.get_updates_for(None)

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in get_updates(self, loss, params)
    496 
    497   def get_updates(self, loss, params):
--> 498     grads = self.get_gradients(loss, params)
    499     grads_and_vars = list(zip(grads, params))
    500     self._assert_valid_dtypes([

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in get_gradients(self, loss, params)
    387     with backend.get_graph().as_default(), backend.name_scope(self._name +
    388                                                               "/gradients"):
--> 389       grads = gradients.gradients(loss, params)
    390       for grad, param in zip(grads, params):
    391         if grad is None:

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_impl.py in gradients(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients)
    156         ys, xs, grad_ys, name, colocate_gradients_with_ops,
    157         gate_gradients, aggregation_method, stop_gradients,
--> 158         unconnected_gradients)
    159   # pylint: enable=protected-access
    160 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
    677                 # functions.
    678                 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 679                                          lambda: grad_fn(op, *out_grads))
    680               else:
    681                 # For function call ops, we add a 'SymbolicGradient'

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
    348       xla_scope = op.get_attr("_XlaScope").decode()
    349     except ValueError:
--> 350       return grad_fn()  # Exit early
    351 
    352   if not xla_compile:

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in <lambda>()
    677                 # functions.
    678                 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 679                                          lambda: grad_fn(op, *out_grads))
    680               else:
    681                 # For function call ops, we add a 'SymbolicGradient'

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _registered_grad_fn(op, *doutputs)
    713     @ops.RegisterGradient(self._gradient_name)
    714     def _registered_grad_fn(op, *doutputs):  # pylint: disable=unused-variable
--> 715       return self._rewrite_forward_and_call_backward(op, *doutputs)
    716     return self._gradient_name
    717 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _rewrite_forward_and_call_backward(self, op, *doutputs)
    659   def _rewrite_forward_and_call_backward(self, op, *doutputs):
    660     """Add outputs to the forward call and feed them to the grad function."""
--> 661     forward_function, backwards_function = self.forward_backward(len(doutputs))
    662     if not backwards_function.outputs:
    663       return []

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in forward_backward(self, num_doutputs)
    580     if forward_backward is not None:
    581       return forward_backward
--> 582     forward, backward = self._construct_forward_backward(num_doutputs)
    583     self._cached_function_pairs[num_doutputs] = (forward, backward)
    584     return forward, backward

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _construct_forward_backward(self, num_doutputs)
    627           args=[], kwargs={},
    628           signature=signature,
--> 629           func_graph=backwards_graph)
    630       backwards_graph_captures = backwards_graph.external_captures
    631       captures_from_forward = [

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    913                                           converted_func)
    914 
--> 915       func_outputs = python_func(*func_args, **func_kwargs)
    916 
    917       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _backprop_function(*grad_ys)
    617           self._func_graph.inputs,
    618           grad_ys=grad_ys,
--> 619           src_graph=self._func_graph)
    620 
    621     with self._func_graph.as_default():

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops, gate_gradients, aggregation_method, stop_gradients, unconnected_gradients, src_graph)
    677                 # functions.
    678                 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 679                                          lambda: grad_fn(op, *out_grads))
    680               else:
    681                 # For function call ops, we add a 'SymbolicGradient'

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in _MaybeCompile(scope, op, func, grad_fn)
    348       xla_scope = op.get_attr("_XlaScope").decode()
    349     except ValueError:
--> 350       return grad_fn()  # Exit early
    351 
    352   if not xla_compile:

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gradients_util.py in <lambda>()
    677                 # functions.
    678                 in_grads = _MaybeCompile(grad_scope, op, func_call,
--> 679                                          lambda: grad_fn(op, *out_grads))
    680               else:
    681                 # For function call ops, we add a 'SymbolicGradient'

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_grad.py in _SwitchGrad(op, *grad)
     86     false_grad = switch(grad[0], op.inputs[1])[0]
     87     true_grad = switch(grad[1], op.inputs[1])[1]
---> 88     return merge([false_grad, true_grad])[0], None
     89 
     90 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/control_flow_ops.py in merge(inputs, name)
    399         return gen_control_flow_ops.ref_merge(inputs, name)
    400       else:
--> 401         return gen_control_flow_ops.merge(inputs, name)
    402     else:
    403       # If there is a mix of tensors and indexed slices, then convert the

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_control_flow_ops.py in merge(inputs, name)
    444   _attr_N = len(inputs)
    445   _, _, _op = _op_def_lib._apply_op_helper(
--> 446         "Merge", inputs=inputs, name=name)
    447   _result = _op.outputs[:]
    448   _inputs_flat = _op.inputs

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/op_def_library.py in _apply_op_helper(self, op_type_name, name, **keywords)
    498                                 (prefix, dtype.name))
    499               else:
--> 500                 raise TypeError("%s that don't all match." % prefix)
    501             else:
    502               raise TypeError(

TypeError: Tensors in list passed to 'inputs' of 'Merge' Op have types [float32, resource] that don't all match.
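
The final frames point at the TF1-style batch-norm conditionals: with learning phase 1, each BatchNorm layer contributes cond/Switch ops, and the gradient builder ends up trying to merge a float32 gradient with a dtype=resource tensor. A workaround I have not verified would be to fold the batch norms into the preceding convolutions before building the quantsim, which removes those conditionals from the graph; a sketch, assuming the fold_all_batch_norms(sess, input_op_names, output_op_names) signature from aimet_tensorflow.batch_norm_fold:

from aimet_tensorflow.batch_norm_fold import fold_all_batch_norms

# fold batch norms into the preceding convolutions before constructing
# QuantizationSimModel; returns a new session plus the folded (conv, bn) pairs
sess, folded_pairs = fold_all_batch_norms(
    sess,
    input_op_names=[input_op.split(":")[0]],
    output_op_names=[output_op.split(":")[0]],
)
# ...then build quantsim.QuantizationSimModel from the folded session as above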

— btuan, Jul 22, 2021

@btuan Thank you for reporting this issue. @quic-hanwxion, could you take a quick look at this?

— quic-ssiddego, Jul 26, 2021

Hi @btuan, apologies for not getting back on this earlier. At the moment we have only tested the recipe for AIMET compression support for Keras models; we have yet to test and extend support for QAT with Keras models. Please track this issue for updates: https://github.com/quic/aimet/issues/765
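
In the meantime, fine-tuning directly in the quantsim session, without converting to Keras, may be an option. A rough, untested sketch; the labels placeholder and loss wiring below are illustrative assumptions, not AIMET API:

import tensorflow as tf

graph = sim.session.graph
with graph.as_default():
    inputs = graph.get_tensor_by_name(input_op)
    logits = graph.get_tensor_by_name(output_op)
    # illustrative TF1-style loss and optimizer; substitute your own
    labels = tf.compat.v1.placeholder(tf.float32, shape=logits.shape)
    loss = tf.reduce_mean(
        tf.keras.losses.categorical_crossentropy(labels, logits))
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-5)
    train_op = optimizer.minimize(loss)
    # initialize only the optimizer's slot variables, not the trained weights
    sim.session.run(tf.compat.v1.variables_initializer(optimizer.variables()))

# one fine-tuning step on a batch (x_batch, y_batch):
# sim.session.run(train_op, feed_dict={inputs: x_batch, labels: y_batch})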

— quic-ssiddego, Oct 06, 2021

Closing this issue due to inactivity. Please re-open it or create a new issue if you need further help.

— quic-mangal, Apr 04, 2023