
Keras Tuner stopped at the end of the first epoch when training on TPU

Open · qlzh727 opened this issue 4 years ago · 20 comments

This was discovered while testing https://www.kaggle.com/kivlichangoogle/jigsaw-multilingual-getting-started. Note that this issue only happens when TPU is enabled for the notebook.

When Keras Tuner is added, the search process fails at the start of the second epoch, while saving/loading the checkpoint. I think the root cause is in TF and TPU, but a temporary workaround in Keras Tuner would be nice, e.g. disabling checkpointing if possible.
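
For context, here is a hypothetical reconstruction of the failing setup (the tuner type, hypermodel name, and objective are assumptions; the directory and project name are inferred from the checkpoint path in the trace below):

import kerastuner as kt

tuner = kt.Hyperband(
    build_model,                    # hypermodel-building function (assumed name)
    objective='val_accuracy',       # assumed objective
    max_epochs=10,                  # assumed budget
    directory='keras-tuner-dir',    # local path; TPU workers cannot write here
    project_name='jigsaw-multilingual',
)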

The error stack trace is below:

---------------------------------------------------------------------------
UnimplementedError                        Traceback (most recent call last)
<ipython-input-14-a9734e750f48> in <module>
      6              verbose=1,
      7              validation_data=nonenglish_val_datasets['Combined'],
----> 8              validation_steps=100)

/opt/conda/lib/python3.7/site-packages/kerastuner/engine/base_tuner.py in search(self, *fit_args, **fit_kwargs)
    128 
    129             self.on_trial_begin(trial)
--> 130             self.run_trial(trial, *fit_args, **fit_kwargs)
    131             self.on_trial_end(trial)
    132         self.on_search_end()

/opt/conda/lib/python3.7/site-packages/kerastuner/engine/multi_execution_tuner.py in run_trial(self, trial, *fit_args, **fit_kwargs)
     94 
     95             model = self.hypermodel.build(trial.hyperparameters)
---> 96             history = model.fit(*fit_args, **copied_fit_kwargs)
     97             for metric, epoch_values in history.history.items():
     98                 if self.oracle.objective.direction == 'min':

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    817         max_queue_size=max_queue_size,
    818         workers=workers,
--> 819         use_multiprocessing=use_multiprocessing)
    820 
    821   def evaluate(self,

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    340                 mode=ModeKeys.TRAIN,
    341                 training_context=training_context,
--> 342                 total_epochs=epochs)
    343             cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
    344 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
    126         step=step, mode=mode, size=current_batch_size) as batch_logs:
    127       try:
--> 128         batch_outs = execution_function(iterator)
    129       except (StopIteration, errors.OutOfRangeError):
    130         # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
     96     # `numpy` translates Tensors to values in Eager mode.
     97     return nest.map_structure(_non_none_constant_value,
---> 98                               distributed_function(input_fn))
     99 
    100   return execution_function

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/util/nest.py in map_structure(func, *structure, **kwargs)
    566 
    567   return pack_sequence_as(
--> 568       structure[0], [func(*x) for x in entries],
    569       expand_composites=expand_composites)
    570 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/util/nest.py in <listcomp>(.0)
    566 
    567   return pack_sequence_as(
--> 568       structure[0], [func(*x) for x in entries],
    569       expand_composites=expand_composites)
    570 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in _non_none_constant_value(v)
    128 
    129 def _non_none_constant_value(v):
--> 130   constant_value = tensor_util.constant_value(v)
    131   return constant_value if constant_value is not None else v
    132

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/tensor_util.py in constant_value(tensor, partial)
    820   """
    821   if isinstance(tensor, ops.EagerTensor):
--> 822     return tensor.numpy()
    823   if not is_tensor(tensor):
    824     return tensor

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in numpy(self)
    940     """
    941     # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
--> 942     maybe_arr = self._numpy()  # pylint: disable=protected-access
    943     return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
    944 

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in _numpy(self)
    908       return self._numpy_internal()
    909     except core._NotOkStatusException as e:
--> 910       six.raise_from(core._status_to_exception(e.code, e.message), None)
    911 
    912   @property

/opt/conda/lib/python3.7/site-packages/six.py in raise_from(value, from_value)

UnimplementedError: File system scheme '[local]' not implemented (file: 'keras-tuner-dir/jigsaw-multilingual/trial_5a9ddcb2f29ca8ba91966d1ca1862a84/checkpoints/epoch_0/checkpoint_temp_59890bc9cb8a40159f0a31dc22a070ff')
	Encountered when executing an operation using EagerExecutor. This error cancels all future operations and poisons their output tensors.

qlzh727 · May 29 '20

Yes, same for me.

Shubhamai · Jun 04 '20

Same problem here. Have you found any workaround?

MagdyIbrahim87 · Jun 15 '20

Nope @MagdyIbrahim87

Shubhamai · Jun 15 '20

I tried with GPU and it works properly; TPU works only when I changed the file path to a GCS bucket. I have a question: if the system shuts down after the tuner has tested part of the trials, can I resume the remaining trials, or does it have to start from scratch?

MagdyIbrahim87 · Jun 15 '20
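
For reference, a minimal sketch of that GCS workaround (the bucket name and all model/data names are placeholders). Since overwrite defaults to False, pointing the tuner at the same directory after a shutdown should reload the completed trials and resume the remaining ones:

import kerastuner as kt

tuner = kt.Hyperband(
    build_model,                                 # hypermodel-building function (placeholder)
    objective='val_accuracy',
    max_epochs=10,
    directory='gs://my-bucket/keras-tuner-dir',  # GCS path the TPU workers can reach
    project_name='jigsaw-multilingual',
    overwrite=False,                             # keep existing trials so a restart resumes them
)

# Re-running this after an interruption picks up where the search left off.
tuner.search(X_train, y_train, epochs=10, validation_data=(X_val, y_val))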

Same problem here!

@MagdyIbrahim87 can you show me how you did that? I'm still studying cloud services.

damhurmuller · Jun 23 '20

#369 should have fixed this issue. Before Keras Tuner's next release, you can try pip install git+git://github.com/keras-team/keras-tuner@master

yixingfu · Jul 31 '20

> #369 should have fixed this issue. Before Keras Tuner's next release, you can try pip install git+git://github.com/keras-team/keras-tuner@master

With TF 2.2 GPU (Ubuntu) I can construct Hyperband. However, while searching I get a TPU issue. Have you tested it?

turgut090 · Jul 31 '20

@henry090 is it the same issue (local file system unimplemented, happening when checkpointing)?

yixingfu · Jul 31 '20

@yixingfu This is what I get:

  AttributeError: module 'tensorflow._api.v2.distribute' has no attribute 'TPUStrategy' 

turgut090 · Jul 31 '20

> @yixingfu This is what I get:
>
> AttributeError: module 'tensorflow._api.v2.distribute' has no attribute 'TPUStrategy'

@henry090 I see. That is a slightly different issue. This fix is based on TF 2.3, since TF 2.3 is already released. In TF 2.3, TPUStrategy is under tf.distribute, but in TF 2.2 it is under tf.distribute.experimental. Can you try your code on TF 2.3?

This will be off topic here, but to use TF 2.3 on TPUs you need:

import tensorflow as tf  # for tf.__version__ below
from cloud_tpu_client import Client

# Pin the TPU runtime to the notebook's TF version; restart only if needed.
c = Client()
c.configure_tpu_version(tf.__version__, restart_type='ifNeeded')

yixingfu · Jul 31 '20
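
For reference, a version-tolerant sketch of the TPU setup under discussion (it assumes a Colab/Kaggle-style TPU that the default TPUClusterResolver can locate):

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

# TF 2.3+ exposes TPUStrategy directly; TF 2.2 only has the experimental alias.
if hasattr(tf.distribute, 'TPUStrategy'):
    strategy = tf.distribute.TPUStrategy(resolver)
else:
    strategy = tf.distribute.experimental.TPUStrategy(resolver)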

> Can you try your code on TF 2.3?

It works now on my local machine with TF 2.3 GPU (Ubuntu).

> This will be off topic here, but to use TF 2.3 on TPUs you need

This helped fix the error, after running the following:

# R (reticulate): connect to and initialize the Colab TPU
resolver = tf$distribute$cluster_resolver$TPUClusterResolver(tpu = paste('grpc://', Sys.getenv('COLAB_TPU_ADDR'), sep = ''))
tf$config$experimental_connect_to_cluster(resolver)
tf$tpu$experimental$initialize_tpu_system(resolver)

However, while searching I get the following:

Error in py_call_impl(callable, dots$args, dots$keywords): UnimplementedError: File system scheme '[local]' not implemented (file: 'results_k/mnist/trial_bbff097a2e59cf959f53d633b4f9457c/checkpoints/epoch_0/checkpoint_temp_fba1425e7e7644a0848d31c4f8ce019b/part-00000-of-00001')
	Encountered when executing an operation using EagerExecutor. This error cancels all future operations and poisons their output tensors.

turgut090 · Jul 31 '20

Can you share a Colab so I can take a look?

yixingfu · Jul 31 '20

@yixingfu Sorry for the late reply, https://colab.research.google.com/drive/1tQuB6v-_b09lQQN7EkQeV9CzvndCO6SZ?usp=sharing

turgut090 · Aug 01 '20

> @yixingfu Sorry for the late reply, https://colab.research.google.com/drive/1tQuB6v-_b09lQQN7EkQeV9CzvndCO6SZ?usp=sharing

You need to use strategy = tf.distribute.TPUStrategy() instead of tf.distribute.experimental.TPUStrategy(). That being said, it looks like the checkpointing callbacks are indeed trying to save h5 files (which should solve the TPU local-file problem), but for some reason the checkpoints are not actually being saved here. I may need to look into this a bit more, but I did get it working on Colab. I will share the example later after I edit it a bit.

yixingfu · Aug 02 '20

> You need to use strategy = tf.distribute.TPUStrategy() instead of tf.distribute.experimental.TPUStrategy()

This now gets stuck here: https://colab.research.google.com/drive/1tQuB6v-_b09lQQN7EkQeV9CzvndCO6SZ?usp=sharing [screenshot from 2020-08-02 attached]

turgut090 · Aug 02 '20

@yixingfu any news?

turgut090 · Aug 04 '20

> @yixingfu any news?

I'm still wondering about this. It looks like the checkpoint is not being saved.

This gist shows a working example of using Keras Tuner directly. Not sure why it is failing through the R wrapper.

yixingfu · Aug 04 '20

> This gist shows

Thanks for this. In fact, it was working even last time: from RStudio I am able to see results, so it is the Jupyter notebook that is the problem. I just ran it for 1 epoch and could get results. However, an increase in the epoch number on TPU did not decrease the time spent on computation. Strange.

turgut090 · Aug 04 '20

Keras Tuner Hyperband throws the following error:

UnimplementedError: ./untitled_project; Operation not supported

medha-chippa · Jan 23 '22

To add ...

I don't use Google Colab, but Kaggle. Using TPU, I get the same error, "File system scheme '[local]' not implemented", when the tuner tries to write the checkpoints to Kaggle's working directory.

Since I don't have a gs:// location, I just "modified" the function called by Keras Tuner to save checkpoints, to allow writing to a local dir, which is the Kaggle working directory. I used patch() to mock the function.

The first important thing is that Keras Tuner must be version 1.1.2 or above.

Example:

from mock import patch
import tensorflow as tf

<your code>

# The new function to "replace" the existing one
# (keras_tuner.engine.tuner_utils.SaveBestEpoch.on_epoch_end):

def new_on_epoch_end(self, epoch, logs=None):
    if not self.objective.has_value(logs):
        # Save on every epoch if metric value is not in the logs. Either no
        # objective is specified, or objective is computed and returned
        # after `fit()`.

        # ***** added: route the save through the local host so the TPU
        # ***** can write to Kaggle's working directory
        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')

        # added ', options=save_locally' to the original line below
        self.model.save_weights(self.filepath, options=save_locally)
        return
    current_value = self.objective.get_value(logs)
    if self.objective.better_than(current_value, self.best_value):
        self.best_value = current_value

        # ***** added: same local-save option as above
        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')

        # added ', options=save_locally' to the original line below
        self.model.save_weights(self.filepath, options=save_locally)


with patch('keras_tuner.engine.tuner_utils.SaveBestEpoch.on_epoch_end', new_on_epoch_end):
    # Perform hypertuning. The parameters are exactly like those in the fit() method.
    tuner.search(
        X_train,
        y_train,
        epochs=num_of_epochs,
        validation_data=(X_valid, y_valid),
        callbacks=[early_stopping],
    )

<more of your code>

Since I used 'with patch', after everything is done it reverts back to the original code automatically.

josephramon · May 23 '22