keras-tuner
Keras Tuner stops at the end of the first epoch when training on TPU
This was discovered while testing https://www.kaggle.com/kivlichangoogle/jigsaw-multilingual-getting-started. Note that the issue only happens when TPU is enabled for the notebook.
When Keras Tuner is added, the search fails at the start of the second epoch, while saving/loading the checkpoint: the TPU workers can only read and write GCS (gs://) paths, not the notebook VM's local disk. I think the root cause is in TF and the TPU runtime, but a temporary workaround in Keras Tuner would be nice, e.g., disabling checkpointing if possible.
The stack trace is below:
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
<ipython-input-14-a9734e750f48> in <module>
6 verbose=1,
7 validation_data=nonenglish_val_datasets['Combined'],
----> 8 validation_steps=100)
/opt/conda/lib/python3.7/site-packages/kerastuner/engine/base_tuner.py in search(self, *fit_args, **fit_kwargs)
128
129 self.on_trial_begin(trial)
--> 130 self.run_trial(trial, *fit_args, **fit_kwargs)
131 self.on_trial_end(trial)
132 self.on_search_end()
/opt/conda/lib/python3.7/site-packages/kerastuner/engine/multi_execution_tuner.py in run_trial(self, trial, *fit_args, **fit_kwargs)
94
95 model = self.hypermodel.build(trial.hyperparameters)
---> 96 history = model.fit(*fit_args, **copied_fit_kwargs)
97 for metric, epoch_values in history.history.items():
98 if self.oracle.objective.direction == 'min':
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
340 mode=ModeKeys.TRAIN,
341 training_context=training_context,
--> 342 total_epochs=epochs)
343 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
344
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
126 step=step, mode=mode, size=current_batch_size) as batch_logs:
127 try:
--> 128 batch_outs = execution_function(iterator)
129 except (StopIteration, errors.OutOfRangeError):
130 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
96 # `numpy` translates Tensors to values in Eager mode.
97 return nest.map_structure(_non_none_constant_value,
---> 98 distributed_function(input_fn))
99
100 return execution_function
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/util/nest.py in map_structure(func, *structure, **kwargs)
566
567 return pack_sequence_as(
--> 568 structure[0], [func(*x) for x in entries],
569 expand_composites=expand_composites)
570
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/util/nest.py in <listcomp>(.0)
566
567 return pack_sequence_as(
--> 568 structure[0], [func(*x) for x in entries],
569 expand_composites=expand_composites)
570
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in _non_none_constant_value(v)
128
129 def _non_none_constant_value(v):
--> 130 constant_value = tensor_util.constant_value(v)
131 return constant_value if constant_value is not None else v
132
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/tensor_util.py in constant_value(tensor, partial)
820 """
821 if isinstance(tensor, ops.EagerTensor):
--> 822 return tensor.numpy()
823 if not is_tensor(tensor):
824 return tensor
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in numpy(self)
940 """
941 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
--> 942 maybe_arr = self._numpy() # pylint: disable=protected-access
943 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
944
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in _numpy(self)
908 return self._numpy_internal()
909 except core._NotOkStatusException as e:
--> 910 six.raise_from(core._status_to_exception(e.code, e.message), None)
911
912 @property
/opt/conda/lib/python3.7/site-packages/six.py in raise_from(value, from_value)
UnimplementedError: File system scheme '[local]' not implemented (file: 'keras-tuner-dir/jigsaw-multilingual/trial_5a9ddcb2f29ca8ba91966d1ca1862a84/checkpoints/epoch_0/checkpoint_temp_59890bc9cb8a40159f0a31dc22a070ff')
Encountered when executing an operation using EagerExecutor. This error cancels all future operations and poisons their output tensors.
Yes, same for me.
Same problem here. Have you found any workaround?
Nope @MagdyIbrahim87
I tried with GPU and it works properly; TPU only works once I changed the file path to a GCS bucket. I also have a question: after the tuner has run part of the trials, if the system shuts down, can I resume the remaining trials, or does it start from scratch?
Same problem here!
@MagdyIbrahim87 Can you show me how you did that? I'm still learning cloud services.
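Not an official fix, but a minimal sketch of that GCS workaround: point the tuner's directory at a bucket the TPU workers can reach. The bucket name, hypermodel, and objective below are placeholders, and the import name assumes a recent Keras Tuner release:
import keras_tuner as kt

tuner = kt.Hyperband(
    build_model,                # your hypermodel-building function
    objective='val_accuracy',
    max_epochs=10,
    directory='gs://your-bucket/keras-tuner-dir',  # hypothetical bucket
    project_name='jigsaw-multilingual',
    overwrite=False,  # default; lets the tuner reload finished trials
)
As far as I know, this also answers the resume question: with overwrite=False, the oracle state and completed trials are reloaded from directory, so an interrupted search picks up where it left off rather than starting from scratch.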
#369 should have fixed this issue. Before Keras Tuner's next release, you can try:
pip install git+git://github.com/keras-team/keras-tuner@master
With TF 2.2 GPU (Ubuntu) I can construct Hyperband. However, while searching I get a TPU issue. Have you tested it?
@henry090 is it the same issue (local file system unimplemented, happening when checkpointing)?
@yixingfu This is what I get:
AttributeError: module 'tensorflow._api.v2.distribute' has no attribute 'TPUStrategy'
@henry090 I see. That is a slightly different issue. This fix is based on TF 2.3, since TF 2.3 has already been released. In TF 2.3, TPUStrategy lives under tf.distribute, but in TF 2.2 it is under tf.distribute.experimental (see the sketch after the snippet below). Can you try your code on TF 2.3?
This will be off topic here, but to use TF 2.3 on TPUs you need:
import tensorflow as tf
from cloud_tpu_client import Client

# Pin the TPU runtime to the notebook's TF version, restarting only if needed.
c = Client()
c.configure_tpu_version(tf.__version__, restart_type='ifNeeded')
It works now on my local machine. TF GPU 2.3 (Ubuntu)
That helped to fix the error, together with the following (R, via reticulate):
resolver = tf$distribute$cluster_resolver$TPUClusterResolver(tpu=paste('grpc://',Sys.getenv('COLAB_TPU_ADDR'),sep='' ))
tf$config$experimental_connect_to_cluster(resolver)
tf$tpu$experimental$initialize_tpu_system(resolver)
However, while searching I get the following.
Error in py_call_impl(callable, dots$args, dots$keywords): UnimplementedError: File system scheme '[local]' not implemented (file: 'results_k/mnist/trial_bbff097a2e59cf959f53d633b4f9457c/checkpoints/epoch_0/checkpoint_temp_fba1425e7e7644a0848d31c4f8ce019b/part-00000-of-00001')
Encountered when executing an operation using EagerExecutor. This error cancels all future operations and poisons their output tensors.
Can you share a colab so I can take a look?
@yixingfu Sorry for the late reply, https://colab.research.google.com/drive/1tQuB6v-_b09lQQN7EkQeV9CzvndCO6SZ?usp=sharing
You need to use strategy = tf.distribute.TPUStrategy() instead of tf.distribute.experimental.TPUStrategy(). That being said, it looks like the checkpointing callbacks are indeed trying to save h5 files (which should solve the TPU local file problem), but for some reason the checkpoints are not actually being saved here. I may need to look into this a bit more, but I did get it working on Colab. I will share the example later after I edit it a bit.
This now gets stuck here:
https://colab.research.google.com/drive/1tQuB6v-_b09lQQN7EkQeV9CzvndCO6SZ?usp=sharing
@yixingfu any news?
I'm still puzzling over this. It looks like the checkpoint is not being saved.
This gist shows a working example of using Keras Tuner directly. Not sure why it is failing using the R wrapper.
Thanks for this. In fact, it was working even last time: from RStudio I am able to see the results, so the problem was with the Jupyter notebook. I ran it for just one epoch and could get results. However, increasing the number of epochs on TPU did not decrease the computation time. Strange.
Keras Tuner Hyperband throws the following error:
UnimplementedError: ./untitled_project; Operation not supported
To add to this: I don't use Google Colab, but Kaggle. Using TPU, I get the same error, "File system scheme '[local]' not implemented", when the tuner tries to write checkpoints to Kaggle's working directory.
Since I don't have a gs:// location, I just "modified" the function Keras Tuner calls to save checkpoints so that it writes to the local directory, i.e., the Kaggle working directory. I used patch() to mock the function.
The first important thing is that Keras Tuner must be version 1.1.2 or above.
Example:

from mock import patch
import tensorflow as tf

# <your code>

# The new function that "replaces" the existing one
# (keras_tuner.engine.tuner_utils.SaveBestEpoch.on_epoch_end):
def new_on_epoch_end(self, epoch, logs=None):
    if not self.objective.has_value(logs):
        # Save on every epoch if metric value is not in the logs. Either no
        # objective is specified, or objective is computed and returned
        # after `fit()`.
        # ***** added: route the checkpoint file I/O through the local host *
        save_locally = tf.saved_model.SaveOptions(
            experimental_io_device='/job:localhost')
        # ***** added ', options=save_locally' to the original line below ***
        self.model.save_weights(self.filepath, options=save_locally)
        return
    current_value = self.objective.get_value(logs)
    if self.objective.better_than(current_value, self.best_value):
        self.best_value = current_value
        # ***** same two additions as above *********************************
        save_locally = tf.saved_model.SaveOptions(
            experimental_io_device='/job:localhost')
        self.model.save_weights(self.filepath, options=save_locally)

with patch('keras_tuner.engine.tuner_utils.SaveBestEpoch.on_epoch_end',
           new_on_epoch_end):
    # Perform hypertuning. The parameters are exactly like those of fit().
    tuner.search(
        X_train,
        y_train,
        epochs=num_of_epochs,
        validation_data=(X_valid, y_valid),
        callbacks=[early_stopping],
    )

# <more of your code>
Since I used 'with patch', everything reverts to the original code automatically once the block exits.
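For context on why this works: tf.saved_model.SaveOptions(experimental_io_device='/job:localhost') tells TensorFlow to run the checkpoint file I/O on the local host process instead of the TPU workers, which only understand gs:// paths, so the '[local]' file system scheme is never hit on the TPU side.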