sagemaker-debugger
sagemaker-debugger copied to clipboard
tensorflow_datasets failed to load dataset with data_dir="s3://<sagemaker-bucket>" in sagemaker notebook instance
I have been looking for an example of using tensorflow_datasets in SageMaker and found the following source: https://github.com/awslabs/sagemaker-debugger/blob/master/tests/tensorflow/test_keras_to_estimator.py
I believe this code (test_keras_to_estimator.py) works well when loading data to S3, yet a similar tfds.load call fails for me. Could you shed some light on why?
Thanks in advance
Btw, I am using tensorflow 2.3.0 with tensorflow_datasets 3.2.1.
The error I get is shown below.
>>> dataset = tfds.load("iris", data_dir=s3_data_dir, split='train', as_supervised=True)
Downloading and preparing dataset iris/2.0.0 (download: 4.44 KiB, generated: Unknown size, total: 4.44 KiB) to s3://sagemaker-my-default-bucket/tensorflow_datasets/iris/2.0.0...
---------------------------------------------------------------------------
AbortedError Traceback (most recent call last)
<ipython-input-5-9439fd899962> in <module>
1 # data, info = tfds.load('c4', data_dir=s3_data_dir)
----> 2 data, info = tfds.load('iris', data_dir=s3_data_dir)
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow_datasets/core/api_utils.py in disallow_positional_args_dec(fn, instance, args, kwargs)
67 _check_no_positional(fn, args, ismethod, allowed=allowed)
68 _check_required(fn, kwargs)
---> 69 return fn(*args, **kwargs)
70
71 return disallow_positional_args_dec(wrapped) # pylint: disable=no-value-for-parameter
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow_datasets/core/registered.py in load(name, split, data_dir, batch_size, shuffle_files, download, as_supervised, decoders, read_config, with_info, builder_kwargs, download_and_prepare_kwargs, as_dataset_kwargs, try_gcs)
369 if download:
370 download_and_prepare_kwargs = download_and_prepare_kwargs or {}
--> 371 dbuilder.download_and_prepare(**download_and_prepare_kwargs)
372
373 if as_dataset_kwargs is None:
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow_datasets/core/api_utils.py in disallow_positional_args_dec(fn, instance, args, kwargs)
67 _check_no_positional(fn, args, ismethod, allowed=allowed)
68 _check_required(fn, kwargs)
---> 69 return fn(*args, **kwargs)
70
71 return disallow_positional_args_dec(wrapped) # pylint: disable=no-value-for-parameter
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow_datasets/core/dataset_builder.py in download_and_prepare(self, download_dir, download_config)
359 dl_manager = self._make_download_manager(
360 download_dir=download_dir,
--> 361 download_config=download_config)
362
363 # Create a tmp dir and rename to self._data_dir on successful exit.
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow_datasets/core/dataset_builder.py in _make_download_manager(self, download_dir, download_config)
784 force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
785 force_checksums_validation=download_config.force_checksums_validation,
--> 786 register_checksums=download_config.register_checksums,
787 )
788
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow_datasets/core/api_utils.py in disallow_positional_args_dec(fn, instance, args, kwargs)
67 _check_no_positional(fn, args, ismethod, allowed=allowed)
68 _check_required(fn, kwargs)
---> 69 return fn(*args, **kwargs)
70
71 return disallow_positional_args_dec(wrapped) # pylint: disable=no-value-for-parameter
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow_datasets/core/download/download_manager.py in __init__(self, download_dir, extract_dir, manual_dir, manual_dir_instructions, dataset_name, force_download, force_extraction, force_checksums_validation, register_checksums)
205 self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
206 self._manual_dir_instructions = manual_dir_instructions
--> 207 tf.io.gfile.makedirs(self._download_dir)
208 tf.io.gfile.makedirs(self._extract_dir)
209 self._force_download = force_download
~/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages/tensorflow/python/lib/io/file_io.py in recursive_create_dir_v2(path)
478 errors.OpError: If the operation fails.
479 """
--> 480 _pywrap_file_io.RecursivelyCreateDir(compat.as_bytes(path))
481
482
AbortedError: All 10 retry attempts failed. The last failure: Unknown: : No response body.
Some new information:
In a SageMaker notebook instance or SageMaker Studio, tfds.load fails when it is executed from the terminal or from a notebook. However, it works fine when it runs inside a training job in script mode.