aws-ml-jp
aws-ml-jp copied to clipboard
SageMaker Local Mode does not work on AL2 (Amazon Linux 2)
https://github.com/aws-samples/aws-ml-jp/blob/main/sagemaker/hpo-pytorch-mnist/pytorch_mnist.ipynb
Creating 9ajrhyi7nk-algo-1-1nstb ...
Creating 9ajrhyi7nk-algo-1-1nstb ... done
Attaching to 9ajrhyi7nk-algo-1-1nstb
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,859 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,863 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,872 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,875 sagemaker_pytorch_container.training INFO Invoking user training script.
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:22,009 sagemaker-training-toolkit ERROR Reporting training FAILURE
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:22,009 sagemaker-training-toolkit ERROR framework error:
9ajrhyi7nk-algo-1-1nstb | Traceback (most recent call last):
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/trainer.py", line 85, in train
9ajrhyi7nk-algo-1-1nstb | entrypoint()
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/sagemaker_pytorch_container/training.py", line 121, in main
9ajrhyi7nk-algo-1-1nstb | train(environment.Environment())
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/sagemaker_pytorch_container/training.py", line 73, in train
9ajrhyi7nk-algo-1-1nstb | runner_type=runner_type)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/entry_point.py", line 92, in run
9ajrhyi7nk-algo-1-1nstb | files.download_and_extract(uri=uri, path=environment.code_dir)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/files.py", line 131, in download_and_extract
9ajrhyi7nk-algo-1-1nstb | s3_download(uri, dst)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/files.py", line 167, in s3_download
9ajrhyi7nk-algo-1-1nstb | s3.Bucket(bucket).download_file(key, dst)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/boto3/s3/inject.py", line 247, in bucket_download_file
9ajrhyi7nk-algo-1-1nstb | ExtraArgs=ExtraArgs, Callback=Callback, Config=Config)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/boto3/s3/inject.py", line 173, in download_file
9ajrhyi7nk-algo-1-1nstb | extra_args=ExtraArgs, callback=Callback)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/boto3/s3/transfer.py", line 315, in download_file
9ajrhyi7nk-algo-1-1nstb | future.result()
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/s3transfer/futures.py", line 106, in result
9ajrhyi7nk-algo-1-1nstb | return self._coordinator.result()
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/s3transfer/futures.py", line 265, in result
9ajrhyi7nk-algo-1-1nstb | raise self._exception
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/s3transfer/tasks.py", line 255, in _main
9ajrhyi7nk-algo-1-1nstb | self._submit(transfer_future=transfer_future, **kwargs)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/s3transfer/download.py", line 343, in _submit
9ajrhyi7nk-algo-1-1nstb | **transfer_future.meta.call_args.extra_args
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/botocore/client.py", line 391, in _api_call
9ajrhyi7nk-algo-1-1nstb | return self._make_api_call(operation_name, kwargs)
9ajrhyi7nk-algo-1-1nstb | File "/opt/conda/lib/python3.6/site-packages/botocore/client.py", line 719, in _make_api_call
9ajrhyi7nk-algo-1-1nstb | raise error_class(parsed_response, operation_name)
9ajrhyi7nk-algo-1-1nstb | botocore.exceptions.ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden
9ajrhyi7nk-algo-1-1nstb |
9ajrhyi7nk-algo-1-1nstb | An error occurred (403) when calling the HeadObject operation: Forbidden
9ajrhyi7nk-algo-1-1nstb exited with code 1
1
Aborting on container exit...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
247 try:
--> 248 _stream_output(process)
249 except RuntimeError as e:
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/image.py in _stream_output(process)
915 if exit_code != 0:
--> 916 raise RuntimeError("Process exited with code: %s" % exit_code)
917
RuntimeError: Process exited with code: 1
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_538/1942905621.py in <cell line: 1>()
----> 1 estimator.fit({'training': inputs})
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/workflow/pipeline_context.py in wrapper(*args, **kwargs)
246 return self_instance.sagemaker_session.context
247
--> 248 return run_func(*args, **kwargs)
249
250 return wrapper
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
1100 self._prepare_for_training(job_name=job_name)
1101
-> 1102 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
1103 self.jobs.append(self.latest_training_job)
1104 if wait:
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
2002 train_args = cls._get_train_args(estimator, inputs, experiment_config)
2003
-> 2004 estimator.sagemaker_session.train(**train_args)
2005
2006 return cls(estimator.sagemaker_session, estimator._current_job_name)
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
611 self.sagemaker_client.create_training_job(**request)
612
--> 613 self._intercept_create_request(train_request, submit, self.train.__name__)
614
615 def _get_train_request( # noqa: C901
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/session.py in _intercept_create_request(self, request, create, func_name)
4315 func_name (str): the name of the function needed intercepting
4316 """
-> 4317 return create(request)
4318
4319
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/session.py in submit(request)
609 LOGGER.info("Creating training-job with name: %s", job_name)
610 LOGGER.debug("train request: %s", json.dumps(request, indent=4))
--> 611 self.sagemaker_client.create_training_job(**request)
612
613 self._intercept_create_request(train_request, submit, self.train.__name__)
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, Environment, **kwargs)
192 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
193 logger.info("Starting training job")
--> 194 training_job.start(
195 InputDataConfig, OutputDataConfig, hyperparameters, Environment, TrainingJobName
196 )
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
241 self.environment = environment
242
--> 243 self.model_artifacts = self.container.train(
244 input_data_config, output_data_config, hyperparameters, environment, job_name
245 )
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
251 # which contains the exit code and append the command line to it.
252 msg = "Failed to run: %s, %s" % (compose_command, str(e))
--> 253 raise RuntimeError(msg)
254 finally:
255 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpqxy0jzs6/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1
The same error also occurs with the notebook provided in the official repository: https://github.com/aws/amazon-sagemaker-examples/blob/main/frameworks/pytorch/get_started_mnist_train.ipynb