aws-ml-jp icon indicating copy to clipboard operation
aws-ml-jp copied to clipboard

SageMaker Local Mode does not work on Amazon Linux 2 (AL2)

Open harusametime opened this issue 3 years ago • 1 comment

Running the following notebook in SageMaker local mode fails with the error shown below: https://github.com/aws-samples/aws-ml-jp/blob/main/sagemaker/hpo-pytorch-mnist/pytorch_mnist.ipynb

Creating 9ajrhyi7nk-algo-1-1nstb ... 
Creating 9ajrhyi7nk-algo-1-1nstb ... done
Attaching to 9ajrhyi7nk-algo-1-1nstb
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,859 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,863 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,872 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:21,875 sagemaker_pytorch_container.training INFO     Invoking user training script.
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:22,009 sagemaker-training-toolkit ERROR    Reporting training FAILURE
9ajrhyi7nk-algo-1-1nstb | 2022-10-30 14:27:22,009 sagemaker-training-toolkit ERROR    framework error: 
9ajrhyi7nk-algo-1-1nstb | Traceback (most recent call last):
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/trainer.py", line 85, in train
9ajrhyi7nk-algo-1-1nstb |     entrypoint()
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/sagemaker_pytorch_container/training.py", line 121, in main
9ajrhyi7nk-algo-1-1nstb |     train(environment.Environment())
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/sagemaker_pytorch_container/training.py", line 73, in train
9ajrhyi7nk-algo-1-1nstb |     runner_type=runner_type)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/entry_point.py", line 92, in run
9ajrhyi7nk-algo-1-1nstb |     files.download_and_extract(uri=uri, path=environment.code_dir)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/files.py", line 131, in download_and_extract
9ajrhyi7nk-algo-1-1nstb |     s3_download(uri, dst)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/sagemaker_training/files.py", line 167, in s3_download
9ajrhyi7nk-algo-1-1nstb |     s3.Bucket(bucket).download_file(key, dst)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/boto3/s3/inject.py", line 247, in bucket_download_file
9ajrhyi7nk-algo-1-1nstb |     ExtraArgs=ExtraArgs, Callback=Callback, Config=Config)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/boto3/s3/inject.py", line 173, in download_file
9ajrhyi7nk-algo-1-1nstb |     extra_args=ExtraArgs, callback=Callback)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/boto3/s3/transfer.py", line 315, in download_file
9ajrhyi7nk-algo-1-1nstb |     future.result()
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/s3transfer/futures.py", line 106, in result
9ajrhyi7nk-algo-1-1nstb |     return self._coordinator.result()
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/s3transfer/futures.py", line 265, in result
9ajrhyi7nk-algo-1-1nstb |     raise self._exception
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/s3transfer/tasks.py", line 255, in _main
9ajrhyi7nk-algo-1-1nstb |     self._submit(transfer_future=transfer_future, **kwargs)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/s3transfer/download.py", line 343, in _submit
9ajrhyi7nk-algo-1-1nstb |     **transfer_future.meta.call_args.extra_args
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/botocore/client.py", line 391, in _api_call
9ajrhyi7nk-algo-1-1nstb |     return self._make_api_call(operation_name, kwargs)
9ajrhyi7nk-algo-1-1nstb |   File "/opt/conda/lib/python3.6/site-packages/botocore/client.py", line 719, in _make_api_call
9ajrhyi7nk-algo-1-1nstb |     raise error_class(parsed_response, operation_name)
9ajrhyi7nk-algo-1-1nstb | botocore.exceptions.ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden
9ajrhyi7nk-algo-1-1nstb | 
9ajrhyi7nk-algo-1-1nstb | An error occurred (403) when calling the HeadObject operation: Forbidden
9ajrhyi7nk-algo-1-1nstb exited with code 1
1
Aborting on container exit...
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
    247         try:
--> 248             _stream_output(process)
    249         except RuntimeError as e:

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/image.py in _stream_output(process)
    915     if exit_code != 0:
--> 916         raise RuntimeError("Process exited with code: %s" % exit_code)
    917 

RuntimeError: Process exited with code: 1

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_538/1942905621.py in <cell line: 1>()
----> 1 estimator.fit({'training': inputs})

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/workflow/pipeline_context.py in wrapper(*args, **kwargs)
    246             return self_instance.sagemaker_session.context
    247 
--> 248         return run_func(*args, **kwargs)
    249 
    250     return wrapper

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
   1100         self._prepare_for_training(job_name=job_name)
   1101 
-> 1102         self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
   1103         self.jobs.append(self.latest_training_job)
   1104         if wait:

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
   2002         train_args = cls._get_train_args(estimator, inputs, experiment_config)
   2003 
-> 2004         estimator.sagemaker_session.train(**train_args)
   2005 
   2006         return cls(estimator.sagemaker_session, estimator._current_job_name)

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
    611             self.sagemaker_client.create_training_job(**request)
    612 
--> 613         self._intercept_create_request(train_request, submit, self.train.__name__)
    614 
    615     def _get_train_request(  # noqa: C901

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/session.py in _intercept_create_request(self, request, create, func_name)
   4315             func_name (str): the name of the function needed intercepting
   4316         """
-> 4317         return create(request)
   4318 
   4319 

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/session.py in submit(request)
    609             LOGGER.info("Creating training-job with name: %s", job_name)
    610             LOGGER.debug("train request: %s", json.dumps(request, indent=4))
--> 611             self.sagemaker_client.create_training_job(**request)
    612 
    613         self._intercept_create_request(train_request, submit, self.train.__name__)

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, Environment, **kwargs)
    192         hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
    193         logger.info("Starting training job")
--> 194         training_job.start(
    195             InputDataConfig, OutputDataConfig, hyperparameters, Environment, TrainingJobName
    196         )

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
    241         self.environment = environment
    242 
--> 243         self.model_artifacts = self.container.train(
    244             input_data_config, output_data_config, hyperparameters, environment, job_name
    245         )

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
    251             # which contains the exit code and append the command line to it.
    252             msg = "Failed to run: %s, %s" % (compose_command, str(e))
--> 253             raise RuntimeError(msg)
    254         finally:
    255             artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)

RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpqxy0jzs6/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1

harusametime avatar Oct 31 '22 05:10 harusametime

The same error also occurs with a notebook provided in the official repository: https://github.com/aws/amazon-sagemaker-examples/blob/main/frameworks/pytorch/get_started_mnist_train.ipynb

harusametime avatar Oct 31 '22 05:10 harusametime