sagemaker-debugger icon indicating copy to clipboard operation
sagemaker-debugger copied to clipboard

Turn off debugger hooks in PyTorch?

Open austinmw opened this issue 4 years ago • 4 comments

Hi, I'm training with sagemaker using a custom docker image I created by extending the pytorch 1.6 training image with additional ssh settings for Horovod. I'm also using a custom estimator which I created by subclassing the pytorch estimator and adding a distribution parameter and configuration method.

When I launch single-node training everything works fine, but when I attempt to launch multi-node training, I get smdebug hook errors, even though I did not set any debugger rules. How can I turn off this functionality altogether?

[1,10]:  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 717, in _call_impl
[1,10]:    hook = smd_utils.get_smdebug_hook()
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/torch/utils/smdebug.py", line 35, in get_smdebug_hook
[1,10]:    return smd.get_hook(create_if_not_exists=True)
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/pytorch/singleton_utils.py", line 22, in get_hook
[1,10]:    create_if_not_exists=create_if_not_exists,
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/singleton_utils.py", line 60, in get_hook
[1,10]:    _create_hook(json_config_path, hook_class)
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/singleton_utils.py", line 28, in _create_hook
[1,10]:    hook = hook_class.create_from_json_file(json_file_path=json_config_path)
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 294, in create_from_json_file
[1,10]:    return create_hook_from_json_config(cls, json_config_path=json_file_path)
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/json_config.py", line 243, in create_hook_from_json_config
[1,10]:    save_all=save_all,
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/pytorch/hook.py", line 47, in __init__
[1,10]:    include_workers=include_workers,
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 955, in __init__
[1,10]:    include_workers=include_workers,
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 243, in __init__
[1,10]:    self._initialize_to_last_saved_state()
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 251, in _initialize_to_last_saved_state
[1,10]:    self.state_store = StateStore()
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 42, in __init__
[1,10]:    self._read_states_file()
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 76, in _read_states_file
[1,10]:    parameters = json.load(json_data)
[1,10]:  File "/opt/conda/lib/python3.6/json/__init__.py", line 299, in load
[1,10]:    parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
[1,10]:  File "/opt/conda/lib/python3.6/json/__init__.py", line 354, in loads
[1,10]:    return _default_decoder.decode(s)
[1,10]:  File "/opt/conda/lib/python3.6/json/decoder.py", line 342, in decode
[1,10]:    raise JSONDecodeError("Extra data", s, end)
[1,10]:json.decoder.JSONDecodeError: Extra data: line 1 column 1444 (char 1443)
[1,10]:Error in atexit._run_exitfuncs:
[1,10]:Traceback (most recent call last):
[1,10]:  File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 961, in _cleanup
[1,10]:    if not self.exported_collections:
[1,10]:AttributeError: 'Hook' object has no attribute 'exported_collections'

austinmw avatar Nov 17 '20 01:11 austinmw

Uninstalling smdebug from the image worked. Upgrading smdebug to the latest version did not work.

austinmw avatar Nov 17 '20 06:11 austinmw

Hi @austinmw. Thanks for reporting this. Can you please provide an example script to reproduce this?

Vikas-kum avatar Nov 19 '20 20:11 Vikas-kum

I used the custom framework paradigm, so it's a package of about 30 Python files that I'm unable to share. I could try to put together a minimal reproducible example, but it'd take a while.

The underlying model is partially based on https://github.com/ifzhang/FairMOT (using HRNet-18 backbone).

austinmw avatar Nov 20 '20 04:11 austinmw

I have the same problem with PyTorch in SageMaker. Although my code works correctly on my local device, the problem emerges when I run it in SageMaker.

AttributeError Traceback (most recent call last) in 15 # os.mkdir(PATH) 16 # shutil.copy(path,PATH) ---> 17 s = start_process(path,cp_path,isPath,isCuda) 18 K = s.pullThetrigger() 19 #report = open(os.path.join(PATH,"report.txt"),"w")

~/face_estimation_deployment/Invoke_AI.py in __init__(self, img_path, cp_path, iSpath, isCuda) 48 self.Cheek = "CHEEK" 49 self.requirement = ["CHINE","JAWLINE","SMOKELINE","KHATELABKHAND","AROUNDLIP","AROUNDEYE","FOREHEAD"] ---> 50 self.__check_value() 51 self.__resizer() 52 X = "numpy pandas gdown tqdm tensorflow keras mtcnn".split(" ")

~/face_estimation_deployment/Invoke_AI.py in __check_value(self) 63 subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) 64 def __check_value(self): ---> 65 if self.checker.checker()==False: 66 print("Essentioal part of Face isnt Find the program wont execute") 67 return

~/face_estimation_deployment/segment_Face.py in checker(self) 1111 def checker(self): 1112 requirment , parsing , _ = self.__start_PROCESS() -> 1113 ALLCLASSES = np.unique(parsing) 1114 for r in requirment: 1115 inner_correct = False

~/face_estimation_deployment/segment_Face.py in __start_PROCESS(self) 1107 img = img.cuda() 1108 out = net(img)[0] -> 1109 parsing = out.squeeze(0).cpu().numpy().argmax(0) 1110 return requirment,parsing,image 1111 def checker(self):

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs) 733 if is_hook_registration_suspended() is False: 734 import torch.utils.smdebug as smd_utils --> 735 hook = smd_utils.get_smdebug_hook() 736 if hook and not hook.has_registered_module: 737 hook.register_hook(self)

/opt/conda/lib/python3.6/site-packages/torch/utils/smdebug.py in get_smdebug_hook() 47 48 try: ---> 49 import smdebug.pytorch as smd 50 except ImportError: 51 return None

/opt/conda/lib/python3.6/site-packages/smdebug/pytorch/__init__.py in 2 from smdebug import ReductionConfig, SaveConfig, SaveConfigMode, modes 3 from smdebug.core.collection import CollectionKeys ----> 4 from smdebug.trials import create_trial 5 6 # Local

/opt/conda/lib/python3.6/site-packages/smdebug/trials/__init__.py in 1 # Local ----> 2 from .local_trial import LocalTrial 3 from .s3_trial import S3Trial 4 from .trial import Trial 5 from .utils import create_trial

/opt/conda/lib/python3.6/site-packages/smdebug/trials/local_trial.py in 4 # First Party 5 from smdebug.core.collection_manager import CollectionManager ----> 6 from smdebug.core.index_reader import LocalIndexReader 7 from smdebug.core.utils import get_path_to_collections, list_collection_files_in_directory 8

/opt/conda/lib/python3.6/site-packages/smdebug/core/index_reader.py in 21 from smdebug.core.modes import ModeKeys 22 from smdebug.core.s3_utils import list_s3_objects ---> 23 from smdebug.core.tfrecord.tensor_reader import TensorReader 24 from smdebug.core.utils import ( 25 get_path_to_events_directory,

/opt/conda/lib/python3.6/site-packages/smdebug/core/tfrecord/tensor_reader.py in 5 from smdebug.core.logger import get_logger 6 from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME, ModeKeys ----> 7 from smdebug.core.tfevent.event_file_reader import get_tensor_data 8 from smdebug.core.tfevent.proto.event_pb2 import Event 9 from smdebug.core.tfrecord.record_reader import masked_crc32c

/opt/conda/lib/python3.6/site-packages/smdebug/core/tfevent/event_file_reader.py in 22 23 # First Party ---> 24 import smdebug.core.tfevent.proto.types_pb2 as types_pb2 25 from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME, ModeKeys 26 from smdebug.core.tfrecord.record_reader import RecordReader

AttributeError: module 'smdebug' has no attribute 'core'

amirhRahimi1993 avatar Dec 12 '21 07:12 amirhRahimi1993