--- Logging error ---
Traceback (most recent call last):
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run
result = self._invoke_run(role)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run
time.sleep(monitor_interval)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 2890585 got signal: 1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/rich/logging.py", line 170, in emit
self.console.print(log_renderable)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/rich/console.py", line 1684, in print
render_options = self.options.update(
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/rich/console.py", line 982, in options
max_height=self.size.height,
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/rich/console.py", line 1002, in size
if self.is_dumb_terminal:
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/rich/console.py", line 974, in is_dumb_terminal
_term = self._environ.get("TERM", "")
File "/home/hhh/.conda/envs/python38/lib/python3.8/_collections_abc.py", line 660, in get
return self[key]
File "/home/hhh/.conda/envs/python38/lib/python3.8/os.py", line 672, in __getitem__
value = self._data[self.encodekey(key)]
File "/home/hhh/.conda/envs/python38/lib/python3.8/os.py", line 748, in encode
def encode(value):
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 2890585 got signal: 1
Call stack:
File "/home/hhh/.conda/envs/python38/bin/accelerate", line 8, in <module>
sys.exit(main())
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
args.func(args)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/accelerate/commands/launch.py", line 900, in launch_command
deepspeed_launcher(args)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/accelerate/commands/launch.py", line 643, in deepspeed_launcher
distrib_run.run(args)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent
result = agent.run()
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 729, in run
log.warning(f"Received {e.sigval} death signal, shutting down workers")
File "/home/hhh/.conda/envs/python38/lib/python3.8/logging/__init__.py", line 1458, in warning
self._log(WARNING, msg, args, **kwargs)
File "/home/hhh/.conda/envs/python38/lib/python3.8/logging/__init__.py", line 1589, in _log
self.handle(record)
File "/home/hhh/.conda/envs/python38/lib/python3.8/logging/__init__.py", line 1599, in handle
self.callHandlers(record)
File "/home/hhh/.conda/envs/python38/lib/python3.8/logging/__init__.py", line 1661, in callHandlers
hdlr.handle(record)
File "/home/hhh/.conda/envs/python38/lib/python3.8/logging/__init__.py", line 954, in handle
self.emit(record)
File "/home/hhh/.conda/envs/python38/lib/python3.8/site-packages/rich/logging.py", line 172, in emit
self.handleError(record)
Message: 'Received 1 death signal, shutting down workers'
Output when launched with `nohup ... &`:
[16:21:34] WARNING Received 1 death signal, shutting down workers api.py:729
WARNING Sending process 2928786 closing signal SIGHUP api.py:698
WARNING Sending process 2928787 closing signal SIGHUP api.py:698
WARNING Sending process 2928788 closing signal SIGHUP api.py:698
WARNING Sending process 2928789 closing signal SIGHUP api.py:698
WARNING Sending process 2928790 closing signal SIGHUP api.py:698
WARNING Sending process 2928791 closing signal SIGHUP api.py:698
WARNING Sending process 2928792 closing signal SIGHUP api.py:698
WARNING Sending process 2928793 closing signal SIGHUP api.py:698
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /home/hhh/.conda/envs/python38/bin/accelerate:8 in <module> │
│ │
│ 5 from accelerate.commands.accelerate_cli import main │
│ 6 if __name__ == '__main__': │
│ 7 │ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) │
│ ❱ 8 │ sys.exit(main()) │
│ 9 │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/accelera │
│ te/commands/accelerate_cli.py:45 in main │
│ │
│ 42 │ │ exit(1) │
│ 43 │ │
│ 44 │ # Run │
│ ❱ 45 │ args.func(args) │
│ 46 │
│ 47 │
│ 48 if __name__ == "__main__": │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/accelera │
│ te/commands/launch.py:900 in launch_command │
│ │
│ 897 │ │ if mp_from_config_flag: │
│ 898 │ │ │ args.deepspeed_fields_from_accelerate_config.append("mixed │
│ 899 │ │ args.deepspeed_fields_from_accelerate_config = ",".join(args.d │
│ ❱ 900 │ │ deepspeed_launcher(args) │
│ 901 │ elif args.use_fsdp and not args.cpu: │
│ 902 │ │ multi_gpu_launcher(args) │
│ 903 │ elif args.use_megatron_lm and not args.cpu: │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/accelera │
│ te/commands/launch.py:643 in deepspeed_launcher │
│ │
│ 640 │ │ ) │
│ 641 │ │ with patch_environment(**current_env): │
│ 642 │ │ │ try: │
│ ❱ 643 │ │ │ │ distrib_run.run(args) │
│ 644 │ │ │ except Exception: │
│ 645 │ │ │ │ if is_rich_available() and debug: │
│ 646 │ │ │ │ │ console = get_console() │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/di │
│ stributed/run.py:785 in run │
│ │
│ 782 │ │ ) │
│ 783 │ │
│ 784 │ config, cmd, cmd_args = config_from_args(args) │
│ ❱ 785 │ elastic_launch( │
│ 786 │ │ config=config, │
│ 787 │ │ entrypoint=cmd, │
│ 788 │ )(*cmd_args) │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/di │
│ stributed/launcher/api.py:134 in __call__ │
│ │
│ 131 │ │ self._entrypoint = entrypoint │
│ 132 │ │
│ 133 │ def __call__(self, *args): │
│ ❱ 134 │ │ return launch_agent(self._config, self._entrypoint, list(args) │
│ 135 │
│ 136 │
│ 137 def _get_entrypoint_name( │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/di │
│ stributed/launcher/api.py:241 in launch_agent │
│ │
│ 238 │ try: │
│ 239 │ │ metrics.initialize_metrics(metrics.MetricsConfig(config.metric │
│ 240 │ │ │
│ ❱ 241 │ │ result = agent.run() │
│ 242 │ │ # records that agent.run() has succeeded NOT that workers have │
│ 243 │ │ events.record(agent.get_event_succeeded()) │
│ 244 │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/di │
│ stributed/elastic/metrics/api.py:129 in wrapper │
│ │
│ 126 │ │ │ key = _get_metric_name(f) │
│ 127 │ │ │ try: │
│ 128 │ │ │ │ start = time.time() │
│ ❱ 129 │ │ │ │ result = f(*args, **kwargs) │
│ 130 │ │ │ │ put_metric(f"{key}.success", 1, group) │
│ 131 │ │ │ except Exception: │
│ 132 │ │ │ │ put_metric(f"{key}.failure", 1, group) │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/di │
│ stributed/elastic/agent/server/api.py:723 in run │
│ │
│ 720 │ │ start_time = time.monotonic() │
│ 721 │ │ shutdown_called: bool = False │
│ 722 │ │ try: │
│ ❱ 723 │ │ │ result = self._invoke_run(role) │
│ 724 │ │ │ self.total_execution_time = int(time.monotonic() - start │
│ 725 │ │ │ self._record_metrics(result) │
│ 726 │ │ │ self._record_worker_events(result) │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/di │
│ stributed/elastic/agent/server/api.py:864 in _invoke_run │
│ │
│ 861 │ │ │
│ 862 │ │ while True: │
│ 863 │ │ │ assert self._worker_group.state != WorkerState.INIT │
│ ❱ 864 │ │ │ time.sleep(monitor_interval) │
│ 865 │ │ │ run_result = self._monitor_workers(self._worker_group) │
│ 866 │ │ │ state = run_result.state │
│ 867 │ │ │ self._worker_group.state = state │
│ │
│ /home/hhh/.conda/envs/python38/lib/python3.8/site-packages/torch/di │
│ stributed/elastic/multiprocessing/api.py:62 in _terminate_process_handler │
│ │
│ 59 │ be terminated. │
│ 60 │ """ │
│ 61 │ sigval = signal.Signals(signum) │
│ ❱ 62 │ raise SignalException(f"Process {os.getpid()} got signal: {sigval} │
│ 63 │
│ 64 │
│ 65 def _get_kill_signal() -> signal.Signals: │
╰──────────────────────────────────────────────────────────────────────────────╯
SignalException: Process 2928711 got signal: 1
Have you solved this problem?
In my situation, it seems to happen when closing the terminal.
Have you solved this problem? In my situation, it seems to happen when closing the terminal.
Me too. How did you solve it?
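I finally solved this problem by using tmux. Closing the terminal sends SIGHUP (signal 1) to the launcher process, which then shuts down the workers, as the log above shows; running the job inside a tmux session keeps it attached to a terminal that stays alive after the original window is closed.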