DeepSpeed
DeepSpeed copied to clipboard
Error when comparing full_best_metric_val and fast_best_metric_val in Autotuning
https://github.com/microsoft/DeepSpeed/blob/4e886f0568832d292183926bcc1a9105def25f2c/deepspeed/autotuning/autotuner.py#L714-L726 During fast-mode tuning, `fast_best_metric_val` can be `None` if every experiment runs out of memory (OOM), while the full ("slow") mode still produces some results. In that case, the comparison between `full_best_metric_val` and `fast_best_metric_val` on the following line can fail with a `TypeError`, as shown in the traceback below: https://github.com/microsoft/DeepSpeed/blob/4e886f0568832d292183926bcc1a9105def25f2c/deepspeed/autotuning/autotuner.py#L631
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/tiger/.local/bin/deepspeed:7 in <module> │
│ │
│ 4 __import__('pkg_resources').require('deepspeed==0.9.3+d10b8ca0') │
│ 5 __file__ = ' DeepSpeed/bin/deepspeed' │
│ 6 with open(__file__) as f: │
│ ❱ 7 │ exec(compile(f.read(), __file__, 'exec')) │
│ 8 │
│ │
│ DeepSpeed/bin/deepspeed:6 in <module> │
│ │
│ 3 from deepspeed.launcher.runner import main │
│ 4 │
│ 5 if __name__ == '__main__': │
│ ❱ 6 │ main() │
│ 7 │
│ │
│ DeepSpeed/deepspeed/launcher/runner.py:450 in main │
│ │
│ 447 │ │ logger.info(f"Using IP address of {args.master_addr} for node {first_host}") │
│ 448 │ │
│ 449 │ if args.autotuning != "": │
│ ❱ 450 │ │ run_autotuning(args, active_resources) │
│ 451 │ │ return │
│ 452 │ │
│ 453 │ if args.num_nodes > 0: │
│ │
│ DeepSpeed/deepspeed/launcher/runner.py:352 in │
│ run_autotuning │
│ │
│ 349 │ tuner = Autotuner(args, active_resources) │
│ 350 │ logger.info("[Start] Running autotuning") │
│ 351 │ │
│ ❱ 352 │ tuner.tune() │
│ 353 │ tuner.print_tuning_results() │
│ 354 │ │
│ 355 │ logger.info("[End] Running autotuning") │
│ │
│ DeepSpeed/deepspeed/autotuning/autotuner.py:467 in tune │
│ │
│ 464 │ │ │ │ logger.info( │
│ 465 │ │ │ │ │ f"The model might be runable with ZERO 1 (which requires at least {m │
│ 466 │ │ │ │ ) │
│ ❱ 467 │ │ │ │ next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING │
│ 468 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ prev_max_mbs=m │
│ 469 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ prev_best_mbs= │
│ 470 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ prev_best_metr │
│ │
│ DeepSpeed/deepspeed/autotuning/autotuner.py:631 in │
│ tune_space │
│ │
│ 628 │ │ full_best_record = self.get_best_space_record(tuning_space_name) │
│ 629 │ │ full_best_metric_val = full_best_record[1] if full_best_record else -1 │
│ 630 │ │ │
│ ❱ 631 │ │ if full_best_metric_val > fast_best_metric_val: │
│ 632 │ │ │ best_metric_val = full_best_metric_val │
│ 633 │ │ │ best_mbs = full_best_record[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] if │
│ 634 │ │ else: │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
TypeError: '>' not supported between instances of 'NoneType' and 'float'