starcoder
Hardware requirements for fine-tuning
What was the maximum sequence length used when fine-tuning StarCoder to produce StarChat Alpha? Was it done on a single GPU or on multiple cards? Please provide insights on the memory requirements for fine-tuning StarChat Alpha and StarCoder.
Relevant details are in the article: https://huggingface.co/blog/starchat-alpha
Also check out the finetune directory in the repo, especially the config.yaml: https://github.com/bigcode-project/starcoder/blob/main/chat/config.yaml
It is actually 1024. Can the team share a guide for fine-tuning at 8K context length?
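As a very rough illustration of why 8K is a much bigger ask than 1024 (my own back-of-envelope sketch, not an official guide): activation memory grows at least linearly with sequence length, and the attention score matrices grow quadratically with it unless a fused/flash attention kernel is used. Gradient checkpointing and flash attention shrink these numbers dramatically, so treat the constants below as crude assumptions:

```python
# Very rough activation-memory scaling with sequence length (illustrative only).
# Model dimensions are approximate StarCoder-15B values; the constants are
# assumptions, not measurements, and ignore checkpointing / flash attention.
HIDDEN, LAYERS, HEADS = 6144, 40, 48
BYTES = 2  # bf16

def rough_activation_gb(seq_len, micro_batch=1):
    # Linear term: a few hidden-sized activation tensors per layer.
    linear = LAYERS * micro_batch * seq_len * HIDDEN * 8 * BYTES
    # Quadratic term: attention score matrices, if no flash/fused attention.
    quadratic = LAYERS * micro_batch * HEADS * seq_len ** 2 * BYTES
    return (linear + quadratic) / 1024 ** 3

for seq in (1024, 2048, 8192):
    print(f"seq_len={seq}: ~{rough_activation_gb(seq):.0f} GB of activations per micro-batch")
```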
Thanks. The blog mentions that 8 × A100 (80 GB) GPUs were required for the whole training. We are trying to fine-tune the model. What would the approximate hardware requirements for fine-tuning be?
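For a rough sense of scale (a back-of-envelope sketch of my own, not a figure from the blog): full fine-tuning of the roughly 15.5B-parameter StarCoder in mixed precision with Adam needs about 16 bytes per parameter for weights, gradients and optimizer states alone, before any activations. ZeRO-3 shards those states across GPUs, which is roughly where the 8 × A100 80 GB figure comes from:

```python
# Back-of-envelope memory estimate for full fine-tuning (illustrative only).
# Assumes mixed-precision Adam: bf16 weights + bf16 grads (2 + 2 bytes/param)
# plus fp32 master weights and Adam m/v states (4 + 4 + 4 bytes/param).
# Activations are NOT included.
PARAMS = 15.5e9                        # approximate StarCoder parameter count
BYTES_PER_PARAM = 2 + 2 + 4 + 4 + 4    # = 16 bytes per parameter

total_gb = PARAMS * BYTES_PER_PARAM / 1024 ** 3
print(f"Model + optimizer states: ~{total_gb:.0f} GB")

for gpus in (1, 2, 8):
    # ZeRO-3 partitions parameters, gradients and optimizer states across ranks.
    print(f"ZeRO-3 over {gpus} GPU(s): ~{total_gb / gpus:.0f} GB per GPU (+ activations)")
```

Under these assumptions the states alone are ~230 GB, i.e. ~29 GB per GPU across 8 × 80 GB cards, but well over 40 GB per GPU on a 2-GPU setup, which is why parameter-efficient methods (LoRA/QLoRA) are usually used on smaller machines.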
@evangineer we are following the same blog you mentioned (https://huggingface.co/blog/starchat-alpha) on 2 × 40 GB GPUs. The DeepSpeed version is 0.9.3, the Transformers version is 4.31.0.dev0, and the Accelerate version is 0.21.0.dev0.
Command: torchrun --nproc_per_node=8 train.py config.yaml --deepspeed=deepspeed_z3_config_bf16.json
The error and traceback are as follows:

Traceback (most recent call last):
  File "/home/unnati/starchat_lora_deepspeed/starcoder/chat/train.py", line 353, in <module>
    main()
  File "/home/unnati/starchat_lora_deepspeed/starcoder/chat/train.py", line 254, in main
    use_cache=False if training_args.gradient_checkpointi…
  File "/home/unnati/.local/lib/python3.7/site-packages/transformers/models/auto/auto_factory.py", line 485, in from_pretrained
    pretrained_model_name_or_path, *model_args, c…
  File "/home/unnati/.local/lib/python3.7/site-packages/transformers/modeling_utils.py", line 2694, in from_pretrained
    init_contexts = [deepspeed.zero.Init(config_dict…
  File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 813, in __init__
    _ds_config = deepspeed.runtime.config.DeepSpeedConfi…
  File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/config.py", line 769, in __init__
    self._configure_train_batch_size()
  File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/config.py", line 942, in _configure_train_batch_size
    self._batch_assertion()
  File "/opt/conda/lib/python3.7/site-packages/deepspeed/runtime/config.py", line 891, in _batch_assertion
    assert train_batch == micro_batch * grad_acc * self.…
AssertionError: Check batch related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size 64 != 4 * 8 * 1

ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1971) of binary: /opt/conda/bin/python3.7
Traceback (most recent call last):
  File "/opt/conda/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/opt/conda/lib/python3.7/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch/distributed/run.py", line 762, in main
    run(args)
  File "/opt/conda/lib/python3.7/site-packages/torch/distributed/run.py", line 756, in run
    )(*cmd_args)
  File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 248, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
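For what it's worth, the assertion itself states the constraint DeepSpeed enforces: train_batch_size must equal micro_batch_per_gpu × gradient_accumulation_steps × world_size, and here 64 ≠ 4 × 8 × 1 because DeepSpeed saw a world size of 1. A small sanity check along these lines (my own sketch; the file name is taken from the command above, and the keys are the standard DeepSpeed batch-size fields) can surface the mismatch before launching:

```python
# Sanity-check DeepSpeed batch parameters against the intended world size (sketch).
import json

WORLD_SIZE = 2  # the number of processes torchrun will actually launch

with open("deepspeed_z3_config_bf16.json") as f:  # file name from the command above
    cfg = json.load(f)

train = cfg.get("train_batch_size")
micro = cfg.get("train_micro_batch_size_per_gpu")
accum = cfg.get("gradient_accumulation_steps", 1)

# Note: with the HF Trainer these fields may be set to "auto" and filled in later;
# this check only applies when they are concrete integers.
if all(isinstance(v, int) for v in (train, micro, accum)):
    expected = micro * accum * WORLD_SIZE
    if train != expected:
        print(f"Mismatch: train_batch_size={train} but "
              f"{micro} (micro) * {accum} (grad_acc) * {WORLD_SIZE} (world_size) = {expected}")
    else:
        print("Batch parameters are consistent.")
```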