[BUG] RuntimeError: Tensors must be contiguous error while finetuning with deepspeed.
I am trying to fine-tune "EleutherAI/gpt-neo-1.3B" for causal LM on Google Colab. Without DeepSpeed it gives an out-of-memory error. While looking into what I could do, I found DeepSpeed. I added deepspeed='ds_config.json' to my training arguments in the Jupyter notebook and used the "ds_config_zero2.json" configuration from the official page. After that, I started to get this error. I am running everything in the notebook, not as a command.
To Reproduce: try fine-tuning gpt-neo (a minimal sketch of my setup is below).
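A minimal sketch of the notebook setup; the toy dataset and hyperparameters are placeholders, only the model name and the deepspeed='ds_config.json' argument match what I described above:

import os

from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# One-process environment so the DeepSpeed integration can run from a notebook.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Toy dataset, just to keep the sketch self-contained.
enc = tokenizer(["hello world"] * 8, padding="max_length", max_length=32, truncation=True)
enc["labels"] = enc["input_ids"].copy()
train_dataset = Dataset.from_dict(dict(enc))

training_args = TrainingArguments(
    output_dir="gpt-neo-finetuned",
    per_device_train_batch_size=1,
    fp16=True,
    deepspeed="ds_config.json",  # the ZeRO-2 config copied from the official page
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()  # the RuntimeError reported below is raised during DeepSpeed initialization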
This is the full error:
The following columns in the training set don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: text. If text are not expected by `GPTNeoForCausalLM.forward`, you can safely ignore this message.
[2023-01-23 12:41:08,453] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.8.0, git-hash=unknown, git-branch=unknown
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
[<ipython-input-21-3435b262f1ae>](https://localhost:8080/#) in <module>
----> 1 trainer.train()
10 frames
[/usr/local/lib/python3.8/dist-packages/transformers/trainer.py](https://localhost:8080/#) in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1525 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1526 )
-> 1527 return inner_training_loop(
1528 args=args,
1529 resume_from_checkpoint=resume_from_checkpoint,
[/usr/local/lib/python3.8/dist-packages/transformers/trainer.py](https://localhost:8080/#) in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1594 )
1595 if args.deepspeed:
-> 1596 deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
1597 self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint
1598 )
[/usr/local/lib/python3.8/dist-packages/transformers/deepspeed.py](https://localhost:8080/#) in deepspeed_init(trainer, num_training_steps, resume_from_checkpoint, inference)
342 )
343
--> 344 deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
345
346 if resume_from_checkpoint is not None:
[/usr/local/lib/python3.8/dist-packages/deepspeed/__init__.py](https://localhost:8080/#) in initialize(args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_params)
123
124 if not isinstance(model, PipelineModule):
--> 125 engine = DeepSpeedEngine(args=args,
126 model=model,
127 optimizer=optimizer,
[/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py](https://localhost:8080/#) in __init__(self, args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_params, dont_change_device)
299
300 # Configure distributed model
--> 301 self._configure_distributed_model(model)
302
303 self._get_model_parameters()
[/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py](https://localhost:8080/#) in _configure_distributed_model(self, model)
1185
1186 if not self.amp_enabled():
-> 1187 self._broadcast_model()
1188
1189 # check if parameters are duplicated in optimizer param_groups
[/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py](https://localhost:8080/#) in _broadcast_model(self)
1100 else:
1101 if torch.is_tensor(p) and is_replicated(p):
-> 1102 dist.broadcast(p,
1103 groups._get_broadcast_src_rank(),
1104 group=self.data_parallel_group)
[/usr/local/lib/python3.8/dist-packages/deepspeed/comm/comm.py](https://localhost:8080/#) in log_wrapper(*args, **kwargs)
125 # Return the op, then stop the op's timer
126 try:
--> 127 return func(*args, **kwargs)
128 finally:
129 if comms_logger.enabled:
[/usr/local/lib/python3.8/dist-packages/deepspeed/comm/comm.py](https://localhost:8080/#) in broadcast(tensor, src, group, async_op, prof, log_name, debug)
230 debug=get_caller_func()):
231 global cdb
--> 232 return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
233
234
[/usr/local/lib/python3.8/dist-packages/deepspeed/comm/torch.py](https://localhost:8080/#) in broadcast(self, tensor, src, group, async_op)
68
69 def broadcast(self, tensor, src, group=None, async_op=False):
---> 70 return torch.distributed.broadcast(tensor=tensor,
71 src=src,
72 group=group,
[/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py](https://localhost:8080/#) in broadcast(tensor, src, group, async_op)
1402 group_src_rank = get_group_rank(group, src)
1403 opts.rootRank = group_src_rank
-> 1404 work = group.broadcast([tensor], opts)
1405 if async_op:
1406 return work
RuntimeError: Tensors must be contiguous
ds_report output
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
runtime if needed. Op compatibility means that your system
meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
[WARNING] please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/usr/local/lib/python3.8/dist-packages/torch']
torch version .................... 1.13.1+cu116
deepspeed install path ........... ['/usr/local/lib/python3.8/dist-packages/deepspeed']
deepspeed info ................... 0.8.0, unknown, unknown
torch cuda version ............... 11.6
torch hip version ................ None
nvcc version ..................... 11.2
deepspeed wheel compiled w. ...... torch 1.13, cuda 11.6
System info: Google Colab
Hi @FahriBilici, thanks for raising this issue.
To reproduce the error, could you also provide the training script you ran and the command line you used to launch it (e.g., the deepspeed or pytorch launcher)?
Closed for now; feel free to reopen if needed.
I am using the Hugging Face Trainer class for fine-tuning. It runs in a Jupyter notebook, not as a standalone training script. How should I share my notebook?
@GuanhuaWang I got the same problem when fine-tuning "EleutherAI/gpt-j-6B" using LoRA on 8×2080 Ti, with exactly the same error log as above. To reproduce the error: clone this repo: https://github.com/CarperAI/trlx and modify the script examples/summarize_rlhf/sft/train_gptj_summarize.py as follows:
import random
import os
import evaluate
import numpy as np
import torch
import torch.nn as nn
from peft import LoraConfig, get_peft_model
from summarize_dataset import TLDRDataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)


def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)


if __name__ == "__main__":
    output_dir = "gptj-supervised-summarize-checkpoint"
    train_batch_size = 16
    gradient_accumulation_steps = 1
    learning_rate = 1e-5
    eval_batch_size = 1
    eval_steps = 500
    max_input_length = 550
    save_steps = 1000
    num_train_epochs = 5
    random.seed(42)
    os.environ["WANDB_DISABLED"] = "true"

    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", use_cache=False, load_in_8bit=True, device_map='auto')
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.end_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = model.config.eos_token_id

    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)

    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    class CastOutputToFloat(nn.Sequential):
        def forward(self, x): return super().forward(x).to(torch.float32)

    model.lm_head = CastOutputToFloat(model.lm_head)

    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, config)

    # Set up the datasets
    data_path = "CarperAI/openai_summarize_tldr"
    train_dataset = TLDRDataset(
        data_path,
        tokenizer,
        "train",
        max_length=max_input_length,
    )
    dev_dataset = TLDRDataset(
        data_path,
        tokenizer,
        "valid",
        max_length=max_input_length,
    )

    # Set up the metric
    rouge = evaluate.load("rouge")

    def compute_metrics(eval_preds):
        labels_ids = eval_preds.label_ids
        pred_ids = eval_preds.predictions
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        result = rouge.compute(predictions=pred_str, references=label_str)
        return result

    # Create a preprocessing function to extract out the proper logits from the model output
    def preprocess_logits_for_metrics(logits, labels):
        if isinstance(logits, tuple):
            logits = logits[0]
        return logits.argmax(dim=-1)

    # Prepare the trainer and start training
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="steps",
        eval_accumulation_steps=1,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        gradient_checkpointing=True,
        half_precision_backend="auto",
        fp16=True,
        adam_beta1=0.9,
        adam_beta2=0.95,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        warmup_steps=100,
        eval_steps=eval_steps,
        save_steps=save_steps,
        load_best_model_at_end=True,
        logging_steps=50,
        deepspeed="examples/summarize_rlhf/sft/ds_config_gptj.json",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
        data_collator=default_data_collator,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )
    trainer.train()
    trainer.save_model(output_dir)
and run:
deepspeed examples/summarize_rlhf/sft/train_gptj_summarize.py
Hi, I first raised this issue on the pytorch repo: https://github.com/pytorch/pytorch/issues/94907#issue-1586135480.
It was suggested that DeepSpeed should ensure that the tensors it passes to torch.distributed are contiguous.
I fixed the issue by manually changing the following line in distributed_c10d.py:
https://github.com/pytorch/pytorch/blob/3ace14eb8b5e437322acf962d2f170561fd4e3bc/torch/distributed/distributed_c10d.py#L1555
I basically force the tensors to be contiguous, e.g.:
work = group.broadcast([tensor.contiguous()], opts)
However, this may result in some unexpected behavior, since calling .contiguous() on a non-contiguous tensor returns a new copy rather than the original tensor.
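If editing the installed distributed_c10d.py is undesirable, roughly the same workaround can be applied as a monkey patch before deepspeed.initialize() / trainer.train() is called. This is only a sketch, not an official fix; it assumes DeepSpeed looks up torch.distributed.broadcast at call time, and the copy-back step exists precisely because .contiguous() returns a new tensor for non-contiguous inputs (the "unexpected behavior" mentioned above):

import torch
import torch.distributed

_orig_broadcast = torch.distributed.broadcast

def _contiguous_broadcast(tensor, src, group=None, async_op=False):
    # Fast path: contiguous tensors go straight to the original broadcast.
    if tensor.is_contiguous():
        return _orig_broadcast(tensor, src, group=group, async_op=async_op)
    # Broadcast a contiguous copy, then copy the result back into the original
    # tensor so that non-source ranks actually receive the update (the copy
    # would otherwise be discarded). Non-contiguous tensors are handled
    # synchronously here, even if async_op=True was requested.
    tmp = tensor.contiguous()
    work = _orig_broadcast(tmp, src, group=group, async_op=async_op)
    if async_op and work is not None:
        work.wait()
    tensor.copy_(tmp)
    return work

torch.distributed.broadcast = _contiguous_broadcast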
I'm experiencing the same issue.
I'm experiencing the same issue.
Same issue with GPT models on HF.
Hello, I just faced the same issue.
I found out that the problem lies in the device_map argument of Hugging Face's AutoModel... classes.
Changing the argument from device_map="auto" to device_map=None fixed the issue for me!
I hope this helps!
Hi, any updates on this? I'm using the DeepSpeed integration in PyTorch Lightning and haven't been able to resolve this except for the hack I've mentioned above:
work = group.broadcast([tensor.contiguous()], opts)
deepspeed="examples/summarize_rlhf/sft/ds_config_gptj.json",
@chenmingjiong Did you modify the json file? If so, please paste the changes here.
Hello, I just faced the same issue. I found out that the problem lies in the device_map argument of Hugging Face's AutoModel... classes. Changing the argument from device_map="auto" to device_map=None fixed the issue for me! I hope this helps!
@thinhlpg I think this loads the model to CPU instead of CUDA.
@chenmingjiong Did you modify the json file? If so, please paste the changes here.
No. I use the original file.
Same issue. The same code works for some models, but raises this error for others.
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /export/home/project/codeai/codeai_autocomplete/finetune/train_deepspeed.py: │
│ 339 in <module> │
│ │
│ 336 │
│ 337 │
│ 338 if __name__ == '__main__': │
│ ❱ 339 │ main() │
│ │
│ /export/home/project/codeai/codeai_autocomplete/finetune/train_deepspeed.py: │
│ 335 in main │
│ │
│ 332 │ copy_source(__file__, args.output_dir) │
│ 333 │ │
│ 334 │ # train │
│ ❱ 335 │ train(args=args) │
│ 336 │
│ 337 │
│ 338 if __name__ == '__main__': │
│ │
│ /export/home/project/codeai/codeai_autocomplete/finetune/train_deepspeed.py: │
│ 210 in train │
│ │
│ 207 │ ## deepspeed │
│ 208 │ with print_time('Initializing deepspeed'): │
│ 209 │ │ model_parameters = list(filter(lambda p: p.requires_grad, mode │
│ ❱ 210 │ │ model_engine, optimizer, _, _ = deepspeed.initialize(config=ar │
│ 211 │ │ torch.cuda.empty_cache() │
│ 212 │ │
│ 213 │ ####################### │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/__init__.py:165 in initialize │
│ │
│ 162 │ │ │ │ │ │ │ │ │ │ config=config, │
│ 163 │ │ │ │ │ │ │ │ │ │ config_class=config_class) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ engine = DeepSpeedEngine(args=args, │
│ 166 │ │ │ │ │ │ │ │ │ model=model, │
│ 167 │ │ │ │ │ │ │ │ │ optimizer=optimizer, │
│ 168 │ │ │ │ │ │ │ │ │ model_parameters=model_parameters │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/runtime/engine.py:266 in __init__ │
│ │
│ 263 │ │ self.pipeline_parallelism = isinstance(model, PipelineModule) │
│ 264 │ │ │
│ 265 │ │ # Configure distributed model │
│ ❱ 266 │ │ self._configure_distributed_model(model) │
│ 267 │ │ │
│ 268 │ │ self._get_model_parameters() │
│ 269 │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/runtime/engine.py:1073 in _configure_distributed_model │
│ │
│ 1070 │ │ self.expert_data_parallel_group = groups._get_expert_data_par │
│ 1071 │ │ │
│ 1072 │ │ if not self.amp_enabled(): │
│ ❱ 1073 │ │ │ self._broadcast_model() │
│ 1074 │ │
│ 1075 │ # check if parameters are duplicated in optimizer param_groups │
│ 1076 │ def _check_for_duplicates(self, optimizer): │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/runtime/engine.py:1003 in _broadcast_model │
│ │
│ 1000 │ │ │ │ │ │ │ │ group=self.expert_data_parallel_gr │
│ 1001 │ │ │ else: │
│ 1002 │ │ │ │ if torch.is_tensor(p) and is_replicated(p): │
│ ❱ 1003 │ │ │ │ │ dist.broadcast(p, groups._get_broadcast_src_rank( │
│ 1004 │ │
│ 1005 │ @staticmethod │
│ 1006 │ def __check_params(model: Module, dtype: torch.dtype) -> None: │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/comm/comm.py:120 in log_wrapper │
│ │
│ 117 │ │ │ │ timers(log_name).start() │
│ 118 │ │ # Return the op, then stop the op's timer │
│ 119 │ │ try: │
│ ❱ 120 │ │ │ return func(*args, **kwargs) │
│ 121 │ │ finally: │
│ 122 │ │ │ if comms_logger.enabled: │
│ 123 │ │ │ │ # Need to make op blocking for accurate logging │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/comm/comm.py:217 in broadcast │
│ │
│ 214 @timed_op │
│ 215 def broadcast(tensor, src, group=None, async_op=False, prof=False, log │
│ 216 │ global cdb │
│ ❱ 217 │ return cdb.broadcast(tensor=tensor, src=src, group=group, async_op │
│ 218 │
│ 219 │
│ 220 @timed_op │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/comm/torch.py:118 in broadcast │
│ │
│ 115 │ │ │ │ │ │ │ │ │ │ │ │ async_op=async_op) │
│ 116 │ │
│ 117 │ def broadcast(self, tensor, src, group=None, async_op=False): │
│ ❱ 118 │ │ return torch.distributed.broadcast(tensor=tensor, src=src, gro │
│ 119 │ │
│ 120 │ def all_gather(self, tensor_list, tensor, group=None, async_op=Fal │
│ 121 │ │ return torch.distributed.all_gather(tensor_list=tensor_list, t │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ torch/distributed/distributed_c10d.py:1451 in wrapper │
│ │
│ 1448 │ @functools.wraps(func) │
│ 1449 │ def wrapper(*args, **kwargs): │
│ 1450 │ │ try: │
│ ❱ 1451 │ │ │ return func(*args, **kwargs) │
│ 1452 │ │ except Exception as error: │
│ 1453 │ │ │ if is_initialized(): │
│ 1454 │ │ │ │ error_msg_dict = { │
│ │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ torch/distributed/distributed_c10d.py:1570 in broadcast │
│ │
│ 1567 │ else: │
│ 1568 │ │ group_src_rank = get_group_rank(group, src) │
│ 1569 │ │ opts.rootRank = group_src_rank │
│ ❱ 1570 │ │ work = group.broadcast([tensor], opts) │
│ 1571 │ if async_op: │
│ 1572 │ │ return work │
│ 1573 │ else: │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Tensors must be contiguous
In line with @FarzanT's comment, you may try making this change (comm.py L214) within DeepSpeed to minimize the risk.
It's working, but I need some time to check whether the learning curve makes sense.
# deepspeed/comm/comm.py
@timed_op
def broadcast(tensor, src, group=None, async_op=False, prof=False, log_name='broadcast', debug=get_caller_func()):
    global cdb
    if not tensor.is_contiguous():
        tensor = tensor.contiguous()
    return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
Same issue. Same code works for some models, but reports this error for some others.
Hi. What models does this end up working for, based on your experience?
@KeeratKG Ah, sorry, I don't recall; it should have been either huggyllama/llama-7b or Salesforce/codegen2-7B.
Facing the same problem:
Traceback (most recent call last):
File "fine_tune.py", line 191, in <module>
trainer.train()
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/transformers/trainer.py", line 1662, in train
return inner_training_loop(
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/transformers/trainer.py", line 1731, in _inner_training_loop
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init
deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/__init__.py", line 171, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 268, in __init__
self._configure_distributed_model(model)
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1109, in _configure_distributed_model
self._broadcast_model()
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1033, in _broadcast_model
dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.data_parallel_group)
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper
return func(*args, **kwargs)
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 216, in broadcast
return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/comm/torch.py", line 188, in broadcast
return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1404, in broadcast
work = group.broadcast([tensor], opts)
RuntimeError: Tensors must be contiguous
Facing the same issue @pacman100.
if not tensor.is_contiguous(): tensor = tensor.contiguous()
This is my accelerate config:
compute_environment: LOCAL_MACHINE
deepspeed_config:
  offload_optimizer_device: cpu
  gradient_clipping: 1.0
  zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
fsdp_config: {}
machine_rank: 0
main_training_function: main
main_process_port: 20680
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false
@pacman100 I have tried this solution, but I got another error:
09/26/2023 18:32:51 - INFO - root - Train ataset length :12008
09/26/2023 18:32:51 - INFO - __main__ - Sample 10476 of the training set: {'input_ids': [32100, 125,
5, 651, 5, 1]}.
09/26/2023 18:32:51 - INFO - accelerate.accelerator - Updating DeepSpeed's gradient accumulation step
piled!
Building extension module cpu_adam...
t-hash=unknown, git-branch=unknown
09/26/2023 18:33:01 - INFO - torch.distributed.distributed_c10d - Added key: store_based_barrier_key:
main()
1213, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1467, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 337, in __init__
self.flatten_dense_tensors_aligned(
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 889, in flatten_dense_tensors_aligned
return self.flatten(align_dense_tensors(tensor_list, alignment))
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/_utils.py", line 459, in _flatten_dense_tensors
return torch._C._nn.flatten_dense_tensors(tensors)
RuntimeError: torch.cat(): expected a non-empty list of Tensors
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3608256) of binary: /mnt/ssd/arij/NeurIPS/NeurIPSS/bin/python3.9
Traceback (most recent call last):
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/bin/accelerate", line 8, in <module>
sys.exit(main())
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
args.func(args)
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/accelerate/commands/launch.py", line 964, in launch_command
deepspeed_launcher(args)
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher
distrib_run.run(args)
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train_seqtoseq_dolly_PEFT_24_9_2023.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-09-26_18:33:06
host : srv-dgx02
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 3608256)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
ds_report
Facing the same issue.
My code works fine with flan-t5 but raises this error with t5-base:
Traceback (most recent call last):
File "run_question_answering.py", line 898, in <module>
main()
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "run_question_answering.py", line 834, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/trainer.py", line 1539, in train
return inner_training_loop(
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/trainer.py", line 1656, in _inner_training_loop
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/accelerate/accelerator.py", line 1198, in prepare
result = self._prepare_deepspeed(*args)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/accelerate/accelerator.py", line 1537, in _prepare_deepspeed
engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/__init__.py", line 171, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 268, in __init__
self._configure_distributed_model(model)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1109, in _configure_distributed_model
self._broadcast_model()
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1033, in _broadcast_model
dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.data_parallel_group)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper
return func(*args, **kwargs)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 216, in broadcast
return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/comm/torch.py", line 188, in broadcast
return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper
return func(*args, **kwargs)
File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1570, in broadcast
work = group.broadcast([tensor], opts)
facing the same issue.
my code works fine with flan-t5 but raises this error with t5-base
I tried to run with torchrun instead of the deepspeed launcher, and things went fine.
Hello, I just faced the same issue. I found out that the problem lies in the device_map argument of Hugging Face's AutoModel... classes. Changing the argument from device_map="auto" to device_map=None fixed the issue for me! I hope this helps!
device_map="auto" is not compatible with DeepSpeed, thus it is necessary to remove this option or change it into None. However, this is totally irrelevant to the Tensor must be contiguous error. Using device_map="auto" option causes Expected all tensors to be on the same device... error. (Github)
Facing the same issue.
Traceback (most recent call last):
File "/home/nlp/zgy/VLM/src/train/train_mem.py", line 12, in <module>
train()
File "/home/nlp/zgy/VLM/src/train/train.py", line 395, in train
trainer.train()
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/trainer.py", line 1675, in _inner_training_loop
model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1209, in prepare
result = self._prepare_deepspeed(*args)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1582, in _prepare_deepspeed
engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/__init__.py", line 171, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 263, in __init__
self._configure_distributed_model(model)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1148, in _configure_distributed_model
self._broadcast_model()
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1068, in _broadcast_model
dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
return func(*args, **kwargs)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 224, in broadcast
return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 196, in broadcast
return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1910, in broadcast
work = group.broadcast([tensor], opts)
RuntimeError: Tensors must be contiguous
I have tried this solution and this solution; neither worked.