[BUG]: RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same
🐛 Describe the bug
# Imports inferred from the snippet and the traceback below.
# training_args, vectorized_datasets and data_collator are defined earlier in the original script (not shown).
import lightning.pytorch as pl
from lightning.pytorch.callbacks import LearningRateMonitor
from lightning.pytorch.loggers import WandbLogger
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader
from transformers import AutoModelForSpeechSeq2Seq
from colossalai.nn.optimizer import CPUAdam


class lightningmodel(pl.LightningModule):
    def __init__(self):
        super().__init__()

    def configure_sharded_model(self):
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            "openai/whisper-tiny"
        )

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = CPUAdam(self.parameters(), lr=training_args.learning_rate)
        scheduler = ExponentialLR(optimizer=optimizer, gamma=0.99)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}


dloader = DataLoader(
    vectorized_datasets["train"],
    collate_fn=data_collator,
    batch_size=training_args.per_device_train_batch_size,
    num_workers=0,
)

_logger = WandbLogger(project="huggingface")
lr_monitor = LearningRateMonitor(logging_interval="step")
plmodel = lightningmodel()

trainer = pl.Trainer(
    max_epochs=1,
    logger=_logger,
    log_every_n_steps=1,
    callbacks=[lr_monitor],
    accelerator="gpu",
    devices=1,
    precision=16,
    strategy="colossalai",
)

trainer.fit(model=plmodel, train_dataloaders=dloader)
Logs
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[01/08/23 13:20:34] INFO colossalai - ProcessGroup - INFO: /home/aware/anaconda3/lib/python3.9/site-packages/colossalai/tensor/process_group.py:24 get
INFO colossalai - ProcessGroup - INFO: NCCL initialize ProcessGroup on [0]
searching chunk configuration is completed in 0.04 s.
used number: 36.01 MB, wasted number: 14.98 MB
total wasted percentage is 29.38%
Epoch 0: 0%| | 0/119752 [00:00<?, ?it/s]Traceback (most recent call last):
File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 373, in <module>
File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 369, in main
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 88, in launch
return function(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1103, in _run
results = self._run_stage()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1182, in _run_stage
self._run_train()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1205, in _run_train
self.fit_loop.run()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/epoch/training_epoch_loop.py", line 213, in advance
batch_output = self.batch_loop.run(kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(optimizers, kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 202, in advance
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 370, in _optimizer_step
self.trainer._call_lightning_module_hook(
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1347, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1708, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 169, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/colossalai.py", line 433, in optimizer_step
return self.precision_plugin.optimizer_step(
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/colossalai.py", line 73, in optimizer_step
closure_result = closure()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 149, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 135, in closure
step_output = self._step_fn()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 419, in _training_step
training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1485, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 351, in training_step
return self.model(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 263, in forward
outputs = self.module(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/overrides/base.py", line 98, in forward
output = self._forward_module.training_step(*inputs, **kwargs)
File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 319, in training_step
outputs = self(**batch)
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 316, in forward
return self.model(**inputs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1193, in forward
outputs = self.model(
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1046, in forward
encoder_outputs = self.encoder(
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 650, in forward
inputs_embeds = nn.functional.gelu(self.conv1(input_features))
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 307, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
return F.conv1d(input, weight, bias, self.stride,
File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/tensor/colo_parameter.py", line 70, in __torch_function__
ret = super().__torch_function__(func, types, args, kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/tensor/colo_tensor.py", line 183, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same
Environment
lightning installed from source; colossalai latest release; torch, CUDA, etc. from conda, with torch 1.12.
Pinging @ver217, who made the integration.
I guess the input should be torch.half? The ColossalAI strategy converts the model parameters to torch.half automatically.
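For reference, a minimal sketch of what that suggestion could look like, assuming the float32 tensor is the input_features produced by the collator: cast floating-point inputs to the current parameter dtype inside forward. This is illustrative only and not verified against the ColossalAI strategy internals; the follow-up below shows that feeding half-precision data can surface a different error.

import torch

class lightningmodel(pl.LightningModule):
    # ... other hooks unchanged ...

    def forward(self, **inputs):
        # The ColossalAI strategy may have converted the parameters to fp16, so
        # match floating-point inputs (e.g. input_features) to the parameter dtype.
        param_dtype = next(self.model.parameters()).dtype
        inputs = {
            k: v.to(param_dtype) if torch.is_tensor(v) and v.is_floating_point() else v
            for k, v in inputs.items()
        }
        return self.model(**inputs)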
With the data manually cast to half precision, this occurs:
Traceback (most recent call last):
File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 360, in <module>
main()
File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 353, in main
trainer.fit(model=plmodel, train_dataloaders=dloader)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 88, in launch
return function(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1103, in _run
results = self._run_stage()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1182, in _run_stage
self._run_train()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1205, in _run_train
self.fit_loop.run()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/epoch/training_epoch_loop.py", line 213, in advance
batch_output = self.batch_loop.run(kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(optimizers, kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 202, in advance
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 370, in _optimizer_step
self.trainer._call_lightning_module_hook(
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1347, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1708, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 169, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/colossalai.py", line 433, in optimizer_step
return self.precision_plugin.optimizer_step(
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/colossalai.py", line 73, in optimizer_step
closure_result = closure()
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 149, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 144, in closure
self._backward_fn(step_output.closure_loss)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 305, in backward_fn
self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1485, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 207, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, optimizer_idx, *args, **kwargs)
File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/colossalai.py", line 57, in backward
optimizer.backward(tensor)
File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/optimizer/zero_optimizer.py", line 219, in backward
self.module.backward(loss)
File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 305, in backward
self._post_backward()
File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 293, in _post_backward
assert self.chunk_manager.accessed_mem == 0
AssertionError
We have updated a lot. This issue was closed due to inactivity. Thanks.