ColossalAI icon indicating copy to clipboard operation
ColossalAI copied to clipboard

[BUG]: RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same

Open flozi00 opened this issue 2 years ago • 3 comments

🐛 Describe the bug

class lightningmodel(pl.LightningModule):
        def __init__(self):
            super().__init__()

        def configure_sharded_model(self):
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
                "openai/whisper-tiny"
            )

        def forward(self, **inputs):
            return self.model(**inputs)

        def training_step(self, batch, batch_idx):
            outputs = self(**batch)
            loss = outputs[0]
            self.log("train/loss", loss)

            return loss

        def configure_optimizers(self):
            optimizer = CPUAdam(self.parameters(), lr=training_args.learning_rate)
            scheduler = ExponentialLR(optimizer=optimizer, gamma=0.99)
            return {"optimizer": optimizer, "lr_scheduler": scheduler}

    dloader = DataLoader(
        vectorized_datasets["train"],
        collate_fn=data_collator,
        batch_size=training_args.per_device_train_batch_size,
        num_workers=0,
    )

    _logger = WandbLogger(project="huggingface")

    lr_monitor = LearningRateMonitor(logging_interval="step")

    plmodel = lightningmodel()
    trainer = pl.Trainer(
        max_epochs=1,
        logger=_logger,
        log_every_n_steps=1,
        callbacks=[lr_monitor],
        accelerator="gpu",
        devices=1,
        precision=16,
        strategy="colossalai",
    )
    trainer.fit(model=plmodel, train_dataloaders=dloader)

logs

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[01/08/23 13:20:34] INFO     colossalai - ProcessGroup - INFO: /home/aware/anaconda3/lib/python3.9/site-packages/colossalai/tensor/process_group.py:24 get                                                      
                    INFO     colossalai - ProcessGroup - INFO: NCCL initialize ProcessGroup on [0]                                                                                                              
searching chunk configuration is completed in 0.04 s.
used number: 36.01 MB, wasted number: 14.98 MB
total wasted percentage is 29.38%
Epoch 0:   0%|                                                                                                                                                                       | 0/119752 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 373, in <module>
  File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 369, in main
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 608, in fit
    call._call_and_handle_interrupt(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 36, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 88, in launch
    return function(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 650, in _fit_impl
    self._run(model, ckpt_path=self.ckpt_path)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1103, in _run
    results = self._run_stage()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1182, in _run_stage
    self._run_train()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1205, in _run_train
    self.fit_loop.run()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 267, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/epoch/training_epoch_loop.py", line 213, in advance
    batch_output = self.batch_loop.run(kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/batch/training_batch_loop.py", line 88, in advance
    outputs = self.optimizer_loop.run(optimizers, kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 202, in advance
    result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 370, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1347, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1708, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 169, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/colossalai.py", line 433, in optimizer_step
    return self.precision_plugin.optimizer_step(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/colossalai.py", line 73, in optimizer_step
    closure_result = closure()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 149, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 135, in closure
    step_output = self._step_fn()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 419, in _training_step
    training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1485, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/ddp.py", line 351, in training_step
    return self.model(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 263, in forward
    outputs = self.module(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/overrides/base.py", line 98, in forward
    output = self._forward_module.training_step(*inputs, **kwargs)
  File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 319, in training_step
    outputs = self(**batch)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 316, in forward
    return self.model(**inputs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1193, in forward
    outputs = self.model(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 1046, in forward
    encoder_outputs = self.encoder(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/transformers/models/whisper/modeling_whisper.py", line 650, in forward
    inputs_embeds = nn.functional.gelu(self.conv1(input_features))
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 307, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 303, in _conv_forward
    return F.conv1d(input, weight, bias, self.stride,
  File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/tensor/colo_parameter.py", line 70, in __torch_function__
    ret = super().__torch_function__(func, types, args, kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/tensor/colo_tensor.py", line 183, in __torch_function__
    ret = func(*args, **kwargs)
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same

Environment

lightning from source colossalai latest release torch, cuda etc from conda with torch 1.12

flozi00 avatar Jan 08 '23 13:01 flozi00

Pinging @ver217 who made the integration

flozi00 avatar Jan 08 '23 13:01 flozi00

I guess the input should be torch.half? The ColossalAI strategy converts model parameters to torch.half automatically.

feifeibear avatar Jan 10 '23 08:01 feifeibear

With the input data manually converted to half precision, this error occurs instead:

?it/s]Traceback (most recent call last):
  File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 360, in <module>
    main()
  File "/mnt/nvme0/asr/run_speech_recognition_ctc.py", line 353, in main
    trainer.fit(model=plmodel, train_dataloaders=dloader)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 608, in fit
    call._call_and_handle_interrupt(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/call.py", line 36, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 88, in launch
    return function(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 650, in _fit_impl
    self._run(model, ckpt_path=self.ckpt_path)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1103, in _run
    results = self._run_stage()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1182, in _run_stage
    self._run_train()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1205, in _run_train
    self.fit_loop.run()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py", line 267, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/epoch/training_epoch_loop.py", line 213, in advance
    batch_output = self.batch_loop.run(kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/batch/training_batch_loop.py", line 88, in advance
    outputs = self.optimizer_loop.run(optimizers, kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/loop.py", line 199, in run
    self.advance(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 202, in advance
    result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 370, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1347, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/module.py", line 1708, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/core/optimizer.py", line 169, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/colossalai.py", line 433, in optimizer_step
    return self.precision_plugin.optimizer_step(
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/colossalai.py", line 73, in optimizer_step
    closure_result = closure()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 149, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 144, in closure
    self._backward_fn(step_output.closure_loss)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/loops/optimization/optimizer_loop.py", line 305, in backward_fn
    self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/trainer/trainer.py", line 1485, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/strategies/strategy.py", line 207, in backward
    self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, optimizer_idx, *args, **kwargs)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/lightning/pytorch/plugins/precision/colossalai.py", line 57, in backward
    optimizer.backward(tensor)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/optimizer/zero_optimizer.py", line 219, in backward
    self.module.backward(loss)
  File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 305, in backward
    self._post_backward()
  File "/home/aware/anaconda3/lib/python3.9/site-packages/colossalai/nn/parallel/data_parallel.py", line 293, in _post_backward
    assert self.chunk_manager.accessed_mem == 0
AssertionError

flozi00 avatar Jan 10 '23 10:01 flozi00

We have updated a lot. This issue was closed due to inactivity. Thanks.

binmakeswell avatar Apr 14 '23 09:04 binmakeswell