german-gpt2

RuntimeError: CUDA error: device-side assert triggered

Open julfr opened this issue 2 years ago • 0 comments

Hi, I first tried fine-tuning with "anonymous-german-nlp/german-gpt2", and it worked well. But now with "dbmdz/german-gpt2" (and, by the way, the same with "stefan-it/german-gpt2-larger") I get a CUDA error:

  Num examples = 104
  Num Epochs = 30
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1560
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-13-3435b262f1ae> in <module>
----> 1 trainer.train()

~/.local/lib/python3.8/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1496             self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1497         )
-> 1498         return inner_training_loop(
   1499             args=args,
   1500             resume_from_checkpoint=resume_from_checkpoint,

~/.local/lib/python3.8/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1738                         tr_loss_step = self.training_step(model, inputs)
   1739                 else:
-> 1740                     tr_loss_step = self.training_step(model, inputs)
   1741 
   1742                 if (

~/.local/lib/python3.8/site-packages/transformers/trainer.py in training_step(self, model, inputs)
   2468 
   2469         with self.compute_loss_context_manager():
-> 2470             loss = self.compute_loss(model, inputs)
   2471 
   2472         if self.args.n_gpu > 1:

~/.local/lib/python3.8/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
   2500         else:
   2501             labels = None
-> 2502         outputs = model(**inputs)
   2503         # Save past state if it exists
   2504         # TODO: this needs to be fixed and made cleaner later.

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1056         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
   1057 
-> 1058         transformer_outputs = self.transformer(
   1059             input_ids,
   1060             past_key_values=past_key_values,

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.8/site-packages/transformers/models/gpt2/modeling_gpt2.py in forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
    842 
    843         if inputs_embeds is None:
--> 844             inputs_embeds = self.wte(input_ids)
    845         position_embeds = self.wpe(position_ids)
    846         hidden_states = inputs_embeds + position_embeds

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

~/.local/lib/python3.8/site-packages/torch/nn/modules/sparse.py in forward(self, input)
    156 
    157     def forward(self, input: Tensor) -> Tensor:
--> 158         return F.embedding(
    159             input, self.weight, self.padding_idx, self.max_norm,
    160             self.norm_type, self.scale_grad_by_freq, self.sparse)

~/.local/lib/python3.8/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2042         # remove once script supports set_grad_enabled
   2043         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2044     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   2045 
   2046 

RuntimeError: CUDA error: device-side assert triggered
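The assert is raised inside the wte embedding lookup (F.embedding), which as far as I understand usually means an input id is outside the model's embedding table, for example when a pad token is added to the tokenizer without resizing the model's embeddings. A minimal sketch of the kind of check that could narrow this down (just an illustration with the model name from above, not my actual training script):

    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Load the model/tokenizer pair mentioned above
    tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
    model = AutoModelForCausalLM.from_pretrained("dbmdz/german-gpt2")

    # The embedding table only covers ids in [0, config.vocab_size);
    # any larger id (e.g. a pad token added after loading the model)
    # triggers exactly this device-side assert in torch.embedding on GPU.
    print("tokenizer size:  ", len(tokenizer))
    print("model vocab size:", model.config.vocab_size)

    # If the tokenizer has more ids than the embedding table,
    # resizing the embeddings before training is one common fix:
    if len(tokenizer) > model.config.vocab_size:
        model.resize_token_embeddings(len(tokenizer))

Running the training on CPU or with CUDA_LAUNCH_BLOCKING=1 would probably also give a clearer error message pointing at the offending index.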

Please advise what the problem could be. Thanks a lot in advance.

julfr · Sep 26 '22, 15:09