The current code does not support a multi-GPU environment, so I parallelized the model with torch.nn.DataParallel(model).cuda(). I get the error below when I try to train:

@DhavalTaunk08
Source:
I took the code from this notebook: sentiment_analysis_using_roberta.ipynb
Code changes made:
Original (from the notebook):
model = RobertaClass()
model.to(device)
Changed to:
model = RobertaClass()
model = torch.nn.DataParallel(model).cuda()  # <-- I added this line for parallelization
model.to(device)
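For comparison, here is a minimal sketch of the wrapping order usually shown in the PyTorch docs, assuming RobertaClass and device are defined as in the notebook (wrapping with .cuda() and then calling .to(device) on the wrapper, as above, should behave roughly the same, so the order alone may not be the cause):

import torch

model = RobertaClass()
model.to(device)  # move parameters to the primary GPU first
if torch.cuda.device_count() > 1:
    # DataParallel replicates the module on each visible GPU and
    # splits every input tensor along dim 0 at forward time
    model = torch.nn.DataParallel(model)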
Code where the error appears:
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch)
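For context, this is roughly what the failing forward step inside train() looks like under DataParallel (a sketch reconstructed from the traceback; the batch field names ids, mask, token_type_ids, targets are assumed from the notebook):

ids = data['ids'].to(device, dtype=torch.long)
mask = data['mask'].to(device, dtype=torch.long)
token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
targets = data['targets'].to(device, dtype=torch.long)

# the wrapper scatters each input tensor along dim 0 across the GPUs,
# runs one replica per chunk, and gathers the outputs on device 0
outputs = model(ids, mask, token_type_ids)
loss = loss_function(outputs, targets)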
Error Found:
RuntimeError Traceback (most recent call last)
in ()
1 EPOCHS = 3
2 for epoch in range(EPOCHS):
----> 3 train(epoch)
in train(epoch)
13 targets = data['targets'].to(device, dtype = torch.long)
14
---> 15 outputs = model(ids, mask, token_type_ids)
16 loss = loss_function(outputs, targets)
17 tr_loss += loss.item()
/usr/local/lib64/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
/usr/local/lib64/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
153 return self.module(*inputs[0], **kwargs[0])
154 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 155 outputs = self.parallel_apply(replicas, inputs, kwargs)
156 return self.gather(outputs, self.output_device)
157
/usr/local/lib64/python3.6/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
163
164 def parallel_apply(self, replicas, inputs, kwargs):
--> 165 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
166
167 def gather(self, outputs, output_device):
/usr/local/lib64/python3.6/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
83 output = results[i]
84 if isinstance(output, ExceptionWrapper):
---> 85 output.reraise()
86 outputs.append(output)
87 return outputs
/usr/local/lib64/python3.6/site-packages/torch/_utils.py in reraise(self)
393 # (https://bugs.python.org/issue2651), so we work around it.
394 msg = KeyErrorMessage(msg)
--> 395 raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/usr/local/lib64/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/usr/local/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "", line 10, in forward
output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
File "/usr/local/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/grid/0/tmp/hadoop-mapred/nm-local-dir/usercache/sbiswas01/appcache/application_1599069409794_172002/container_e18_1599069409794_172002_01_000001/package/transformers/modeling_roberta.py", line 685, in forward
return_dict=return_dict,
File "/usr/local/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/grid/0/tmp/hadoop-mapred/nm-local-dir/usercache/sbiswas01/appcache/application_1599069409794_172002/container_e18_1599069409794_172002_01_000001/package/transformers/modeling_roberta.py", line 424, in forward
output_attentions,
File "/usr/local/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/grid/0/tmp/hadoop-mapred/nm-local-dir/usercache/sbiswas01/appcache/application_1599069409794_172002/container_e18_1599069409794_172002_01_000001/package/transformers/modeling_roberta.py", line 364, in forward
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
File "/grid/0/tmp/hadoop-mapred/nm-local-dir/usercache/sbiswas01/appcache/application_1599069409794_172002/container_e18_1599069409794_172002_01_000001/package/transformers/modeling_utils.py", line 1672, in apply_chunking_to_forward
return forward_fn(*input_tensors)
File "/grid/0/tmp/hadoop-mapred/nm-local-dir/usercache/sbiswas01/appcache/application_1599069409794_172002/container_e18_1599069409794_172002_01_000001/package/transformers/modeling_roberta.py", line 371, in feed_forward_chunk
layer_output = self.output(intermediate_output, attention_output)
File "/usr/local/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/grid/0/tmp/hadoop-mapred/nm-local-dir/usercache/sbiswas01/appcache/application_1599069409794_172002/container_e18_1599069409794_172002_01_000001/package/transformers/modeling_roberta.py", line 309, in forward
hidden_states = self.dense(hidden_states)
File "/usr/local/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib64/python3.6/site-packages/torch/nn/modules/linear.py", line 91, in forward
return F.linear(input, self.weight, self.bias)
File "/usr/local/lib64/python3.6/site-packages/torch/nn/functional.py", line 1676, in linear
output = input.matmul(weight.t())
RuntimeError: CUDA error: CUBLAS_STATUS_INTERNAL_ERROR when calling cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)
Is it possible to solve this?