pytorch-widedeep
(Noob Question) CUDA error: device-side assert triggered
- Environment: Google Colab Pro (Tesla P100 / T4, both tested)
Error Message
Hi, I'm currently working on a Kaggle competition with a fairly large dataset (about 1.5 GB).
I'm trying to resolve the following error:
CUDA error: device-side assert triggered
The error occurs when I use both the wide and the deep model together; it does not happen when I pass only the wide or only the deep (MLP) component.
The code is exactly the same as in the binary classification example notebook.
The full traceback, with os.environ['CUDA_LAUNCH_BLOCKING'] = "1" set, is:
epoch 1: 0%| | 0/91783 [00:01<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-14-9b15622fbcea> in <module>()
5 n_epochs=1,
6 batch_size=4,
----> 7 val_split=0.2
8 )
10 frames
/usr/local/lib/python3.7/dist-packages/pytorch_widedeep/utils/general_utils.py in __call__(self, wrapped, instance, args, kwargs)
59 self.primary_name
60 ] = alias
---> 61 return wrapped(*args, **kwargs)
62
63
/usr/local/lib/python3.7/dist-packages/pytorch_widedeep/training/trainer.py in fit(self, X_wide, X_tab, X_text, X_img, X_train, X_val, val_split, target, n_epochs, validation_freq, batch_size, custom_dataloader, finetune, with_lds, **kwargs)
453 t.set_description("epoch %i" % (epoch + 1))
454 train_score, train_loss = self._train_step(
--> 455 data, targett, batch_idx, epoch, lds_weightt
456 )
457 print_loss_and_metric(t, train_loss, train_score)
/usr/local/lib/python3.7/dist-packages/pytorch_widedeep/training/trainer.py in _train_step(self, data, target, batch_idx, epoch, lds_weightt)
1021 _, y_pred = self.model(X, y, epoch)
1022 else:
-> 1023 y_pred = self.model(X)
1024
1025 if self.model.is_tabnet:
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/pytorch_widedeep/models/wide_deep.py in forward(self, X, y, epoch)
203 return self._forward_deep_with_fds(X, y, epoch)
204
--> 205 wide_out = self._forward_wide(X)
206 if self.deephead:
207 deep = self._forward_deephead(X, wide_out)
/usr/local/lib/python3.7/dist-packages/pytorch_widedeep/models/wide_deep.py in _forward_wide(self, X)
269 def _forward_wide(self, X):
270 if self.wide is not None:
--> 271 out = self.wide(X["wide"])
272 else:
273 batch_size = X[list(X.keys())[0]].size(0)
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/pytorch_widedeep/models/tabular/linear/wide.py in forward(self, X)
64 r"""Forward pass. Simply connecting the Embedding layer with the ouput
65 neuron(s)"""
---> 66 out = self.wide_linear(X.long()).sum(dim=1) + self.bias
67 return out
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/sparse.py in forward(self, input)
158 return F.embedding(
159 input, self.weight, self.padding_idx, self.max_norm,
--> 160 self.norm_type, self.scale_grad_by_freq, self.sparse)
161
162 def extra_repr(self) -> str:
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2197 # remove once script supports set_grad_enabled
2198 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2200
2201
RuntimeError: CUDA error: device-side assert triggered
I thought this might be a GPU resource problem, but the resource tab in Colab did not show high usage. Reducing the batch size to 1 did not help either.
Could you give me some advice?
Thanks for the nice library 😃
Full code
import gc

import numpy as np
import torch

from pytorch_widedeep import Trainer
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy

# train, feat_wide, feat_cross, feat_cat and feat_cont are defined earlier in the notebook

# TARGET
target = train['target'].values

# WIDE
wide_preprocessor = WidePreprocessor(
    wide_cols=feat_wide,
    crossed_cols=feat_cross)
X_wide = wide_preprocessor.fit_transform(train)
gc.collect()

# DEEP
tab_preprocessor = TabPreprocessor(
    embed_cols=feat_cat,
    continuous_cols=feat_cont
)
X_tab = tab_preprocessor.fit_transform(train)
gc.collect()

wide = Wide(input_dim=np.unique(X_wide).shape[0], pred_dim=1)
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    cat_embed_dropout=0.1,
    continuous_cols=tab_preprocessor.continuous_cols,
    mlp_hidden_dims=[400, 200],
    mlp_dropout=0.5,
    mlp_activation="leaky_relu",
)

wd_model = WideDeep(wide=wide, deeptabular=tab_mlp)
# wd_model = WideDeep(wide=wide)            # works ok
# wd_model = WideDeep(deeptabular=tab_mlp)  # works ok

wd_trainer = Trainer(
    model=wd_model,
    objective="binary",
    optimizers=torch.optim.AdamW(wd_model.parameters(), lr=0.001),
    metrics=[Accuracy],
)

wd_trainer.fit(
    X_wide=X_wide,
    X_tab=X_tab,
    target=target,
    n_epochs=1,
    batch_size=1,
    val_split=0.2
)
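As a quick sanity check (a sketch only, assuming the X_wide and wd_model defined above): a device-side assert raised inside F.embedding is very often an index that falls outside the embedding table, and the traceback above ends in the Wide component's wide_linear embedding (Embedding(449, 1, padding_idx=0) in the structure printed below).

# Sanity-check sketch (assumes X_wide and wd_model from the code above):
# every index in X_wide must be strictly smaller than the number of rows
# in the Wide embedding table, otherwise the GPU lookup asserts.
n_rows = wd_model.wide.wide_linear.num_embeddings
max_idx = int(X_wide.max())
print(f"max wide index: {max_idx}, embedding rows: {n_rows}")
if max_idx >= n_rows:
    print("out-of-range wide index: this would trigger the device-side assert on GPU")

If this check fails, building Wide with input_dim=wide_preprocessor.wide_dim instead of np.unique(X_wide).shape[0] may be worth trying, assuming the installed version exposes the wide_dim attribute.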
Structure
WideDeep(
  (wide): Wide(
    (wide_linear): Embedding(449, 1, padding_idx=0)
  )
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(
        (cat_embed): DiffSizeCatEmbeddings(
          (embed_layers): ModuleDict(
            (emb_layer_B_30_last): Embedding(5, 3, padding_idx=0)
            (emb_layer_B_38_last): Embedding(9, 5, padding_idx=0)
            (emb_layer_D_114_last): Embedding(4, 3, padding_idx=0)
            (emb_layer_D_116_last): Embedding(4, 3, padding_idx=0)
            (emb_layer_D_117_last): Embedding(9, 5, padding_idx=0)
            (emb_layer_D_120_last): Embedding(4, 3, padding_idx=0)
            (emb_layer_D_126_last): Embedding(3, 2, padding_idx=0)
            (emb_layer_D_63_last): Embedding(7, 4, padding_idx=0)
            (emb_layer_D_64_last): Embedding(5, 3, padding_idx=0)
            (emb_layer_D_66_last): Embedding(3, 2, padding_idx=0)
            (emb_layer_D_68_last): Embedding(8, 5, padding_idx=0)
          )
          (embedding_dropout): Dropout(p=0.1, inplace=False)
        )
        (cont_norm): BatchNorm1d(858, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (tab_mlp): MLP(
        (mlp): Sequential(
          (dense_layer_0): Sequential(
            (0): Dropout(p=0.5, inplace=False)
            (1): Linear(in_features=896, out_features=400, bias=True)
            (2): LeakyReLU(negative_slope=0.01, inplace=True)
          )
          (dense_layer_1): Sequential(
            (0): Dropout(p=0.5, inplace=False)
            (1): Linear(in_features=400, out_features=200, bias=True)
            (2): LeakyReLU(negative_slope=0.01, inplace=True)
          )
        )
      )
    )
    (1): Linear(in_features=200, out_features=1, bias=True)
  )
)
Hey @keonho-kim, thanks for opening the issue. We'll check ASAP; in the meantime, maybe this is useful: https://link.medium.com/j5gF6HrbFrb
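A general debugging trick for this error (a sketch, not a confirmed fix for this particular issue): run the suspect sub-module on CPU with one batch of inputs. On CPU an out-of-range embedding index raises a readable IndexError instead of the opaque device-side assert.

import copy

import torch

# Hypothetical debugging step: copy the wide component to CPU and feed it one
# batch of wide indices. An out-of-range index then raises
# "IndexError: index out of range in self" rather than the CUDA assert.
# Assumes wd_model and X_wide from the code posted above.
wide_cpu = copy.deepcopy(wd_model.wide).cpu()
batch = torch.as_tensor(X_wide[:1024])  # larger slices can be checked the same way
try:
    wide_cpu(batch)
    print("wide forward pass OK on CPU for this batch")
except IndexError as err:
    print("embedding lookup failed on CPU:", err)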
@keonho-kim did the post solve the problem?
Closing it; if necessary, we will re-open it.