[BUG] IndexError: index out of range in self
Hi! Thanks for developing segger. I am getting this error during training, following the tutorial (https://github.com/EliHei2/segger_dev/blob/main/docs/notebooks/segger_tutorial.ipynb):
Code:
# Base directory to store Pytorch Lightning models
models_dir = Path('/dss/dssfs02/lwp-dss-0001/pn57fo/pn57fo-dss-0000/mariam/externaldata/BreastCancer/outputs/segger_output/models')
# Initialize the Lightning data module
dm = SeggerDataModule(
    data_dir=segger_data_dir,
    batch_size=2,
    num_workers=2,
)
dm.setup()
num_tx_tokens = 500
# If you use custom gene embeddings, use the following two lines instead:
# num_tx_tokens = dm.train[0].x_dict["tx"].shape[1] # Set the number of tokens to the number of genes
model = Segger(
    # is_token_based=is_token_based,
    num_tx_tokens=num_tx_tokens,
    init_emb=8,
    hidden_channels=64,
    out_channels=16,
    heads=4,
    num_mid_layers=3,
)
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
batch = dm.train[0]
model.forward(batch.x_dict, batch.edge_index_dict)
# Wrap the model in LitSegger
ls = LitSegger(model=model)
# Initialize the Lightning trainer
trainer = Trainer(
    accelerator='cuda',
    strategy='auto',
    precision='16-mixed',
    devices=1,  # set higher number if more gpus are available
    max_epochs=100,
    default_root_dir=models_dir,
    logger=CSVLogger(models_dir),
)
Error:
/dss/dsshome1/0C/go76saz2/segger_dev/src/segger/data/parquet/pyg_dataset.py:67: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
data = torch.load(filepath)
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[48], line 31
28 model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
30 batch = dm.train[0]
---> 31 model.forward(batch.x_dict, batch.edge_index_dict)
32 # Wrap the model in LitSegger
33 ls = LitSegger(model=model)
File <eval_with_key>.66:27, in forward(self, x, edge_index)
25 int_1__tx = mul_1__tx.int(); mul_1__tx = None
26 int_1__bd = mul_1__bd.int(); mul_1__bd = None
---> 27 tx_embedding__tx = self.tx_embedding.tx(int_1__tx); int_1__tx = None
28 tx_embedding__bd = self.tx_embedding.bd(int_1__bd); int_1__bd = None
29 mul_2__tx = tx_embedding__tx * mul__tx; tx_embedding__tx = None
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/torch/nn/modules/module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/torch/nn/modules/module.py:1747, in Module._call_impl(self, *args, **kwargs)
1742 # If we don't have any hooks, we want to skip the rest of the logic in
1743 # this function, and just call forward.
1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1749 result = None
1750 called_always_called_hooks = set()
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/torch/nn/modules/sparse.py:190, in Embedding.forward(self, input)
189 def forward(self, input: Tensor) -> Tensor:
--> 190 return F.embedding(
191 input,
192 self.weight,
193 self.padding_idx,
194 self.max_norm,
195 self.norm_type,
196 self.scale_grad_by_freq,
197 self.sparse,
198 )
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/torch/nn/functional.py:2551, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2545 # Note [embedding_renorm set_grad_enabled]
2546 # XXX: equivalent to
2547 # with torch.no_grad():
2548 # torch.embedding_renorm_
2549 # remove once script supports set_grad_enabled
2550 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2551 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
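It looks like some transcript token indices in the data are >= num_tx_tokens=500, which sends the nn.Embedding lookup out of range. A quick check along these lines (a rough sketch, assuming dm.train can be indexed like a PyG dataset, as in the tutorial) prints the largest "tx" index across the training tiles:
# Rough sketch: largest "tx" token index over the training tiles.
# Assumes dm.setup() has already been called, as above.
max_idx = -1
for i in range(len(dm.train)):
    tile = dm.train[i]
    max_idx = max(max_idx, int(tile.x_dict["tx"].max().item()))
print("largest tx token index:", max_idx, "=> needs num_tx_tokens >=", max_idx + 1)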
I tried to fix it using:
num_tx_tokens = int(batch.x_dict["tx"].max().item()) + 1
which then works, but then I get this error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[46], line 2
1 # Fit model
----> 2 trainer.fit(
3 model=ls,
4 datamodule=dm
5 )
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:561, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
559 self.training = True
560 self.should_stop = False
--> 561 call._call_and_handle_interrupt(
562 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
563 )
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:48, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
46 if trainer.strategy.launcher is not None:
47 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 48 return trainer_fn(*args, **kwargs)
50 except _TunerExitException:
51 _call_teardown_hook(trainer)
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:599, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
592 download_model_from_registry(ckpt_path, self)
593 ckpt_path = self._checkpoint_connector._select_ckpt_path(
594 self.state.fn,
595 ckpt_path,
596 model_provided=True,
597 model_connected=self.lightning_module is not None,
598 )
--> 599 self._run(model, ckpt_path=ckpt_path)
601 assert self.state.stopped
602 self.training = False
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:1012, in Trainer._run(self, model, ckpt_path)
1007 self._signal_connector.register_signal_handlers()
1009 # ----------------------------
1010 # RUN THE TRAINER
1011 # ----------------------------
-> 1012 results = self._run_stage()
1014 # ----------------------------
1015 # POST-Training CLEAN UP
1016 # ----------------------------
1017 log.debug(f"{self.__class__.__name__}: trainer tearing down")
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:1053, in Trainer._run_stage(self)
1051 return self.predict_loop.run()
1052 if self.training:
-> 1053 with isolate_rng():
1054 self._run_sanity_check()
1055 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
File ~/miniconda3/envs/segger2/lib/python3.11/contextlib.py:137, in _GeneratorContextManager.__enter__(self)
135 del self.args, self.kwds, self.func
136 try:
--> 137 return next(self.gen)
138 except StopIteration:
139 raise RuntimeError("generator didn't yield") from None
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/lightning/pytorch/utilities/seed.py:44, in isolate_rng(include_cuda)
22 @contextmanager
23 def isolate_rng(include_cuda: bool = True) -> Generator[None, None, None]:
24 """A context manager that resets the global random state on exit to what it was before entering.
25
26 It supports isolating the states for PyTorch, Numpy, and Python built-in random number generators.
(...) 42
43 """
---> 44 states = _collect_rng_states(include_cuda)
45 yield
46 _set_rng_states(states)
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/lightning/fabric/utilities/seed.py:137, in _collect_rng_states(include_cuda)
135 states["numpy"] = np.random.get_state()
136 if include_cuda:
--> 137 states["torch.cuda"] = torch.cuda.get_rng_state_all() if torch.cuda.is_available() else []
138 return states
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/torch/cuda/random.py:49, in get_rng_state_all()
47 results = []
48 for i in range(device_count()):
---> 49 results.append(get_rng_state(i))
50 return results
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/torch/cuda/random.py:42, in get_rng_state(device)
40 idx = current_device()
41 default_generator = torch.cuda.default_generators[idx]
---> 42 return default_generator.get_state()
RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
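One note on this: CUDA errors are reported asynchronously, so the location above (collecting the RNG state) is probably not where the assert actually fired; I suspect it is still an out-of-range embedding index, just surfacing later. Something like the following (a sketch; it assumes SeggerDataModule exposes the standard train_dataloader()) should point at the real failing operation:
# Sketch: surface the real failing operation behind the device-side assert.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before CUDA is first initialized

# Or reproduce on CPU, where an out-of-range index raises a plain IndexError:
model = model.cpu()
for batch in dm.train_dataloader():
    model.forward(batch.x_dict, batch.edge_index_dict)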
Another issue: if I use the imports from your tutorial:
from segger.data.parquet.sample import STSampleParquet
from segger.training.segger_data_module import SeggerDataModule
from segger.training.train import LitSegger
from segger.prediction.predict_parquet import segment, load_model
from lightning.pytorch.loggers import CSVLogger
from pytorch_lightning import Trainer
from pathlib import Path
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import scanpy as sc
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[65], line 2
1 # Fit model
----> 2 trainer.fit(
3 model=ls,
4 datamodule=dm
5 )
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py:554, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
507 def fit(
508 self,
509 model: "pl.LightningModule",
(...) 513 ckpt_path: Optional[_PATH] = None,
514 ) -> None:
515 r"""Runs the full optimization routine.
516
517 Args:
(...) 552
553 """
--> 554 model = _maybe_unwrap_optimized(model)
555 self.strategy._lightning_module = model
556 _verify_strategy_supports_compile(model, self.strategy)
File ~/miniconda3/envs/segger2/lib/python3.11/site-packages/pytorch_lightning/utilities/compile.py:111, in _maybe_unwrap_optimized(model)
109 return model
110 _check_mixed_imports(model)
--> 111 raise TypeError(
112 f"`model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `{type(model).__qualname__}`"
113 )
TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `LitSegger`
And when I remove the `from pytorch_lightning import LightningModule` import, I get the CUDA error I mentioned at the beginning.
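For completeness, judging from that TypeError, LitSegger appears to subclass LightningModule from the lightning.pytorch namespace, so the consistent set of imports (no mixing with pytorch_lightning) would look roughly like this:
# Sketch: take Trainer and CSVLogger from the same namespace LitSegger uses,
# instead of mixing lightning.pytorch and pytorch_lightning.
from lightning.pytorch import Trainer
from lightning.pytorch.loggers import CSVLogger
from segger.training.segger_data_module import SeggerDataModule
from segger.training.train import LitSegger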
Thank you!