[BUG] OOM error after running for a while
Describe the bug
I am segmenting an H&E image. The algorithm runs for a while, then the GPU usage suddenly spikes and I get a CUDA OOM error. Reducing batch_size to 4 has not helped, and I am wondering if I should change bsize instead? This does not happen for other images I have segmented using your amazing algorithm (.tiff format, slightly smaller size, but able to use batch size 64, and they segment in 6-10 minutes). Hardware: NVIDIA A10, ~22 GB VRAM.
Additionally, running
model = None
gc.collect()
torch.cuda.empty_cache()
does not free up GPU memory after the crash. Only restarting the kernel frees that memory.
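For context, this is roughly what changing bsize would look like (bsize is the tile-size argument visible in the model.eval signature in the traceback below; whether a smaller-than-default value actually avoids this spike is untested):
# same call as in the repro below, with a hypothetical smaller tile size (bsize) on top of batch_size=4
masks, flows, styles = model.eval(img, batch_size=4, bsize=128,
                                  flow_threshold=flow_threshold,
                                  cellprob_threshold=cellprob_threshold,
                                  normalize={"tile_norm_blocksize": tile_norm_blocksize})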
To Reproduce
from cellpose import models, core, io, plot
from pathlib import Path
from tqdm import trange
from natsort import natsorted
# Input file: download first with
#   curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.1.2/Visium_HD_Human_Kidney_FFPE/Visium_HD_Human_Kidney_FFPE_tissue_image.tif
dir = "/your/folder/"
dir = Path(dir)
if not dir.exists():
    raise FileNotFoundError("directory does not exist")
image_ext = ".tif"
files = natsorted([f for f in dir.glob("*"+image_ext) if "_masks" not in f.name and "_flows" not in f.name])
img = io.imread(files[0])
flow_threshold = 0.8
cellprob_threshold = -2
tile_norm_blocksize = 0
model = models.CellposeModel(gpu=True)
masks, flows, styles = model.eval(img, batch_size=4, flow_threshold=flow_threshold,
                                  cellprob_threshold=cellprob_threshold,
                                  normalize={"tile_norm_blocksize": tile_norm_blocksize})
Run log
cellpose version: 4.0.4.dev7+gfb5a6c0
platform: linux
python version: 3.11.11
torch version: 2.3.1+cu121
decorators.py (43): 'param_categorical_covariate_keys' is not a valid key!
decorators.py (43): 'param_continuous_covariate_keys' is not a valid key!
OutOfMemoryError: CUDA out of memory. Tried to allocate 5.37 GiB. GPU
File <command-5633562477912152>, line 1
----> 1 masks, flows, styles = model.eval(img, batch_size=4, flow_threshold=flow_threshold, cellprob_threshold=cellprob_threshold,
2 normalize={"tile_norm_blocksize": tile_norm_blocksize})
4 x1 = 4700
5 x2 = 4820
File /databricks/python/lib/python3.11/site-packages/cellpose/models.py:333, in CellposeModel.eval(self, x, batch_size, resample, channels, channel_axis, z_axis, normalize, invert, rescale, diameter, flow_threshold, cellprob_threshold, do_3D, anisotropy, flow3D_smooth, stitch_threshold, min_size, max_size_fraction, niter, augment, tile_overlap, bsize, compute_masks, progress)
331 niter0 = 200
332 niter = niter0 if niter is None or niter == 0 else niter
--> 333 masks = self._compute_masks(x.shape, dP, cellprob, flow_threshold=flow_threshold,
334 cellprob_threshold=cellprob_threshold, min_size=min_size,
335 max_size_fraction=max_size_fraction, niter=niter,
336 stitch_threshold=stitch_threshold, do_3D=do_3D)
337 else:
338 masks = np.zeros(0) #pass back zeros if not compute_masks
File /databricks/python/lib/python3.11/site-packages/cellpose/models.py:446, in CellposeModel._compute_masks(self, shape, dP, cellprob, flow_threshold, cellprob_threshold, min_size, max_size_fraction, niter, do_3D, stitch_threshold)
443 for i in iterator:
444 # turn off min_size for 3D stitching
445 min_size0 = min_size if stitch_threshold == 0 or nimg == 1 else -1
--> 446 outputs = dynamics.resize_and_compute_masks(
447 dP[:, i], cellprob[i],
448 niter=niter, cellprob_threshold=cellprob_threshold,
449 flow_threshold=flow_threshold, resize=resize,
450 min_size=min_size0, max_size_fraction=max_size_fraction,
451 device=self.device)
452 if i==0 and nimg > 1:
453 masks = np.zeros((nimg, shape[1], shape[2]), outputs.dtype)
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:610, in resize_and_compute_masks(dP, cellprob, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, resize, device)
587 def resize_and_compute_masks(dP, cellprob, niter=200, cellprob_threshold=0.0,
588 flow_threshold=0.4, do_3D=False, min_size=15,
589 max_size_fraction=0.4, resize=None, device=torch.device("cpu")):
590 """Compute masks using dynamics from dP and cellprob, and resizes masks if resize is not None.
591
592 Args:
(...)
608 tuple: A tuple containing the computed masks and the final pixel locations.
609 """
--> 610 mask = compute_masks(dP, cellprob, niter=niter,
611 cellprob_threshold=cellprob_threshold,
612 flow_threshold=flow_threshold, do_3D=do_3D,
613 max_size_fraction=max_size_fraction,
614 device=device)
616 if resize is not None:
617 dynamics_logger.warning("Resizing is depricated in v4.0.1+")
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:665, in compute_masks(dP, cellprob, p, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, device)
663 if device.type == "mps":
664 p_final = p_final.to(torch.device("cpu"))
--> 665 mask = get_masks_torch(p_final, inds, dP.shape[1:],
666 max_size_fraction=max_size_fraction)
667 del p_final
668 # flow thresholding factored out of get_masks
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:529, in get_masks_torch(pt, inds, shape0, rpad, max_size_fraction)
526 h1 = coo.to_dense()
527 del coo
--> 529 hmax1 = max_pool_nd(h1.unsqueeze(0), kernel_size=5)
530 hmax1 = hmax1.squeeze()
531 seeds1 = torch.nonzero((h1 - hmax1 > -1e-6) * (h1 > 10))
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:478, in max_pool_nd(h, kernel_size)
476 """ memory efficient max_pool in 2d or 3d """
477 ndim = h.ndim - 1
--> 478 hmax = max_pool1d(h, kernel_size=kernel_size, axis=1)
479 hmax2 = max_pool1d(hmax, kernel_size=kernel_size, axis=2)
480 if ndim==2:
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:456, in max_pool1d(h, kernel_size, axis, out)
450 """ memory efficient max_pool thanks to Mark Kittisopikul
451
452 for stride=1, padding=kernel_size//2, requires odd kernel_size >= 3
453
454 """
455 if out is None:
--> 456 out = h.clone()
457 else:
458 out.copy_(h)
Screenshots
Please note the spike at the end, right before it crashes with the OOM error.
Hi @mrariden, I was wondering if there is any fix here I could try? I am installing with pip install git+https://www.github.com/mouseland/cellpose.git instead of pip install cellpose. Does this mean I am using an out-of-date version of cellpose?
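For reference, one quick way to check which cellpose version is actually installed in the environment, using standard package metadata:
import importlib.metadata
print(importlib.metadata.version("cellpose"))  # should match the version printed in the run log above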
This may be resolved now that we're using a smaller model. It will be available in the 4.0.5 release once it's published.
@mrariden It seems like the model size is still the same? It's still around 1.15 GB, which is what gets downloaded from Hugging Face. I am still facing the issue when running the code above.
I experience a similar issue for an image of shape (39619, 39597, 2) with version 4.0.6:
Welcome to CellposeSAM, cellpose v
cellpose version: 4.0.6
platform: linux
python version: 3.12.11
torch version: 2.8.0+cu128! The neural network component of
CPSAM is much larger than in previous versions and CPU excution is slow.
We encourage users to use GPU/MPS if available.
I run cellpose with these options:
model = models.CellposeModel(gpu=True)
options = {'flow_threshold': 0.6, 'cellprob_threshold': 0.0, 'min_size': 8*4, 'batch_size': 16, 'normalize': True, 'tile_overlap': 0.1,'diameter': None}
mask_whole, _, _ = model.eval(im[:,:,[0,1]], **options)
The error I get after some minutes of running without problems:
Traceback (most recent call last):
File "/home/retger/celltyping_imc/code/lunaphore/segment_cellpose.py", line 59, in <module>
mask_whole, _, _ = model.eval(im[:,:,[0,1]], **options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/models.py", line 338, in eval
    masks = self._compute_masks(x.shape, dP, cellprob, flow_threshold=flow_threshold,
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/models.py", line 524, in _compute_masks
outputs = dynamics.resize_and_compute_masks(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 610, in resize_and_compute_masks
mask = compute_masks(dP, cellprob, niter=niter,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 665, in compute_masks
mask = get_masks_torch(p_final, inds, dP.shape[1:],
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 529, in get_masks_torch
hmax1 = max_pool_nd(h1.unsqueeze(0), kernel_size=5)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 479, in max_pool_nd
hmax2 = max_pool1d(hmax, kernel_size=kernel_size, axis=2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 456, in max_pool1d out = h.clone()
^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 5.86 GiB. GPU 0 has a total capacity of 21.95 GiB of which 5.81 GiB is free. Including non-PyTorch memory, this process has 16.13 GiB memory in use. Of the allocated memory 15.84 GiB is allocated by PyTorch, and 58.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
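The OOM message itself suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce fragmentation. A minimal sketch of that, untested for this particular spike (the variable has to be set before torch is imported):
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # must be set before torch is imported

from cellpose import models  # torch is imported here, after the env var is set
model = models.CellposeModel(gpu=True)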
I also see a spike in memory after a short while (GPU usage on the left, GPU RAM on the right):