
[BUG] OOM error after running for a while

Open jai-boehringer opened this issue 7 months ago • 4 comments

Describe the bug

I am segmenting an H&E image; the algorithm runs for a while, then the GPU usage suddenly spikes and I get a CUDA OOM error. Reducing batch_size to 4 has not helped, so I am wondering if I should change bsize instead? This does not happen for other images I have segmented with your amazing algorithm (.tiff format, slightly smaller size, but able to use batch size 64!, segments in 6-10 minutes). (NVIDIA A10, ~22 GB VRAM)

Additionally, running

model = None
gc.collect()
torch.cuda.empty_cache()

does not free up GPU memory after the crash. Only restarting the kernel frees that memory.
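
A more aggressive cleanup sketch, as a guess: in a notebook, the stored last exception can keep the failed call's GPU tensors referenced, so clearing it before emptying the cache may release more memory (the sys.last_* handling here is an untested assumption):

import gc
import sys
import torch

del model                      # drop the model reference entirely
sys.last_value = None          # interactive sessions store the last exception here...
sys.last_traceback = None      # ...and its traceback references the frames' GPU tensors
gc.collect()                   # collect the now-unreferenced tensors
torch.cuda.empty_cache()       # return cached blocks to the CUDA driver
print(f"{torch.cuda.memory_allocated() / 2**30:.2f} GiB still allocated")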

To Reproduce

from cellpose import models, core, io, plot
from pathlib import Path
from tqdm import trange
from natsort import natsorted

# Input file (download first, e.g. in a shell):
# curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.1.2/Visium_HD_Human_Kidney_FFPE/Visium_HD_Human_Kidney_FFPE_tissue_image.tif

data_dir = Path("/your/folder/")
if not data_dir.exists():
    raise FileNotFoundError("directory does not exist")

image_ext = ".tif"

files = natsorted([f for f in data_dir.glob("*" + image_ext)
                   if "_masks" not in f.name and "_flows" not in f.name])

img = io.imread(files[0])

flow_threshold = 0.8
cellprob_threshold = -2
tile_norm_blocksize = 0

model = models.CellposeModel(gpu=True)
masks, flows, styles = model.eval(img, batch_size=4, flow_threshold=flow_threshold, cellprob_threshold=cellprob_threshold,
                                  normalize={"tile_norm_blocksize": tile_norm_blocksize})
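
As the run log below shows, the allocation fails during mask reconstruction (which scales with image size) rather than during the tiled network pass that batch_size controls, so a crop-based workaround may help. This is only a rough sketch: the crop size, overlap, and naive label pasting are illustrative, and cells straddling crop borders will be split.

import numpy as np

crop, overlap = 8192, 256                      # illustrative values
H, W = img.shape[:2]
full_masks = np.zeros((H, W), dtype=np.uint32)
offset = 0
for y in range(0, H, crop - overlap):
    for x in range(0, W, crop - overlap):
        tile = img[y:y + crop, x:x + crop]
        m, _, _ = model.eval(tile, batch_size=4, flow_threshold=flow_threshold,
                             cellprob_threshold=cellprob_threshold)
        m = m.astype(np.uint32)
        m[m > 0] += offset                     # keep labels unique across crops
        region = full_masks[y:y + crop, x:x + crop]
        region[region == 0] = m[region == 0]   # naive paste; real stitching would merge overlaps
        offset = int(full_masks.max())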


Run log


cellpose version: 	4.0.4.dev7+gfb5a6c0 
platform:       	linux 
python version: 	3.11.11 
torch version:  	2.3.1+cu121
decorators.py (43): 'param_categorical_covariate_keys' is not a valid key!
decorators.py (43): 'param_continuous_covariate_keys' is not a valid key!

OutOfMemoryError: CUDA out of memory. Tried to allocate 5.37 GiB. GPU 
File <command-5633562477912152>, line 1
----> 1 masks, flows, styles = model.eval(img, batch_size=4, flow_threshold=flow_threshold, cellprob_threshold=cellprob_threshold,
      2                                   normalize={"tile_norm_blocksize": tile_norm_blocksize})
      4 x1 = 4700
      5 x2 = 4820
File /databricks/python/lib/python3.11/site-packages/cellpose/models.py:333, in CellposeModel.eval(self, x, batch_size, resample, channels, channel_axis, z_axis, normalize, invert, rescale, diameter, flow_threshold, cellprob_threshold, do_3D, anisotropy, flow3D_smooth, stitch_threshold, min_size, max_size_fraction, niter, augment, tile_overlap, bsize, compute_masks, progress)
    331     niter0 = 200
    332     niter = niter0 if niter is None or niter == 0 else niter
--> 333     masks = self._compute_masks(x.shape, dP, cellprob, flow_threshold=flow_threshold,
    334                     cellprob_threshold=cellprob_threshold, min_size=min_size,
    335                 max_size_fraction=max_size_fraction, niter=niter,
    336                 stitch_threshold=stitch_threshold, do_3D=do_3D)
    337 else:
    338     masks = np.zeros(0) #pass back zeros if not compute_masks
File /databricks/python/lib/python3.11/site-packages/cellpose/models.py:446, in CellposeModel._compute_masks(self, shape, dP, cellprob, flow_threshold, cellprob_threshold, min_size, max_size_fraction, niter, do_3D, stitch_threshold)
    443 for i in iterator:
    444     # turn off min_size for 3D stitching
    445     min_size0 = min_size if stitch_threshold == 0 or nimg == 1 else -1
--> 446     outputs = dynamics.resize_and_compute_masks(
    447         dP[:, i], cellprob[i],
    448         niter=niter, cellprob_threshold=cellprob_threshold,
    449         flow_threshold=flow_threshold, resize=resize,
    450         min_size=min_size0, max_size_fraction=max_size_fraction,
    451         device=self.device)
    452     if i==0 and nimg > 1:
    453         masks = np.zeros((nimg, shape[1], shape[2]), outputs.dtype)
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:610, in resize_and_compute_masks(dP, cellprob, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, resize, device)
    587 def resize_and_compute_masks(dP, cellprob, niter=200, cellprob_threshold=0.0,
    588                              flow_threshold=0.4, do_3D=False, min_size=15,
    589                              max_size_fraction=0.4, resize=None, device=torch.device("cpu")):
    590     """Compute masks using dynamics from dP and cellprob, and resizes masks if resize is not None.
    591 
    592     Args:
   (...)
    608         tuple: A tuple containing the computed masks and the final pixel locations.
    609     """
--> 610     mask = compute_masks(dP, cellprob, niter=niter,
    611                             cellprob_threshold=cellprob_threshold,
    612                             flow_threshold=flow_threshold, do_3D=do_3D,
    613                             max_size_fraction=max_size_fraction, 
    614                             device=device)
    616     if resize is not None:
    617         dynamics_logger.warning("Resizing is depricated in v4.0.1+")
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:665, in compute_masks(dP, cellprob, p, niter, cellprob_threshold, flow_threshold, do_3D, min_size, max_size_fraction, device)
    663 if device.type == "mps":
    664     p_final = p_final.to(torch.device("cpu"))
--> 665 mask = get_masks_torch(p_final, inds, dP.shape[1:], 
    666                        max_size_fraction=max_size_fraction)
    667 del p_final
    668 # flow thresholding factored out of get_masks
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:529, in get_masks_torch(pt, inds, shape0, rpad, max_size_fraction)
    526 h1 = coo.to_dense()
    527 del coo
--> 529 hmax1 = max_pool_nd(h1.unsqueeze(0), kernel_size=5)
    530 hmax1 = hmax1.squeeze()
    531 seeds1 = torch.nonzero((h1 - hmax1 > -1e-6) * (h1 > 10))
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:478, in max_pool_nd(h, kernel_size)
    476 """ memory efficient max_pool in 2d or 3d """
    477 ndim = h.ndim - 1
--> 478 hmax = max_pool1d(h, kernel_size=kernel_size, axis=1)
    479 hmax2 = max_pool1d(hmax, kernel_size=kernel_size, axis=2)
    480 if ndim==2:
File /databricks/python/lib/python3.11/site-packages/cellpose/dynamics.py:456, in max_pool1d(h, kernel_size, axis, out)
    450 """ memory efficient max_pool thanks to Mark Kittisopikul 
    451 
    452 for stride=1, padding=kernel_size//2, requires odd kernel_size >= 3
    453 
    454 """
    455 if out is None:
--> 456     out = h.clone()
    457 else:
    458     out.copy_(h)
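
Reading the traceback: the failure is in get_masks_torch → max_pool_nd → h.clone(), i.e. while reconstructing masks from a dense tensor covering the whole image, which is why lowering batch_size (it controls the tiled network pass) does not seem to help here. A back-of-envelope check that a single float32 clone of such a tensor is in the right ballpark (the image dimensions below are illustrative guesses, not from the log):

# hedged estimate: one float32 tensor over the (padded) full image
h, w = 38000, 38000                    # illustrative full-image size
print(f"{h * w * 4 / 2**30:.2f} GiB")  # ~5.38 GiB, close to the 5.37 GiB in the log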

Screenshots

[screenshot: GPU usage over time]

Note the spike at the end, just before it crashes with the OOM error.

run.log

jai-boehringer avatar May 16 '25 18:05 jai-boehringer

Hi @mrariden, I was wondering if there is any fix here I could try? I am using pip install git+https://www.github.com/mouseland/cellpose.git instead of pip install cellpose. Does this mean I am using an out-of-date version of cellpose?

jai-boehringer avatar Jun 02 '25 21:06 jai-boehringer

This may be resolved now that we're using a smaller model. It will be available in the 4.0.5 release once it's published.

mrariden avatar Jun 25 '25 20:06 mrariden

@mrariden It seems like the model size is still the same? It's still around 1.15 GB, which is what gets downloaded from Hugging Face. I am still facing the issue when running the code above.

jai-boehringer avatar Jun 30 '25 15:06 jai-boehringer

I experience a similar issue for an image of shape (39619, 39597, 2) with version 4.0.6:

Welcome to CellposeSAM, cellpose v
cellpose version:       4.0.6
platform:               linux
python version:         3.12.11
torch version:          2.8.0+cu128! The neural network component of
CPSAM is much larger than in previous versions and CPU excution is slow.
We encourage users to use GPU/MPS if available.

I run cellpose with these options:

model = models.CellposeModel(gpu=True)
options = {'flow_threshold': 0.6, 'cellprob_threshold': 0.0, 'min_size': 8*4,
           'batch_size': 16, 'normalize': True, 'tile_overlap': 0.1, 'diameter': None}
mask_whole, _, _ = model.eval(im[:,:,[0,1]], **options)
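
To confirm where the spike happens, the call can be bracketed with the standard torch allocator counters; a minimal sketch:

import torch

torch.cuda.reset_peak_memory_stats()
try:
    mask_whole, _, _ = model.eval(im[:, :, [0, 1]], **options)
finally:
    # report the peak even if eval raises the OOM
    print(f"peak GPU memory: {torch.cuda.max_memory_allocated() / 2**30:.2f} GiB")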

The error I get after some minutes of running without problems:

Traceback (most recent call last):
  File "/home/retger/celltyping_imc/code/lunaphore/segment_cellpose.py", line 59, in <module>
    mask_whole, _, _ = model.eval(im[:,:,[0,1]], **options)      
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                  
  File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/models.py", line 338, in eval
    masks = self._compute_masks(x.shape, dP, cellprob, flow_threshold=flow_threshold,
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/models.py", line 524, in _compute_masks
    outputs = dynamics.resize_and_compute_masks(
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 610, in resize_and_compute_masks
    mask = compute_masks(dP, cellprob, niter=niter,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 665, in compute_masks
    mask = get_masks_torch(p_final, inds, dP.shape[1:], 
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                       
  File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 529, in get_masks_torch
    hmax1 = max_pool_nd(h1.unsqueeze(0), kernel_size=5)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 479, in max_pool_nd
    hmax2 = max_pool1d(hmax, kernel_size=kernel_size, axis=2)                
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/retger/.cache/uv/environments-v2/segment-cellpose-a0340e657541c014/lib/python3.12/site-packages/cellpose/dynamics.py", line 456, in max_pool1d
    out = h.clone()
          ^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 5.86 GiB. GPU 0 has a total capacity of 21.95 GiB of which 5.81 GiB is free. Including non-PyTorch memory, this process has 16.13 GiB memory in use. Of the allocated memory 15.84 GiB is allocated by PyTorch, and 58.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
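
The allocator hint at the end of that message must be set before torch creates its CUDA context; a minimal sketch of trying it (no claim that it fixes this particular spike):

import os
# must happen before the first CUDA allocation, i.e. before torch/cellpose touch the GPU
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from cellpose import models  # imports torch under the hood
model = models.CellposeModel(gpu=True)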

I also see a memory spike after a short while (GPU usage on the left, GPU RAM on the right): [screenshot]

retogerber avatar Sep 04 '25 15:09 retogerber