spatialdata
spatialdata copied to clipboard
exceeding rasterize's maximum label index
I am using sd.rasterize on cell polygons of >100k cells and run into the error.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[14], line 1
----> 1 sdata["cell_labels"] = sd.rasterize(
...
9 )
File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/_core/operations/rasterize.py:351, in rasterize(data, axes, min_coordinate, max_coordinate, target_coordinate_system, target_unit_to_pixels, target_width, target_height, target_depth, sdata, value_key, table_name, return_regions_as_labels, agg_func, return_single_channel)
349 return rasterized
350 if model in (PointsModel, ShapesModel):
--> 351 return rasterize_shapes_points(
... 368 )
369 raise ValueError(f"Unsupported model {model}.")
File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/_core/operations/rasterize.py:723, in rasterize_shapes_points(data, axes, min_coordinate, max_coordinate, target_coordinate_system, target_unit_to_pixels, target_width, target_height, target_depth, element_name, sdata, value_key, table_name, return_regions_as_labels, agg_func, return_single_channel)
721 max_uint16 = np.iinfo(np.uint16).max
722 if max_label > max_uint16:
--> 723 raise ValueError(f"Maximum label index is {max_label}. Values higher than {max_uint16} are not supported.")
724 agg = agg.astype(np.uint16)
725 return Labels2DModel.parse(agg, transformations=transformations)
ValueError: Maximum label index is 117340. Values higher than 65535 are not supported.
I was wondering if the threshold needs to be that low or could be increased?
To reproduce the error:
import numpy as np
import spatialdata as sd
def get_n_coords(n):
coords = np.random.rand(n, 2)
return coords
sdata1 = sd.SpatialData(points={"points": PointsModel.parse(get_n_coords(65534))})
sdata2 = sd.SpatialData(points={"points": PointsModel.parse(get_n_coords(65536))})
# sdata1 would work
img_extent = sd.get_extent(sdata2['points'])
sd.rasterize(
sdata2["points"],
["x", "y"],
min_coordinate=[int(img_extent["x"][0]), int(img_extent["y"][0])],
max_coordinate=[int(img_extent["x"][1]), int(img_extent["y"][1])],
target_coordinate_system="global",
target_unit_to_pixels=1,
return_regions_as_labels=True,
)
A quick workaround (assuming that cell labels don't overlap) would be the following. Note that I used circles instead of points for the example, which is closer to what I actually wanted. In case of points the .iloc fails since we'd deal with a dask dataframe in that case.
import numpy as np
import spatialdata as sd
from spatialdata.models import ShapesModel
def get_n_coords(n):
coords = np.random.rand(n, 2)
return coords
sdata = sd.SpatialData(shapes={"shapes": ShapesModel.parse(get_n_coords(65536), geometry=0, radius=np.ones(65536))})
img_extent = sd.get_extent(sdata['shapes']) # NOTE: Here I actually use sd.get_extent(sdata['image'])
N = 65535 # number of shapes that are rasterised per step
n_cells = len(sdata['shapes'])
n_iter = n_cells // N + bool(n_cells % N)
rasterize_args = {
"min_coordinate": [int(img_extent["x"][0]), int(img_extent["y"][0])],
"max_coordinate": [int(img_extent["x"][1]), int(img_extent["y"][1])],
"target_coordinate_system": "global",
"target_unit_to_pixels": 1,
"return_regions_as_labels": True,
}
for i in range(n_iter):
labels_image_ = sd.rasterize(sdata['shapes'].iloc[i*N:min((i+1)*N, n_cells)], ["x", "y"], **rasterize_args)
if i == 0:
labels_image = labels_image_
else:
labels_image.values[labels_image_.values > 0] = labels_image_.values[labels_image_.values > 0]
sdata['labels'] = labels_image