[Feature Request] Convert from LabelsModel to ShapesModel
**Is your feature request related to a problem? Please describe.**
I'm trying to get the results of a 3D segmentation back into the SpatialData world in order to use sopa, but I am struggling with it.
**Describe the solution you'd like**
Would there be a way to convert Labels to Shapes? Even 2D shapes would be fine for me, as long as the same cell keeps the same ID along the Z axis.
I saw that there has been some work on supporting 3D or 2.5D in SpatialData, but I was still wondering what the best approach would be...
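To make the request a bit more concrete, here is a minimal sketch of the kind of per-slice conversion I have in mind. This is not spatialdata API, just my own rough idea: it assumes a `(z, y, x)` integer label array, and the helper name and the use of `rasterio.features.shapes` + geopandas are my own choices.

```python
import geopandas as gpd
import numpy as np
from rasterio import features
from shapely.geometry import shape


def labels_to_polygons_per_slice(labels: np.ndarray) -> gpd.GeoDataFrame:
    """One polygon per cell per z-slice, re-using the same label ID along z."""
    records = []
    for z_idx, plane in enumerate(labels):
        # rasterio.features.shapes only accepts a few dtypes, so cast defensively
        plane = plane.astype(np.int32)
        # yields (geojson geometry, label value) pairs for each labelled region
        for geom, value in features.shapes(plane, mask=plane > 0):
            records.append({"geometry": shape(geom), "label": int(value), "z": z_idx})
    return gpd.GeoDataFrame(records, geometry="geometry")
```

Each cell would contribute one polygon per z-slice, with all of its rows sharing the same `label` value.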
**Describe alternatives you've considered**
Thanks to the new distributed approach for Cellpose, I was able to run a segmentation on a large dataset and read the results back as an xarray, which can be loaded into SpatialData when parsed with Labels3DModel.
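For reference, this is roughly how I am loading the labels right now (a minimal sketch; `cellpose_labels` stands in for the `(z, y, x)` array coming out of the distributed Cellpose run, and the argument names reflect my understanding of the models API):

```python
import spatialdata as sd
from spatialdata.models import Labels3DModel

# cellpose_labels: (z, y, x) integer label array from the distributed Cellpose run
labels = Labels3DModel.parse(cellpose_labels, dims=("z", "y", "x"))
sdata = sd.SpatialData(labels={"cellpose_3d": labels})
```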
I then tried to get the results as a ShapesModel, since I would like to run Proseg on them (recently integrated into sopa) based on the transcripts. Thanks to AI and some googling around, I came up with the method below to convert the labels to a GeoDataFrame, but it takes ages to load into napari, and I am not sure that this is the best approach...
```python
def xarray_to_geodataframe_parallel(
dataset,
x_dim="x",
y_dim="y",
z_dim=None,
crs=None,
as_points=True,
n_jobs=-1,
chunk_size=100000,
):
"""Convert an xarray Dataset or DataArray to a geopandas GeoDataFrame with parallel processing.
Parameters
----------
dataset : xarray.Dataset or xarray.DataArray
The dataset to convert to GeoDataFrame.
x_dim : str
Name of the x dimension/coordinate.
y_dim : str
Name of the y dimension/coordinate.
z_dim : str, optional
Name of the z dimension/coordinate.
crs : str or dict, optional
Coordinate reference system to use in the GeoDataFrame.
as_points : bool
If True, create Point geometries. If False, create Polygon geometries.
n_jobs : int
Number of parallel jobs to run. -1 means using all processors.
chunk_size : int
Size of chunks for parallel processing.
Returns
-------
geopandas.GeoDataFrame
GeoDataFrame with Shapely geometries and data values.
Examples
--------
>>> import numpy as np
>>> import xarray as xr
>>> data = np.random.rand(3, 5, 5)
>>> coords = {'z': np.arange(3), 'y': np.arange(5), 'x': np.arange(5)}
>>> ds = xr.DataArray(data, coords=coords, dims=['z', 'y', 'x']).to_dataset(name='value')
>>> gdf = xarray_to_geodataframe_parallel(ds, z_dim='z')
>>> gdf.shape
(75, 5)
"""
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
from shapely.geometry import Point, Polygon
from joblib import Parallel, delayed
import multiprocessing
# If input is a DataArray, convert to Dataset
if isinstance(dataset, xr.DataArray):
dataset = dataset.to_dataset(name="value")
# Extract coordinates
x_coords = dataset[x_dim].values
y_coords = dataset[y_dim].values
# Convert to numpy arrays first
data_arrays = {}
for var in dataset.data_vars:
data_arrays[var] = dataset[var].values
# Set up number of jobs
if n_jobs == -1:
n_jobs = multiprocessing.cpu_count()
print(f"Using {n_jobs} CPU cores for parallel processing")
# Create the base DataFrame
if z_dim is not None and z_dim in dataset.dims:
# 3D case
z_coords = dataset[z_dim].values
# Create coordinate arrays
if dataset[z_dim].size * dataset[y_dim].size * dataset[x_dim].size > 10000000:
print("Large dataset detected - using chunked approach for indexing")
# For extremely large datasets, create indices differently to avoid memory issues
df_list = []
for z_idx, z_val in enumerate(z_coords):
# Create temporary arrays for this z slice
yy, xx = np.meshgrid(
np.arange(len(y_coords)), np.arange(len(x_coords)), indexing="ij"
)
# Create dataframe for this z slice
df_slice = pd.DataFrame(
{
z_dim: z_val,
y_dim: y_coords[yy.flatten()],
x_dim: x_coords[xx.flatten()],
}
)
# Add data values
for var in dataset.data_vars:
df_slice[var] = data_arrays[var][z_idx].flatten()
df_list.append(df_slice)
df = pd.concat(df_list, ignore_index=True)
else:
# For smaller datasets, use meshgrid approach
zz, yy, xx = np.meshgrid(
np.arange(len(z_coords)),
np.arange(len(y_coords)),
np.arange(len(x_coords)),
indexing="ij",
)
# Create DataFrame
df = pd.DataFrame(
{
z_dim: z_coords[zz.flatten()],
y_dim: y_coords[yy.flatten()],
x_dim: x_coords[xx.flatten()],
}
)
# Add data values
for var in dataset.data_vars:
df[var] = data_arrays[var].flatten()
else:
# 2D case
yy, xx = np.meshgrid(
np.arange(len(y_coords)), np.arange(len(x_coords)), indexing="ij"
)
# Create DataFrame
df = pd.DataFrame(
{y_dim: y_coords[yy.flatten()], x_dim: x_coords[xx.flatten()]}
)
# Add data values
for var in dataset.data_vars:
df[var] = data_arrays[var].flatten()
# Function to process chunks in parallel
def create_geometries_chunk(chunk_df):
if as_points:
# Create point geometries
geometries = [Point(x, y) for x, y in zip(chunk_df[x_dim], chunk_df[y_dim])]
else:
# Create polygon geometries
x_step = np.diff(x_coords).mean() if len(x_coords) > 1 else 1
y_step = np.diff(y_coords).mean() if len(y_coords) > 1 else 1
geometries = [
Polygon(
[(x, y), (x + x_step, y), (x + x_step, y + y_step), (x, y + y_step)]
)
for x, y in zip(chunk_df[x_dim], chunk_df[y_dim])
]
return geometries
# Split into chunks for parallel processing
n_chunks = max(1, len(df) // chunk_size)
df_chunks = np.array_split(df, n_chunks)
print(f"Processing {len(df_chunks)} chunks in parallel...")
    # Create the geometries for each chunk in parallel
geometries_chunks = Parallel(n_jobs=n_jobs, verbose=1)(
delayed(create_geometries_chunk)(chunk) for chunk in df_chunks
)
# Flatten list of geometry chunks
geometries = [geom for chunk in geometries_chunks for geom in chunk]
# Create GeoDataFrame with geometries
gdf = gpd.GeoDataFrame(df, geometry=geometries, crs=crs)
print(f"Created GeoDataFrame with {len(gdf)} rows")
    return gdf
```
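I suspect the main problem with the function above is that it creates one geometry per voxel, so the GeoDataFrame ends up with `z * y * x` rows. If one polygon per cell per z-slice is acceptable (as sketched under "Describe the solution you'd like"), I would hope to parse the result along these lines — again just my guess at the intended usage, happy to be corrected:

```python
from spatialdata.models import ShapesModel

# labels_to_polygons_per_slice is the hypothetical helper sketched further up,
# returning one polygon per cell per z-slice with 'label' and 'z' columns
gdf = labels_to_polygons_per_slice(cellpose_labels)
shapes = ShapesModel.parse(gdf)  # polygons should only need a 'geometry' column, as far as I can tell
sdata.shapes["cellpose_boundaries"] = shapes  # or however elements are meant to be added in the current API
```

That would keep the row count around `n_cells × n_slices` instead of `n_voxels`, which I expect would also be much more manageable in napari. Is a Labels → Shapes conversion along these lines already available or planned?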