[Feature Request] Convert from LabelsModel to ShapesModel

Open • lguerard opened this issue 9 months ago • 0 comments

Is your feature request related to a problem? Please describe.
I'm trying to get the results of a 3D segmentation back into the SpatialData world in order to use sopa, but I am struggling with it.

Describe the solution you'd like
Would there be a way to convert Labels to Shapes? Even 2D shapes would be fine for me, as long as the same cell keeps the same ID along the Z axis.

I saw that there was some work on supporting 3D or 2.5D in SpatialData, but I was still wondering what the best approach would be. To make it concrete, a rough sketch of what I have in mind is below.
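Here, labels_to_shapes_2d is a hypothetical helper I put together for illustration, not an existing spatialdata function; it assumes rasterio is installed and that ShapesModel.parse accepts a GeoDataFrame with extra columns. It vectorizes each z slice and keeps the label value as the cell ID, so the ID stays the same along z:

import geopandas as gpd
import numpy as np
from rasterio import features
from shapely.geometry import shape
from spatialdata.models import ShapesModel


def labels_to_shapes_2d(labels_zyx):
    # Vectorize each z slice of a (z, y, x) label volume into polygons,
    # keeping the label value as the cell ID so the same cell shares
    # one ID across all z slices.
    records = []
    for z_idx, plane in enumerate(np.asarray(labels_zyx)):
        plane = plane.astype(np.int32)
        for geom, cell_id in features.shapes(plane, mask=plane > 0):
            records.append(
                {"geometry": shape(geom), "cell_id": int(cell_id), "z": z_idx}
            )
    gdf = gpd.GeoDataFrame(records, geometry="geometry")
    # Merge the pieces of each cell within a slice into one (multi)polygon
    gdf = gdf.dissolve(by=["cell_id", "z"], as_index=False)
    return ShapesModel.parse(gdf)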

Describe alternatives you've considered
Thanks to the new distributed approach for Cellpose, I was able to run a segmentation on a large dataset and read the results back as an xarray, which SpatialData accepts when parsed with Labels3DModel.
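For reference, the parsing step looks roughly like this (a minimal sketch, with a toy array standing in for the real Cellpose output):

import numpy as np
from spatialdata.models import Labels3DModel

# Toy stand-in for the Cellpose output: a (z, y, x) label volume with one
# fake cell that keeps the same ID across all z slices
labels = np.zeros((4, 64, 64), dtype=np.uint32)
labels[:, 10:20, 10:20] = 1

parsed = Labels3DModel.parse(labels, dims=("z", "y", "x"))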

I then tried to get the results as a ShapesModel, since I would like to run Proseg (recently integrated into sopa) on them based on the transcripts. Thanks to AI and googling around, I was able to put together the method below to convert the labels to a GeoDataFrame, but it takes ages to load into napari, and I am not sure this is the best approach...

def xarray_to_geodataframe_parallel(
    dataset,
    x_dim="x",
    y_dim="y",
    z_dim=None,
    crs=None,
    as_points=True,
    n_jobs=-1,
    chunk_size=100000,
):
    """Convert an xarray Dataset or DataArray to a geopandas GeoDataFrame with parallel processing.

    Parameters
    ----------
    dataset : xarray.Dataset or xarray.DataArray
        The dataset to convert to GeoDataFrame.
    x_dim : str
        Name of the x dimension/coordinate.
    y_dim : str
        Name of the y dimension/coordinate.
    z_dim : str, optional
        Name of the z dimension/coordinate.
    crs : str or dict, optional
        Coordinate reference system to use in the GeoDataFrame.
    as_points : bool
        If True, create Point geometries. If False, create Polygon geometries.
    n_jobs : int
        Number of parallel jobs to run. -1 means using all processors.
    chunk_size : int
        Size of chunks for parallel processing.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with Shapely geometries and data values.

    Examples
    --------
    >>> import numpy as np
    >>> import xarray as xr
    >>> data = np.random.rand(3, 5, 5)
    >>> coords = {'z': np.arange(3), 'y': np.arange(5), 'x': np.arange(5)}
    >>> ds = xr.DataArray(data, coords=coords, dims=['z', 'y', 'x']).to_dataset(name='value')
    >>> gdf = xarray_to_geodataframe_parallel(ds, z_dim='z')
    >>> gdf.shape
    (75, 5)
    """
    import geopandas as gpd
    import numpy as np
    import pandas as pd
    import xarray as xr
    from shapely.geometry import Point, Polygon
    from joblib import Parallel, delayed
    import multiprocessing

    # If input is a DataArray, convert to Dataset
    if isinstance(dataset, xr.DataArray):
        dataset = dataset.to_dataset(name="value")

    # Extract coordinates
    x_coords = dataset[x_dim].values
    y_coords = dataset[y_dim].values

    # Convert to numpy arrays first
    data_arrays = {}
    for var in dataset.data_vars:
        data_arrays[var] = dataset[var].values

    # Set up number of jobs
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    print(f"Using {n_jobs} CPU cores for parallel processing")

    # Create the base DataFrame
    if z_dim is not None and z_dim in dataset.dims:
        # 3D case
        z_coords = dataset[z_dim].values

        # Create coordinate arrays
        if dataset[z_dim].size * dataset[y_dim].size * dataset[x_dim].size > 10_000_000:
            print("Large dataset detected - using chunked approach for indexing")
            # For extremely large datasets, build the frame slice by slice to
            # avoid materializing one huge 3D meshgrid in memory
            df_list = []

            # The (y, x) index grid is identical for every z slice, so compute it once
            yy, xx = np.meshgrid(
                np.arange(len(y_coords)), np.arange(len(x_coords)), indexing="ij"
            )

            for z_idx, z_val in enumerate(z_coords):
                # Create dataframe for this z slice
                df_slice = pd.DataFrame(
                    {
                        z_dim: z_val,
                        y_dim: y_coords[yy.flatten()],
                        x_dim: x_coords[xx.flatten()],
                    }
                )

                # Add data values (assumes dims are ordered (z, y, x))
                for var in dataset.data_vars:
                    df_slice[var] = data_arrays[var][z_idx].flatten()

                df_list.append(df_slice)

            df = pd.concat(df_list, ignore_index=True)
        else:
            # For smaller datasets, use meshgrid approach
            zz, yy, xx = np.meshgrid(
                np.arange(len(z_coords)),
                np.arange(len(y_coords)),
                np.arange(len(x_coords)),
                indexing="ij",
            )

            # Create DataFrame
            df = pd.DataFrame(
                {
                    z_dim: z_coords[zz.flatten()],
                    y_dim: y_coords[yy.flatten()],
                    x_dim: x_coords[xx.flatten()],
                }
            )

            # Add data values
            for var in dataset.data_vars:
                df[var] = data_arrays[var].flatten()
    else:
        # 2D case
        yy, xx = np.meshgrid(
            np.arange(len(y_coords)), np.arange(len(x_coords)), indexing="ij"
        )

        # Create DataFrame
        df = pd.DataFrame(
            {y_dim: y_coords[yy.flatten()], x_dim: x_coords[xx.flatten()]}
        )

        # Add data values
        for var in dataset.data_vars:
            df[var] = data_arrays[var].flatten()

    # Function to process chunks in parallel
    def create_geometries_chunk(chunk_df):
        if as_points:
            # Create point geometries
            geometries = [Point(x, y) for x, y in zip(chunk_df[x_dim], chunk_df[y_dim])]
        else:
            # Create polygon geometries
            x_step = np.diff(x_coords).mean() if len(x_coords) > 1 else 1
            y_step = np.diff(y_coords).mean() if len(y_coords) > 1 else 1

            geometries = [
                Polygon(
                    [(x, y), (x + x_step, y), (x + x_step, y + y_step), (x, y + y_step)]
                )
                for x, y in zip(chunk_df[x_dim], chunk_df[y_dim])
            ]

        return geometries

    # Split into chunks for parallel processing
    n_chunks = max(1, len(df) // chunk_size)
    df_chunks = np.array_split(df, n_chunks)
    print(f"Processing {len(df_chunks)} chunks in parallel...")

    # Build geometries for each chunk in parallel
    geometries_chunks = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(create_geometries_chunk)(chunk) for chunk in df_chunks
    )

    # Flatten list of geometry chunks
    geometries = [geom for chunk in geometries_chunks for geom in chunk]

    # Create GeoDataFrame with geometries
    gdf = gpd.GeoDataFrame(df, geometry=geometries, crs=crs)

    print(f"Created GeoDataFrame with {len(gdf)} rows")
    return gdf
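For context, this is roughly how I call it on the parsed labels (seg here stands in for my Labels3DModel-parsed array):

# seg is the (z, y, x) labels DataArray parsed with Labels3DModel
gdf = xarray_to_geodataframe_parallel(seg, z_dim="z", as_points=True, n_jobs=8)

# Keep only foreground pixels; the data column is named "value" when a
# DataArray is passed in
gdf = gdf[gdf["value"] > 0]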

lguerard • Apr 01 '25