Longitudinal normalization
Description of feature
This concerns functions that operate on `.X` or `.layers`, so 3D support, array type, and encoding are all relevant.
This issue addresses all the current normalization methods, since they are very streamlined:
- [ ] minmax_norm
- [ ] maxabs_norm
- [ ] robust_scale_norm
- [ ] quantile_norm
- [ ] power_norm
- [ ] log_norm
- [ ] offset_negative_values
**3D**
If 3D is allowed:
- [ ] function handles 3D data
- [ ] the test of the function tests both the expected 2D and 3D implementation
If 3D not allowed:
- [ ] only_2D decorator
- [ ] the test of the function tests for failure if 3D data is passed (a rough sketch follows this checklist)
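For the "3D not allowed" branch, a rough sketch of what a dimensionality guard and the corresponding failure test could look like. The `only_2d` decorator and `minmax_norm_values` helper here are illustrative stand-ins, not ehrapy's actual `only_2D` decorator or internals:

```python
from functools import wraps

import numpy as np
import pytest


def only_2d(func):
    """Reject calls whose first array argument is not 2-dimensional."""

    @wraps(func)
    def wrapper(arr, *args, **kwargs):
        if np.asarray(arr).ndim != 2:
            raise ValueError(f"{func.__name__} only supports 2D data, got {np.asarray(arr).ndim}D.")
        return func(arr, *args, **kwargs)

    return wrapper


@only_2d
def minmax_norm_values(arr: np.ndarray) -> np.ndarray:
    # Scale each feature (column) to the [0, 1] range.
    mins = arr.min(axis=0, keepdims=True)
    ptp = arr.max(axis=0, keepdims=True) - mins
    return (arr - mins) / np.where(ptp == 0, 1, ptp)


def test_minmax_norm_values_rejects_3d():
    # A (n_samples, n_features, n_timesteps) array should be rejected.
    with pytest.raises(ValueError, match="only supports 2D"):
        minmax_norm_values(np.zeros((4, 3, 2)))
```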
**array-type**
The available array types are `np.ndarray`, `dask.array`, and `scipy.sparse` matrices (see the dispatch sketch after this checklist).
- [x] function is single-dispatched, with potential not_implemented errors being raised
- [x] test is parametrized to test the different array-types
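A minimal sketch of the single-dispatch pattern over the three array types, raising a plain `NotImplementedError` instead of going through ehrapy's internal `_raise_array_type_not_implemented` helper (whose exact signature isn't assumed here):

```python
from functools import singledispatch

import numpy as np
from scipy import sparse

try:
    import dask.array as da
except ImportError:  # dask is an optional dependency
    da = None


@singledispatch
def _minmax_norm_values(arr):
    # Fallback for unsupported array types.
    raise NotImplementedError(f"minmax_norm is not implemented for {type(arr)}.")


@_minmax_norm_values.register
def _(arr: np.ndarray) -> np.ndarray:
    mins = arr.min(axis=0, keepdims=True)
    ptp = arr.max(axis=0, keepdims=True) - mins
    return (arr - mins) / np.where(ptp == 0, 1, ptp)


@_minmax_norm_values.register(sparse.spmatrix)
def _(arr):
    # Densify only for illustration; a real implementation might stay sparse or refuse.
    return _minmax_norm_values(arr.toarray())


if da is not None:

    @_minmax_norm_values.register(da.Array)
    def _(arr):
        mins = arr.min(axis=0, keepdims=True)
        ptp = arr.max(axis=0, keepdims=True) - mins
        return (arr - mins) / da.where(ptp == 0, 1, ptp)


# The test could then be parametrized over the same array types, e.g.
# @pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix, da.from_array]).
```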
**Encoding**
If encoded data is required:
- [ ] mention in the documentation that encoded data is needed, obtainable with `ep.pp.encode` (see the short example below)
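For that documentation note, something as short as the following could illustrate the workflow; the dataset loader and the exact `encode` arguments follow the usual ehrapy tutorials and may need adjusting to the current API:

```python
import ehrapy as ep

adata = ep.dt.mimic_2(encoded=False)  # any AnnData with categorical columns works here
adata = ep.pp.encode(adata, autodetect=True)  # encode categoricals first
ep.pp.minmax_norm(adata)  # normalization then operates on the encoded .X
```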
Feel free to take over https://github.com/theislab/ehrapy/pull/913 completely.
```python
from functools import singledispatch

import ehrdata as ed
import numpy as np
import xarray as xr

from ehrapy._compat import _raise_array_type_not_implemented


class StandardScaler3D:
    """Standardize features by removing the mean and scaling to unit variance, across all samples and timesteps.

    This class is similar in concept to :class:`sklearn.preprocessing.StandardScaler`, but for a 3D array of shape (n_samples, n_features, n_timesteps).
    """

    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def _fit(self, data: np.ndarray) -> None:
        # Compute mean and std along the (0, 2) axes (samples and timesteps), ignoring missing values.
        self.mean_ = np.nanmean(data, axis=(0, 2), keepdims=True)
        self.scale_ = np.nanstd(data, axis=(0, 2), keepdims=True)
        # Guard against division by zero for constant features.
        self.scale_[self.scale_ == 0] = 1.0

    def fit(self, edata: ed.EHRData) -> None:
        """Fit the StandardScaler3D object to the input data.

        Computes the mean and standard deviation for each feature across all samples and timesteps from the input EHRData's `.r` field.
        If missing values are present, they are ignored during this computation.

        Args:
            edata: Input EHRData whose `.r` field the statistics for the normalization are computed from.
        """
        if not isinstance(edata, ed.EHRData):
            raise ValueError("Input must be an EHRData object.")
        if edata.r.ndim != 3:
            raise ValueError("Input EHRData's .r field must be a 3D array.")
        self._fit(edata.r)

    def _transform(self, data: np.ndarray) -> np.ndarray:
        return (data - self.mean_) / self.scale_

    def transform(self, edata: ed.EHRData, copy: bool = False) -> ed.EHRData | None:
        """Standardize the input data.

        If missing values are present, they are ignored and remain missing values.

        Args:
            edata: Input EHRData whose `.r` field is standardized using the previously fitted statistics.
            copy: Whether to return a standardized copy instead of modifying `edata` in place.

        Returns:
            The standardized EHRData if `copy=True`, otherwise None.
        """
        if edata.r.ndim != 3:
            raise ValueError("Input EHRData's .r field must be a 3D array.")
        if copy:
            edata = edata.copy()
            edata.r = self._transform(edata.r)
            return edata
        else:
            edata.r = self._transform(edata.r)
            return None

    def fit_transform(self, edata: ed.EHRData, copy: bool = False) -> ed.EHRData | None:
        """Fit the StandardScaler3D to the input data and apply the normalization to it.

        Args:
            edata: Input EHRData with an `.r` field of shape (n_samples, n_features, n_timesteps).
            copy: Whether to return a standardized copy instead of modifying `edata` in place.

        Returns:
            The standardized EHRData if `copy=True`, otherwise None.
        """
        self.fit(edata)
        return self.transform(edata, copy=copy)


def scale_norm_3d(edata: ed.EHRData, copy: bool = False) -> ed.EHRData | None:
    """Normalize the input data by scaling each feature across all samples and timesteps.

    Args:
        edata: EHRData object with an `.r` field of shape (n_samples, n_features, n_timesteps).
        copy: Whether to return a normalized copy instead of modifying `edata` in place.

    Returns:
        The normalized EHRData if `copy=True`, otherwise None.
    """
    scaler = StandardScaler3D()
    return scaler.fit_transform(edata, copy=copy)
```