uxarray icon indicating copy to clipboard operation
uxarray copied to clipboard

Add zonal means (non-conservative)

Open erogluorhan opened this issue 3 years ago • 13 comments
trafficstars

Global and zonal means (conservative and non-conservative)

Originally posted by @erogluorhan in https://github.com/UXARRAY/uxarray/discussions/46#discussioncomment-2746606

erogluorhan avatar Aug 11 '22 20:08 erogluorhan

@hongyuchen1030 is beginning work this week on non-conservative zonal means.

paullric avatar Aug 17 '22 23:08 paullric

@paullric what does a conservative or non-conservative zonal mean mean?

rljacob avatar May 20 '24 17:05 rljacob

@erogluorhan -- Following up from our conversation in the Raijin meeting, here is my simple approach to zonal or meridional means. Not sure whether to put this here or in #527 . The approach is just to make bins based on latitude or longitude (or whatever) and then average the variable of interest.

import xarray as xr
import numpy as np


def binned_average(data, bin_data, bin_width, bin_start, bin_end):
    """
    Generalized binned average function. Returns `data` averaged within
    bins of `bin_data`.

    data : xr.DataArray
        the DataArray to be averaged.

    bin_data : xr.DataArray or np.ndarray
        determines the binning
        Expected to be one-dimensional (ncol),
        and be same size as one of the dimensions of data.
    
    bin_width: float
        width of bins in units that match bin_data

    bin_start: float
        starting value for the bins

    bin_end: float
        ending value of the bins

    """
    # Create bins
    bins = np.arange(bin_start, bin_end+bin_width, bin_width)

    bin_centers = (bins[:-1] + bins[1:]) / 2

    # Assign each value of `bin_data` to a bin
    bin_indices = np.digitize(bin_data, bins) - 1

    n_bins = len(bins) - 1 # number of bins (== len(bin_centers))

    # Create a mask for each bin
    mask = np.zeros((len(bin_data), n_bins))
    mask[np.arange(len(bin_data)), bin_indices] = 1
        
    # Identify the binned dimension in data
    binned_dim = [dim for dim in data.dims if data[dim].size == bin_data.size][0]

    # Compute the sum and count for each bin
    weighted_sum = data.dot(xr.DataArray(mask, dims=[binned_dim, 'bins']))

    count = xr.DataArray(mask, dims=[binned_dim, 'bins']).sum(dim=binned_dim)
    
    # Compute average
    result = weighted_sum / count
    
    # Replace inf and nan with a missing value (e.g., np.nan)
    result = result.where(np.isfinite(result), np.nan)

    # Set coordinates for the bins
    result = result.assign_coords(bins=bin_centers)
    
    # Add attributes if possible
    if hasattr(bin_data, 'name'):
        add_str = f'by {bin_data.name}'
    else:
        add_str = ''
    result.attrs['long_name'] = f'{data.name} binned {add_str}'
    result.attrs['units'] = data.attrs.get('units', '')
    result.bins.attrs['units'] = bin_data.attrs.get('units', '')
    result.bins.attrs['long_name'] = 'bins'
    
    return result 


def lon_binned_average(data, lon, bin_width=10):
    # Ensure data and lon are xarray DataArrays
    if not isinstance(data, xr.DataArray):
        raise TypeError("data must be an xarray DataArray")
    if not isinstance(lon, xr.DataArray):
        raise TypeError("lon must be an xarray DataArray")
    
    # Ensure longitudes are in the range [0, 360)
    lon_values = lon.values % 360

    # Create bins
    bin_range = (0, 360)

    result = binned_average(data, lon, bin_width, bin_range[0], bin_range[1])
    return result 

def zonal_average(data, lat, bin_width=10):
    # Ensure data and lon are xarray DataArrays
    if not isinstance(data, xr.DataArray):
        raise TypeError("data must be an xarray DataArray")
    if not isinstance(lat, xr.DataArray):
        raise TypeError("lat must be an xarray DataArray")

    # Ensure longitudes are in the range [-90, 90)
    # otherwise add an extra bin
    if lat.max().item() >= 90:
        bin_range = (-90, 90+bin_width) # if values are at edges then digitize gives extra value
    else:
        bin_range = (-90, 90)


    result = binned_average(data, lat, bin_width, bin_range[0], bin_range[1])
    return result 

brianpm avatar Aug 19 '24 18:08 brianpm

@erogluorhan -- Following up from our conversation in the Raijin meeting, here is my simple approach to zonal or meridional means. Not sure whether to put this here or in #527 . The approach is just to make bins based on latitude or longitude (or whatever) and then average the variable of interest.

import xarray as xr
import numpy as np


def binned_average(data, bin_data, bin_width, bin_start, bin_end):
    """
    Generalized binned average function. Returns `data` averaged within
    bins of `bin_data`.

    data : xr.DataArray
        the DataArray to be averaged.

    bin_data : xr.DataArray or np.ndarray
        determines the binning
        Expected to be one-dimensional (ncol),
        and be same size as one of the dimensions of data.
    
    bin_width: float
        width of bins in units that match bin_data

    bin_start: float
        starting value for the bins

    bin_end: float
        ending value of the bins

    """
    # Create bins
    bins = np.arange(bin_start, bin_end+bin_width, bin_width)

    bin_centers = (bins[:-1] + bins[1:]) / 2

    # Assign each value of `bin_data` to a bin
    bin_indices = np.digitize(bin_data, bins) - 1

    n_bins = len(bins) - 1 # number of bins (== len(bin_centers))

    # Create a mask for each bin
    mask = np.zeros((len(bin_data), n_bins))
    mask[np.arange(len(bin_data)), bin_indices] = 1
        
    # Identify the binned dimension in data
    binned_dim = [dim for dim in data.dims if data[dim].size == bin_data.size][0]

    # Compute the sum and count for each bin
    weighted_sum = data.dot(xr.DataArray(mask, dims=[binned_dim, 'bins']))

    count = xr.DataArray(mask, dims=[binned_dim, 'bins']).sum(dim=binned_dim)
    
    # Compute average
    result = weighted_sum / count
    
    # Replace inf and nan with a missing value (e.g., np.nan)
    result = result.where(np.isfinite(result), np.nan)

    # Set coordinates for the bins
    result = result.assign_coords(bins=bin_centers)
    
    # Add attributes if possible
    if hasattr(bin_data, 'name'):
        add_str = f'by {bin_data.name}'
    else:
        add_str = ''
    result.attrs['long_name'] = f'{data.name} binned {add_str}'
    result.attrs['units'] = data.attrs.get('units', '')
    result.bins.attrs['units'] = bin_data.attrs.get('units', '')
    result.bins.attrs['long_name'] = 'bins'
    
    return result 


def lon_binned_average(data, lon, bin_width=10):
    # Ensure data and lon are xarray DataArrays
    if not isinstance(data, xr.DataArray):
        raise TypeError("data must be an xarray DataArray")
    if not isinstance(lon, xr.DataArray):
        raise TypeError("lon must be an xarray DataArray")
    
    # Ensure longitudes are in the range [0, 360)
    lon_values = lon.values % 360

    # Create bins
    bin_range = (0, 360)

    result = binned_average(data, lon, bin_width, bin_range[0], bin_range[1])
    return result 

def zonal_average(data, lat, bin_width=10):
    # Ensure data and lon are xarray DataArrays
    if not isinstance(data, xr.DataArray):
        raise TypeError("data must be an xarray DataArray")
    if not isinstance(lat, xr.DataArray):
        raise TypeError("lat must be an xarray DataArray")

    # Ensure longitudes are in the range [-90, 90)
    # otherwise add an extra bin
    if lat.max().item() >= 90:
        bin_range = (-90, 90+bin_width) # if values are at edges then digitize gives extra value
    else:
        bin_range = (-90, 90)


    result = binned_average(data, lat, bin_width, bin_range[0], bin_range[1])
    return result 

Thanks for sharing this! This would be a nice addition to have paired with the more accurate implementation in #785 Do you have any literature or references for the approach you've shared?

philipc2 avatar Aug 19 '24 19:08 philipc2

I don't have any references... it just seemed like the most straightforward approach.

brianpm avatar Aug 19 '24 20:08 brianpm

Hi @brianpm thanks a lot for sharing your code! Hi @paullric @hongyuchen1030 could you have a look into Brian's code and assess the difference between it and your algorithm?

erogluorhan avatar Aug 20 '24 19:08 erogluorhan

Hi @brianpm,

Thank you very much for your provided algorithm, I wonder if you can give some detailed explanation about how to get the cases about an anti-meridian face and that at the equator, two faces are sharing the same interval of the equator? Thanks!

And also for the following codes, I wonder 1. where is the lon_binned_average used? 2. And where is the usage lon_values inside this function as well. Thank you very much again

def lon_binned_average(data, lon, bin_width=10):
    # Ensure data and lon are xarray DataArrays
    if not isinstance(data, xr.DataArray):
        raise TypeError("data must be an xarray DataArray")
    if not isinstance(lon, xr.DataArray):
        raise TypeError("lon must be an xarray DataArray")
    
    # Ensure longitudes are in the range [0, 360)
    lon_values = lon.values % 360

    # Create bins
    bin_range = (0, 360)

    result = binned_average(data, lon, bin_width, bin_range[0], bin_range[1])
    return result 

hongyuchen1030 avatar Aug 20 '24 20:08 hongyuchen1030

@brianpm

In addition to what @hongyuchen1030 mentioned, I'm curious if you've run this or tested the approach on any data? There doesn't appear to be any unstructured connectivity referenced in the algorithm. Is this implementation for a structured grid?

philipc2 avatar Aug 20 '24 21:08 philipc2

@erogluorhan @brianpm @hongyuchen1030 Yes this is the simplest implementation of an approximate zonal mean. Basically identify the face centers and bin the data by latitude or longitude. It'll be fast, but will fail when the bin width is less than the grid spacing or when the grid cells are not uniformly spaced along the line of constant latitude (as with regionally refined grids). I can think of some solutions to this latter issue that avoid geometric operations, but I wouldn't implement it as is without this fix.

paullric avatar Aug 21 '24 21:08 paullric

Hi all-- sorry for the slow response. Agree that there are some potential issues with this approach regarding very non-uniform meshes, and I didn't bother with proper area weighting. I'll try to post an example with actual data to show how it works, I've tested with CAM-SE and regular meshes so far. I'll try to add a MPAS grid when I get a chance to revisit, and I'll try to add the area weighting..... but for sure this was just meant to show the quick and dirty approach.

brianpm avatar Aug 27 '24 20:08 brianpm

Hi all -- here is the expanded example.

First, here is the relevant code:

import xarray as xr
import numpy as np
import matplotlib.pyplot as plt


def binned_average(data, bin_data, bin_width, bin_start, bin_end, weights=None):
    """
    Generalized binned average function. Returns `data` averaged within
    bins of `bin_data`.

    data : xr.DataArray
        the DataArray to be averaged.

    bin_data : xr.DataArray or np.ndarray
        determines the binning
        Expected to be one-dimensional (ncol),
        and be same size as one of the dimensions of data.
    
    bin_width: float
        width of bins in units that match bin_data

    bin_start: float
        starting value for the bins

    bin_end: float
        ending value of the bins

    weights: xr.DataArray
        weighting to apply, must be same shape as data
    """
    if weights is not None:
        assert weights.shape == data.shape, 'Weights must be same shape as data'
    else:
        weights = xr.DataArray(np.ones(data.shape), dims=data.dims, coords=data.coords)

    # Create bins
    bins = np.arange(bin_start, bin_end+bin_width, bin_width)

    bin_centers = (bins[:-1] + bins[1:]) / 2

    # Assign each value of `bin_data` to a bin
    bin_indices = np.digitize(bin_data, bins) - 1

    n_bins = len(bins) - 1 # number of bins (== len(bin_centers))

    # Create a mask for each bin
    mask = np.zeros((len(bin_data), n_bins))
    mask[np.arange(len(bin_data)), bin_indices] = 1
        
    # Identify the binned dimension in data
    binned_dim = [dim for dim in data.dims if data[dim].size == bin_data.size][0]

    # Compute the weighted sum for eacn bin
    weighted_sum = (data * weights).dot(xr.DataArray(mask, dims=[binned_dim, 'bins']))
    # Compute the sum of weights for each bin
    weight_sum = weights.dot(xr.DataArray(mask, dims=[binned_dim, 'bins']))
    # Compute average
    result = weighted_sum / weight_sum
    # Replace inf and nan with np.nan
    result = result.where(np.isfinite(result), np.nan)
    # Set coordinates for the bins
    result = result.assign_coords(bins=bin_centers)
    
    # Add attributes if possible
    if hasattr(bin_data, 'name'):
        add_str = f'by {bin_data.name}'
    else:
        add_str = ''
    result.attrs['long_name'] = f'{data.name} binned {add_str}'
    result.attrs['units'] = data.attrs.get('units', '')
    result.bins.attrs['units'] = bin_data.attrs.get('units', '')
    result.bins.attrs['long_name'] = 'bins'
    
    return result 


def lon_binned_average(data, lon, bin_width=10, weights=None):
    """Example application of binned average. This is the 'meridional average'
       that bins values of longitude. This is used, for example, to visualize
       the SST gradient across the Pacific Ocean, which differs depending on
       the phase of the ENSO cycle. 
    """
    # Ensure data and lon are xarray DataArrays
    if not isinstance(data, xr.DataArray):
        raise TypeError("data must be an xarray DataArray")
    if not isinstance(lon, xr.DataArray):
        raise TypeError("lon must be an xarray DataArray")
    
    # Ensure longitudes are in the range [0, 360)
    lon_values = lon.values % 360

    # Create bins
    bin_range = (0, 360)

    result = binned_average(data, lon, bin_width, bin_range[0], bin_range[1], weights=weights)
    return result 


def zonal_average(data, lat, bin_width=10, weights=None):
    """Example application of binned average. 
       This is the 'zonal average'
       that bins values of latitude.
    """
    # Ensure data and lon are xarray DataArrays
    if not isinstance(data, xr.DataArray):
        raise TypeError("data must be an xarray DataArray")
    if not isinstance(lat, xr.DataArray):
        raise TypeError("lat must be an xarray DataArray")

    # Ensure longitudes are in the range [-90, 90)
    # otherwise add an extra bin
    if lat.max().item() >= 90:
        bin_range = (-90, 90+(0.5*bin_width)) # if values are at edges then digitize gives extra value
    else:
        bin_range = (-90, 90)


    result = binned_average(data, lat, bin_width, bin_range[0], bin_range[1], weights=weights)
    return result 

Second, here are some examples using that code:

# example data
# E3SM output on the cubed sphere mesh
ds = xr.open_dataset("/Volumes/Samsung_T5/E3SM_abrupt4xCO2/20180215.DECKv1b_abrupt4xCO2.ne30_oEC.edison.cam.h0.TS.ncrcat.nc")
# Data note: this dataset has a lot of times, too.

data = ds['TS']
area = ds['area']
lat = ds['lat']
bw = 2.5
zm_data = zonal_average(data, lat, bin_width=bw)
djf_mean = zm_data.sel(time=zm_data.time.dt.month.isin([1,2,12])).mean(dim='time')
jja_mean = zm_data.sel(time=zm_data.time.dt.month.isin([6,7,8])).mean(dim='time')

fig, ax = plt.subplots()
ax.plot(djf_mean.bins, djf_mean, color='navy', label='DJF')
ax.plot(jja_mean.bins, jja_mean, color='firebrick', label='JJA')
ax.legend()
ax.set_ylabel(getattr(data, 'units', 'N/A'))
ax.set_xlabel(f"LATITUDE (bin width: {bw} deg)")
ax.set_xlim([-90, 90])
# include the area weighting
wzm_data = zonal_average(data, lat, bin_width=bw, weights=area.broadcast_like(data))
wzm_data
djf_wmean = wzm_data.sel(time=wzm_data.time.dt.month.isin([1,2,12])).mean(dim='time')
jja_wmean = wzm_data.sel(time=wzm_data.time.dt.month.isin([6,7,8])).mean(dim='time')

fig, ax = plt.subplots()
ax.plot(djf_mean.bins, djf_mean, color='navy', label='DJF')
ax.plot(jja_mean.bins, jja_mean, color='firebrick', label='JJA')
ax.plot(djf_wmean.bins, djf_wmean, color='dodgerblue', linestyle='dashed', label='DJF, weighted')
ax.plot(jja_wmean.bins, jja_wmean, color='coral', linestyle='dashed', label='JJA, weighted')
ax.legend()
ax.set_ylabel(getattr(data, 'units', 'N/A'))
ax.set_xlabel(f"LATITUDE (bin width: {bw} deg)")
ax.set_xlim([-90, 90])
ax.set_title("E3SM Example")
# Add an MPAS example
mpas_ds = xr.open_dataset("/Users/brianpm/Desktop/amip_a240.cam.h0.1981-05.nc")
mpas_data = mpas_ds['TS'].squeeze()
mpas_lat = mpas_ds['lat'].squeeze()
mpas_area = mpas_ds['area']
# dataset note: this example is a single monthly average
mpas_zm = zonal_average(mpas_data, mpas_lat, bin_width=bw)
mpas_wzm = zonal_average(mpas_data, mpas_lat, bin_width=bw, weights=mpas_area)
fig, ax = plt.subplots()
ax.plot(mpas_zm.bins, mpas_zm, color='navy', label='unweighted')
ax.plot(mpas_wzm.bins, mpas_wzm, color='dodgerblue', linestyle='dashed', label='weighted')
ax.legend()
ax.set_ylabel(getattr(mpas_data, 'units', 'N/A'))
ax.set_xlabel(f"LATITUDE (bin width: {bw} deg)")
ax.set_xlim([-90, 90])
ax.set_title("MPAS (240km) Example")

Here are the plots that get generated: image image image

brianpm avatar Sep 07 '24 16:09 brianpm

Here is one additional example that uses a regionally refined mesh that has high resolution in the tropics but low resolution in the extratropics. This simulation was run using an aquaplanet where the sea surface temperature is prescribed using an analytic expression. The example shows getting the zonal average surface temperature from the model output on the same 2.5° latitude grid as above and comparing that with the analytic expression.

# try a regionally refined grid
rr_ds = xr.open_dataset("/Users/brianpm/Desktop/qpc6_trbelt_cam7l93_01.cam.h2a.0001-04-05-21600.nc")
rr_lat = rr_ds['lat']
rr_area = rr_ds['area']
rr_data = rr_ds['TS']
rr_wzm = zonal_average(rr_data, rr_lat, bin_width=bw, weights=rr_area.broadcast_like(rr_data))

# analytical expression used for TS in aquaplanet:
latvals = np.radians(rr_wzm.bins)
qobs = 0.5*((27*(1 - np.sin(3*latvals/2)**2))+(27*(1 - np.sin(3*latvals/2)**4)))
qobs = np.where(np.absolute(latvals) < np.pi/3, qobs, 0)
# transform to K
qobs += 273.15
fig, ax = plt.subplots()
ax.plot(rr_wzm.bins, rr_wzm[0,:], label='averaged')
ax.plot(rr_wzm.bins, qobs, marker='x', linestyle='none', label='Analytic')
ax.legend()
ax.set_title("Aquaplanet Qobs SST")

Here is the resulting plot: image

This data is available on glade here: /glade/derecho/scratch/brianpm/archive/qpc6_trbelt_cam7l93_01/atm/hist/qpc6_trbelt_cam7l93_01.cam.h2a.0001-04-05-21600.nc

brianpm avatar Sep 07 '24 16:09 brianpm

Hi @amberchen122,

Can you look over the Brian's updated zonal_mean function here? Since you're in charge of writing our zonal mean function? thanks

hongyuchen1030 avatar Sep 15 '24 18:09 hongyuchen1030