xcdat
xcdat copied to clipboard
[Feature]: Allow datasets with mixed time types
Is your feature request related to a problem?
One issue with some model datasets is that different chunks of time can be encoded with different calendars. xarray throws an error in these situations.
ds = xc.open_mfdataset("/p/climate/css03/esgf_publish/CMIP6/PMIP/MIROC/MIROC-ES2L/past1000/r1i1p1f2/Amon/tas/gn/v20200318/*nc")
TypeError: Cannot combine along dimension 'time' with mixed types. Found: DatetimeProlepticGregorian, Datetime360Day. If importing data directly from a file then setting
use_cftime=True
may fix this issue.
This occurs because the first file's time axis is encoded with 360_day
and the last is proleptic_gregorian
.
Describe the solution you'd like
It would be helpful to use preprocessing to detect this issue and attempt to decode the datasets to a common calendar.
Describe alternatives you've considered
No response
Additional context
No response
I played around with this a little bit and came up with a hacky prototype that handles both mixed calendar types and overlapping time coordinates. It assumes a lot (e.g., there are bounds, the input is a directory, time
is called time
, etc.). It also forces the issue in some cases (e.g., it will force convert 360_day calendars to real-world calendars).
Use: ds = ugly_open('/p/css03/esgf_publish/CMIP6/PMIP/MIROC/MIROC-ES2L/past1000/r1i1p1f2/Amon/tas/gn/v20200318/')
import xcdat as xc
import glob
import numpy as np
import xarray as xr
def to_cfclass(t, cfclass):
"""
force convert a cftime object to another cftime class (probably not advisable)
"""
return cfclass(t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond)
def ugly_open(data_directory):
"""
open a directory of netcdf files. If time steps overlap, the function will subset
and concatenate the file datasets together. If files have varied calendars, the
function will convert to a common calendar type.
"""
# get file list
files = glob.glob(data_directory + '/*.nc')
# initialize lists of file attributes
start_end_timepoints = []
calendars = []
units = []
decimal_times = []
# loop over files and get attributes
for fn in files:
ds = xc.open_dataset(fn)
time = ds.time.values
time_bnds = ds.time_bnds.values
# create decimal year values
# we do this, because mixed cftime calendar types aren't sortable
t0 = ds.time.values[0]
cfclass = type(t0)
t0r = cfclass(t0.year, 1, 1, 0, 0, 0)
rdiff = (t0 - t0r).total_seconds()
fyear = rdiff / (365 * 24 * 60 * 60)
dtime = t0.year + fyear
decimal_times.append(dtime)
# get time span
start_end_timepoints.append([time_bnds[0, 0], time_bnds[-1, 1]])
# get calendar
calendars.append(ds.time.encoding['calendar'])
# get units
units.append(ds.time.encoding['units'])
ds.close()
# order values by time
start_end_timepoints = np.array(start_end_timepoints)
inds = np.argsort(decimal_times)
start_end_timepoints = start_end_timepoints[inds]
files = [files[i] for i in inds]
calendars = [calendars[i] for i in inds]
units = [units[i] for i in inds]
decimal_times = [decimal_times[i] for i in inds]
# search for conflicts
subsetFlag = False
mixedCalendar = False
time_ranges = []
# loop over files to determine if conflicts exist and specify
# parameters to homogenize time axes
for i in range(len(files)-1):
# get first file start / end point
s1 = start_end_timepoints[i, 0]
e1 = start_end_timepoints[i, 1]
# get second file end point
s2 = start_end_timepoints[i+1, 0]
# get calendars
c1 = calendars[i]
c2 = calendars[i + 1]
# homogenize calendars if they conflict
if c1 != c2:
if c1 == '360_day':
# if calendar 1 is 360_day, use calendar 2
default_calendar = c2
default_units = units[i + 1]
default_cfclass = type(s2)
else:
# by default, use the first file's calendar
default_calendar = c1
default_units = units[i]
default_cfclass = type(s1)
# ensure calendars are the same class for comparisons
e1 = to_cfclass(e1, default_cfclass)
s2 = to_cfclass(s2, default_cfclass)
mixedCalendar = True
else:
default_cfclass = type(s1)
# check to see if files overlap
# if they overlap subset the first file
# otherwise keep bounds as-is
if e1 > s2:
subsetFlag = True
time_ranges.append(slice(s1.strftime(), s2.strftime()))
else:
time_ranges.append(slice(s1.strftime(), e1.strftime()))
# if subsetting / calendar conversion is required, subset each dataset
# otherwise use xc.open_mfdataset
if (subsetFlag | mixedCalendar):
time_ranges.append(slice(start_end_timepoints[-1, 0].strftime(), start_end_timepoints[-1, 1].strftime()))
# initialize dataset list
dsList = []
# loop over files to create datasets
for i, fn in enumerate(files):
# open file dataset
ds = xc.open_dataset(fn)
# subset
ds = ds.sel(time=time_ranges[i])
# if dataset has no values, continue
if len(ds.time.values) == 0:
continue
# if the time does not match the default cfclass, convert
if type(ds.time.values[0]) != default_cfclass:
# first try cftime method
# if this fails (non-standard calendar), force the issue
try:
time = [t.change_calendar(default_calendar) for t in ds.time.values]
ds['time'] = time
except ValueError:
time = ds.time.values
time = [to_cfclass(t, default_cfclass) for t in time]
ds['time'] = time
dsList.append(ds)
# concatenate
dsOut = xr.concat(dsList, data_vars='minimal', dim='time')
else:
dsOut = xc.open_mfdataset(data_directory)
return dsOut