xMIP
failed to join/concatenate datasets for model NorESM2-MM
I use the following code to load NorESM2-MM:
import intake
from cmip6_preprocessing.preprocessing import combined_preprocessing

url = "https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json"
col = intake.open_esm_datastore(url)

model = 'NorESM2-MM'
query = dict(experiment_id=['historical'], table_id='Omon',
             variable_id='tos', grid_label=['gn'], source_id=model)
cat = col.search(**query)
print(cat.df['source_id'].unique())

z_kwargs = {'consolidated': True, 'decode_times': False}
tos_dict = cat.to_dataset_dict(zarr_kwargs=z_kwargs, preprocess=combined_preprocessing)
and I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/merge_util.py in join_new(dsets, dim_name, coord_value, varname, options, group_key)
55 concat_dim = xr.DataArray(coord_value, dims=(dim_name), name=dim_name)
---> 56 return xr.concat(dsets, dim=concat_dim, data_vars=varname, **options)
57 except Exception as exc:
/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/concat.py in concat(objs, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs)
190 )
--> 191 return f(
192 objs, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs
/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/concat.py in _dataset_concat(datasets, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs)
383 datasets = list(
--> 384 align(*datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value)
385 )
/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/alignment.py in align(join, copy, indexes, exclude, fill_value, *objects)
352 else:
--> 353 new_obj = obj.reindex(
354 copy=copy, fill_value=fill_value, indexers=valid_indexers
/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/dataset.py in reindex(self, indexers, method, tolerance, copy, fill_value, **indexers_kwargs)
2622 """
-> 2623 return self._reindex(
2624 indexers,
/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/dataset.py in _reindex(self, indexers, method, tolerance, copy, fill_value, sparse, **indexers_kwargs)
2651
-> 2652 variables, indexes = alignment.reindex_variables(
2653 self.variables,
/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/alignment.py in reindex_variables(variables, sizes, indexes, indexers, method, tolerance, copy, fill_value, sparse)
564 if not index.is_unique:
--> 565 raise ValueError(
566 "cannot reindex or align along dimension %r because the "
ValueError: cannot reindex or align along dimension 'time' because the index has duplicate values
The above exception was the direct cause of the following exception:
AggregationError Traceback (most recent call last)
<ipython-input-25-1b10fdf987c0> in <module>
37
38 z_kwargs = {'consolidated': True, 'decode_times':True}
---> 39 tos_dict = cat.to_dataset_dict(zarr_kwargs=z_kwargs,
40 preprocess=combined_preprocessing)
41
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/core.py in to_dataset_dict(self, zarr_kwargs, cdf_kwargs, preprocess, storage_options, progressbar, aggregate)
928 ]
929 for i, task in enumerate(concurrent.futures.as_completed(future_tasks)):
--> 930 key, ds = task.result()
931 self._datasets[key] = ds
932 if self.progressbar:
/srv/conda/envs/notebook/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433
434 self._condition.wait(timeout)
/srv/conda/envs/notebook/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
/srv/conda/envs/notebook/lib/python3.8/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/core.py in _load_source(key, source)
914
915 def _load_source(key, source):
--> 916 return key, source.to_dask()
917
918 sources = {key: source(**source_kwargs) for key, source in self.items()}
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/source.py in to_dask(self)
244 def to_dask(self):
245 """Return xarray object (which will have chunks)"""
--> 246 self._load_metadata()
247 return self._ds
248
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake/source/base.py in _load_metadata(self)
124 """load metadata only if needed"""
125 if self._schema is None:
--> 126 self._schema = self._get_schema()
127 self.datashape = self._schema.datashape
128 self.dtype = self._schema.dtype
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/source.py in _get_schema(self)
173
174 if self._ds is None:
--> 175 self._open_dataset()
176
177 metadata = {
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/source.py in _open_dataset(self)
230 n_agg = len(self.aggregation_columns)
231
--> 232 ds = _aggregate(
233 self.aggregation_dict,
234 self.aggregation_columns,
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/merge_util.py in _aggregate(aggregation_dict, agg_columns, n_agg, nd, mapper_dict, group_key)
238 return ds
239
--> 240 return apply_aggregation(nd)
241
242
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/merge_util.py in apply_aggregation(nd, agg_column, key, level)
194 agg_options = {}
195
--> 196 dsets = [
197 apply_aggregation(value, agg_column, key=key, level=level + 1)
198 for key, value in nd.items()
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/merge_util.py in <listcomp>(.0)
195
196 dsets = [
--> 197 apply_aggregation(value, agg_column, key=key, level=level + 1)
198 for key, value in nd.items()
199 ]
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/merge_util.py in apply_aggregation(nd, agg_column, key, level)
216 if agg_type == 'join_new':
217 varname = dsets[0].attrs['intake_esm_varname']
--> 218 ds = join_new(
219 dsets,
220 dim_name=agg_column,
/srv/conda/envs/notebook/lib/python3.8/site-packages/intake_esm/merge_util.py in join_new(dsets, dim_name, coord_value, varname, options, group_key)
69 """
70
---> 71 raise AggregationError(message) from exc
72
73
AggregationError:
Failed to join/concatenate datasets in group with key=CMIP.NCC.NorESM2-MM.historical.Omon.gn along a new dimension `member_id`.
*** Arguments passed to xarray.concat() ***:
- objs: a list of 3 datasets
- dim: <xarray.DataArray 'member_id' (member_id: 3)>
array(['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1'], dtype='<U8')
Dimensions without coordinates: member_id
- data_vars: ['tos']
- and kwargs: {'coords': 'minimal', 'compat': 'override'}
I have the same problem. 'NorESM2-MM' is one out of several cases where this is happening. For some models, cat.to_dataset_dict() fails to combine multiple ensemble members. I have not looked into this further, but maybe the coords are slightly different across the different members? A quick fix for me is to specify the member_id (i.e., pick one out of the list ['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1']):
query = dict(experiment_id=['historical'], table_id='Omon',
             variable_id='tos', grid_label=['gn'], source_id=model,
             member_id='r2i1p1f1')
So I found that member r3i1p1f1 has some duplicate time values. I could drop the non-unique values, but I'm not sure whether the duplicates are "real".
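For reference, one way to locate the offending timestamps (a minimal sketch; here ds stands for the dataset of the failing member, e.g. r3i1p1f1 loaded on its own):
time_index = ds.get_index('time')
dup_mask = time_index.duplicated(keep=False)  # True at every repeated timestamp
print(time_index[dup_mask])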
Ah, that might be the problem. If the time points are exact duplicates, I think you can just drop them. As I see in your query, you select monthly data (table_id='Omon') from the historical run (experiment_id=['historical']). The protocol for the historical run spans the years 1850 to 2014, so the time dimension of each dataset (irrespective of the member id) should have length 1980 (the number of monthly fields between 1850 and 2014). I think this is always the case for the historical runs.
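As a quick sanity check of that expectation (a sketch, assuming ds holds a single member):
expected = (2014 - 1850 + 1) * 12  # 1980 monthly fields for the historical period
print(ds.sizes['time'], 'expected:', expected)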
My guess is that these duplicates sneaked in by mistake. It is not caused by the cmip6_preprocessing routines. The mistake could originate from the source data (or perhaps it happened when the data were uploaded to the cloud archive?).
In general, it would be nice to fix this on the fly without having to select each member_id separately.
@jbusecke Do you think this is possible as part of the combined_preprocessing routine?
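Until something like that lands in the package, a stopgap might be to wrap combined_preprocessing yourself (a sketch; drop_duplicate_times is a hypothetical helper, not part of the cmip6_preprocessing API):
import numpy as np
from cmip6_preprocessing.preprocessing import combined_preprocessing

def drop_duplicate_times(ds):
    # keep only the first occurrence of each timestamp (hypothetical helper)
    if 'time' in ds.dims:
        _, first_idx = np.unique(ds['time'].values, return_index=True)
        if len(first_idx) < ds.sizes['time']:
            ds = ds.isel(time=np.sort(first_idx))
    return ds

def preprocess(ds):
    return drop_duplicate_times(combined_preprocessing(ds))

tos_dict = cat.to_dataset_dict(zarr_kwargs=z_kwargs, preprocess=preprocess)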
Sorry for the late reply. I have noticed some of these myself, and if dropping them actually fixes the problem, we can surely do that from within combined_preprocessing. I would, however, first like to know whether these are actually just duplicated timesteps or whether some of the timesteps got shifted around (that would be way worse). Could you check the duplicated times and assert that the values are the same, @sckw or @jetesdal?
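Something along these lines should do the check (a sketch; ds is again the dataset of the member with duplicated times):
import numpy as np
import xarray as xr

time_index = ds.get_index('time')
dups = ds.isel(time=np.flatnonzero(time_index.duplicated(keep=False)))
for t in dups.get_index('time').unique():
    entries = dups.sel(time=t)  # all fields that share this timestamp
    for i in range(1, entries.sizes['time']):
        # raises an AssertionError if the duplicated fields actually differ
        xr.testing.assert_equal(entries.isel(time=0), entries.isel(time=i))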