Merge strategy/options
Hi,
Can I suggest adding merge strategies/options (similar to what is possible with `xr.merge`) to `to_xarray`? I'm asking this because I got the following error while making a call to this routine on forecast data where some of the steps are missing:
```
---------------------------------------------------------------------------
DatasetBuildError Traceback (most recent call last)
Input In [12], in <cell line: 1>()
3 ds = cml.load_dataset(
4 "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
5 date=dates[0],
6 parameter="all",
7 level=level
8 )
10 # download of the ensemble forecast
---> 11 fcs = ds.to_xarray()
12 new_fcs = fcs.isel(step=time_index, longitude=lon_index, latitude=lat_index)
14 # creating a new coordinate for the hdate
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/climetlab_eumetnet_postprocessing_benchmark/gridded/training_data_forecasts.py:61, in TrainingDataForecast.to_xarray(self, **kwargs)
60 def to_xarray(self, **kwargs):
---> 61 return self.source.to_xarray(xarray_open_dataset_kwargs={"backend_kwargs": {"ignore_keys": ["dataType"]}}, **kwargs)
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/climetlab/readers/grib/fieldset.py:233, in FieldSet.to_xarray(self, **kwargs)
215 xarray_open_dataset_kwargs[key] = mix_kwargs(
216 user=user_xarray_open_dataset_kwargs.pop(key, {}),
217 default={"errors": "raise"},
(...)
220 logging_main_key=key,
221 )
222 xarray_open_dataset_kwargs.update(
223 mix_kwargs(
224 user=user_xarray_open_dataset_kwargs,
(...)
230 )
231 )
--> 233 result = xr.open_dataset(
234 FieldsetAdapter(self, ignore_keys=ignore_keys),
235 **xarray_open_dataset_kwargs,
236 )
238 def math_prod(lst):
239 if not hasattr(math, "prod"):
240 # python 3.7 does not have math.prod
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/xarray/backends/api.py:495, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, backend_kwargs, *args, **kwargs)
483 decoders = _resolve_decoders_kwargs(
484 decode_cf,
485 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...)
491 decode_coords=decode_coords,
492 )
494 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 495 backend_ds = backend.open_dataset(
496 filename_or_obj,
497 drop_variables=drop_variables,
498 **decoders,
499 **kwargs,
500 )
501 ds = _dataset_from_backend_dataset(
502 backend_ds,
503 filename_or_obj,
(...)
510 **kwargs,
511 )
512 return ds
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/xarray_plugin.py:100, in CfGribBackend.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, lock, indexpath, filter_by_keys, read_keys, encode_cf, squeeze, time_dims, errors, extra_coords)
78 def open_dataset(
79 self,
80 filename_or_obj: T.Union[str, abc.MappingFieldset[T.Any, abc.Field]],
(...)
97 extra_coords: T.Dict[str, str] = {},
98 ) -> xr.Dataset:
--> 100 store = CfGribDataStore(
101 filename_or_obj,
102 indexpath=indexpath,
103 filter_by_keys=filter_by_keys,
104 read_keys=read_keys,
105 encode_cf=encode_cf,
106 squeeze=squeeze,
107 time_dims=time_dims,
108 lock=lock,
109 errors=errors,
110 extra_coords=extra_coords,
111 )
112 with xr.core.utils.close_on_error(store):
113 vars, attrs = store.load() # type: ignore
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/xarray_plugin.py:40, in CfGribDataStore.__init__(self, filename, lock, **backend_kwargs)
38 else:
39 opener = dataset.open_fieldset
---> 40 self.ds = opener(filename, **backend_kwargs)
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:730, in open_fieldset(fieldset, indexpath, filter_by_keys, read_keys, time_dims, extra_coords, computed_keys, log, **kwargs)
728 index = messages.FieldsetIndex.from_fieldset(fieldset, index_keys, computed_keys)
729 filtered_index = index.subindex(filter_by_keys)
--> 730 return open_from_index(filtered_index, read_keys, time_dims, extra_coords, **kwargs)
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:706, in open_from_index(index, read_keys, time_dims, extra_coords, **kwargs)
699 def open_from_index(
700 index: abc.Index[T.Any, abc.Field],
701 read_keys: T.Sequence[str] = (),
(...)
704 **kwargs: T.Any,
705 ) -> Dataset:
--> 706 dimensions, variables, attributes, encoding = build_dataset_components(
707 index, read_keys=read_keys, time_dims=time_dims, extra_coords=extra_coords, **kwargs
708 )
709 return Dataset(dimensions, variables, attributes, encoding)
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:660, in build_dataset_components(index, errors, encode_cf, squeeze, log, read_keys, time_dims, extra_coords)
658 short_name = var_name
659 try:
--> 660 dict_merge(variables, coord_vars)
661 dict_merge(variables, {short_name: data_var})
662 dict_merge(dimensions, dims)
File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:591, in dict_merge(master, update)
589 pass
590 else:
--> 591 raise DatasetBuildError(
592 "key present and new value is different: "
593 "key=%r value=%r new_value=%r" % (key, master[key], value)
594 )
DatasetBuildError: key present and new value is different: key='step' value=Variable(dimensions=('step',), data=array([ 0., 12., 24., 36., 48., 60., 72., 84., 96., 108., 120.,
132., 144., 156., 168., 180., 192., 204., 216., 228., 240.])) new_value=Variable(dimensions=('step',), data=array([ 0., 6., 12., 18., 24., 30., 36., 42., 48., 54., 60.,
66., 72., 78., 84., 90., 96., 102., 108., 114., 120., 126.,
132., 138., 144., 150., 156., 162., 168., 174., 180., 186., 192.,
198., 204., 210., 216., 222., 228., 234., 240.]))
```
With `xr.merge`, it is possible to merge these data together, and the missing steps are filled with NaN. That is actually my current workaround for this problem, manually merging the different forecast data, but I wonder if I could suggest that `to_xarray` do it automatically?
Thank you in advance,
Jonathan
What happens is that cfgrib sees only one file with inconsistent values for the dimension "step".
The error is raised by cfgrib, so one option would be to change cfgrib to create NaN values when data is missing. That would be an issue for cfgrib, though, and it is not likely to be implemented, as it would open a path to silently ignoring missing data by creating NaNs. There must be a more robust way to achieve this.
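For what it's worth, one candidate for such a robust path, assuming the GRIB data is available as a file on disk (the path below is a placeholder), would be to let cfgrib split the heterogeneous file into internally consistent datasets with `cfgrib.open_datasets` and merge them explicitly, so filling with NaN becomes a deliberate user decision:

```python
import cfgrib
import xarray as xr

# "reforecasts.grib" is a placeholder path. cfgrib.open_datasets reads a
# heterogeneous GRIB file as a list of internally consistent datasets
# instead of raising DatasetBuildError on the first inconsistency.
parts = cfgrib.open_datasets("reforecasts.grib")

# xr.merge outer-joins the differing "step" coordinates and fills the
# holes with NaN; doing it explicitly keeps the data loss visible.
merged = xr.merge(parts)
```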
Do you have a pointer/snippet of your xarray workaround?
Thinking further about this, it would need some coding, but CliMetLab could help there. What about the following API?

To add NaNs to make sure all variables have the requested shape:

`ds.to_xarray(ensure_dims=dict(step=[0., ..., 240.]), fill_with_nan=True)`

To add NaNs to make sure all variables have the same shape:

`ds.to_xarray(fill_with_nan=True)`

To check sizes and crash if they do not match:

`ds.to_xarray(ensure_dims=dict(step=[0., ..., 240.]))`
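A rough sketch of what such options could do under the hood, using `xarray.Dataset.reindex` (the `ensure_dims` helper and its signature are hypothetical, mirroring the proposal above, and would be applied to the dataset returned by `to_xarray`):

```python
import numpy as np
import xarray as xr

def ensure_dims(ds: xr.Dataset, fill_with_nan: bool = True, **dims) -> xr.Dataset:
    """Reindex ds so each named dimension has exactly the requested
    coordinate values; fill the gaps with NaN, or raise if not allowed."""
    for dim, values in dims.items():
        missing = set(values) - set(np.atleast_1d(ds[dim].values))
        if missing and not fill_with_nan:
            raise ValueError(f"dimension {dim!r} is missing values {sorted(missing)}")
        # reindex inserts NaN at positions absent from the original coordinate
        ds = ds.reindex({dim: list(values)})
    return ds

# e.g. ensure_dims(fcs, step=np.arange(0.0, 241.0, 6.0))
```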
Hi @floriankrb,

Sorry for the late answer to this. Here is a code snippet using the EUMETNET benchmark plugin; look at the special case in the `if` branch below:
```python
import climetlab as cml
import xarray as xr

for level in levels:
    if level == 700:
        # Loading all parameters at once fails at this level (inconsistent
        # "step" values), so load the wind components and the specific
        # humidity separately, then merge: xr.merge fills the missing
        # steps with NaN.
        ds = cml.load_dataset(
            "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
            date=dates[0],
            parameter=['u', 'v'],
            level=level,
        )
        fcs_wind = ds.to_xarray()
        obs_wind = ds.get_observations_as_xarray()

        ds = cml.load_dataset(
            "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
            date=dates[0],
            parameter='q',
            level=level,
        )
        fcs_q = ds.to_xarray()
        obs_q = ds.get_observations_as_xarray()

        fcs = xr.merge([fcs_wind, fcs_q])
        obs = xr.merge([obs_wind, obs_q])
    else:
        # Loading all parameters at once works fine for the other levels.
        ds = cml.load_dataset(
            "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
            date=dates[0],
            parameter="all",
            level=level,
        )
        fcs = ds.to_xarray()
        obs = ds.get_observations_as_xarray()
    ...
```
So you see that I simply merge, and `xr.merge` does not complain: it fills the "holes" with NaNs. I think your proposed API would be very good indeed, and worth testing!
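For reference, the NaN filling relied on here is just `xr.merge`'s default outer join. A minimal, self-contained illustration with toy data mimicking the 12-hourly versus 6-hourly steps from the error above:

```python
import numpy as np
import xarray as xr

# Toy variables on different "step" grids, like u/v vs q in the thread.
coarse = xr.Dataset({"u": ("step", np.array([1.0, 2.0, 3.0]))},
                    coords={"step": [0.0, 12.0, 24.0]})
fine = xr.Dataset({"q": ("step", np.arange(5.0))},
                  coords={"step": [0.0, 6.0, 12.0, 18.0, 24.0]})

merged = xr.merge([coarse, fine])  # outer join on "step" by default
print(merged["u"].values)          # [ 1. nan  2. nan  3.] -- holes become NaN
```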