
Merge strategy/options

Open · jodemaey opened this issue 2 years ago · 3 comments

Hi,

Can I suggest adding a merge strategy/options (similar to what is possible with xr.merge) to to_xarray? I am asking because I got the following error while calling this routine on forecast data in which some fields are missing some steps:

---------------------------------------------------------------------------
DatasetBuildError                         Traceback (most recent call last)
Input In [12], in <cell line: 1>()
      3 ds = cml.load_dataset(
      4     "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
      5     date=dates[0],
      6     parameter="all",
      7     level=level
      8 )
     10 # download of the ensemble forecast
---> 11 fcs = ds.to_xarray()
     12 new_fcs = fcs.isel(step=time_index, longitude=lon_index, latitude=lat_index)
     14 # creating a new coordinate for the hdate

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/climetlab_eumetnet_postprocessing_benchmark/gridded/training_data_forecasts.py:61, in TrainingDataForecast.to_xarray(self, **kwargs)
     60 def to_xarray(self, **kwargs):
---> 61     return self.source.to_xarray(xarray_open_dataset_kwargs={"backend_kwargs": {"ignore_keys": ["dataType"]}}, **kwargs)

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/climetlab/readers/grib/fieldset.py:233, in FieldSet.to_xarray(self, **kwargs)
    215     xarray_open_dataset_kwargs[key] = mix_kwargs(
    216         user=user_xarray_open_dataset_kwargs.pop(key, {}),
    217         default={"errors": "raise"},
   (...)
    220         logging_main_key=key,
    221     )
    222 xarray_open_dataset_kwargs.update(
    223     mix_kwargs(
    224         user=user_xarray_open_dataset_kwargs,
   (...)
    230     )
    231 )
--> 233 result = xr.open_dataset(
    234     FieldsetAdapter(self, ignore_keys=ignore_keys),
    235     **xarray_open_dataset_kwargs,
    236 )
    238 def math_prod(lst):
    239     if not hasattr(math, "prod"):
    240         # python 3.7 does not have math.prod

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/xarray/backends/api.py:495, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, backend_kwargs, *args, **kwargs)
    483 decoders = _resolve_decoders_kwargs(
    484     decode_cf,
    485     open_backend_dataset_parameters=backend.open_dataset_parameters,
   (...)
    491     decode_coords=decode_coords,
    492 )
    494 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 495 backend_ds = backend.open_dataset(
    496     filename_or_obj,
    497     drop_variables=drop_variables,
    498     **decoders,
    499     **kwargs,
    500 )
    501 ds = _dataset_from_backend_dataset(
    502     backend_ds,
    503     filename_or_obj,
   (...)
    510     **kwargs,
    511 )
    512 return ds

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/xarray_plugin.py:100, in CfGribBackend.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, lock, indexpath, filter_by_keys, read_keys, encode_cf, squeeze, time_dims, errors, extra_coords)
     78 def open_dataset(
     79     self,
     80     filename_or_obj: T.Union[str, abc.MappingFieldset[T.Any, abc.Field]],
   (...)
     97     extra_coords: T.Dict[str, str] = {},
     98 ) -> xr.Dataset:
--> 100     store = CfGribDataStore(
    101         filename_or_obj,
    102         indexpath=indexpath,
    103         filter_by_keys=filter_by_keys,
    104         read_keys=read_keys,
    105         encode_cf=encode_cf,
    106         squeeze=squeeze,
    107         time_dims=time_dims,
    108         lock=lock,
    109         errors=errors,
    110         extra_coords=extra_coords,
    111     )
    112     with xr.core.utils.close_on_error(store):
    113         vars, attrs = store.load()  # type: ignore

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/xarray_plugin.py:40, in CfGribDataStore.__init__(self, filename, lock, **backend_kwargs)
     38 else:
     39     opener = dataset.open_fieldset
---> 40 self.ds = opener(filename, **backend_kwargs)

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:730, in open_fieldset(fieldset, indexpath, filter_by_keys, read_keys, time_dims, extra_coords, computed_keys, log, **kwargs)
    728 index = messages.FieldsetIndex.from_fieldset(fieldset, index_keys, computed_keys)
    729 filtered_index = index.subindex(filter_by_keys)
--> 730 return open_from_index(filtered_index, read_keys, time_dims, extra_coords, **kwargs)

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:706, in open_from_index(index, read_keys, time_dims, extra_coords, **kwargs)
    699 def open_from_index(
    700     index: abc.Index[T.Any, abc.Field],
    701     read_keys: T.Sequence[str] = (),
   (...)
    704     **kwargs: T.Any,
    705 ) -> Dataset:
--> 706     dimensions, variables, attributes, encoding = build_dataset_components(
    707         index, read_keys=read_keys, time_dims=time_dims, extra_coords=extra_coords, **kwargs
    708     )
    709     return Dataset(dimensions, variables, attributes, encoding)

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:660, in build_dataset_components(index, errors, encode_cf, squeeze, log, read_keys, time_dims, extra_coords)
    658     short_name = var_name
    659 try:
--> 660     dict_merge(variables, coord_vars)
    661     dict_merge(variables, {short_name: data_var})
    662     dict_merge(dimensions, dims)

File ~/fileserver/home/anaconda3/envs/climetlab/lib/python3.9/site-packages/cfgrib/dataset.py:591, in dict_merge(master, update)
    589     pass
    590 else:
--> 591     raise DatasetBuildError(
    592         "key present and new value is different: "
    593         "key=%r value=%r new_value=%r" % (key, master[key], value)
    594     )

DatasetBuildError: key present and new value is different: key='step' value=Variable(dimensions=('step',), data=array([  0.,  12.,  24.,  36.,  48.,  60.,  72.,  84.,  96., 108., 120.,
       132., 144., 156., 168., 180., 192., 204., 216., 228., 240.])) new_value=Variable(dimensions=('step',), data=array([  0.,   6.,  12.,  18.,  24.,  30.,  36.,  42.,  48.,  54.,  60.,
        66.,  72.,  78.,  84.,  90.,  96., 102., 108., 114., 120., 126.,
       132., 138., 144., 150., 156., 162., 168., 174., 180., 186., 192.,
       198., 204., 210., 216., 222., 228., 234., 240.]))

With xr.merge, it is possible to merge these data together, and the missing steps are filled with NaN. This is actually my current workaround: I manually merge the different forecast data. But I wonder whether to_xarray could be made to do it automatically?
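For illustration, here is a minimal standalone example of that xr.merge behaviour (toy variable names and values, not the actual benchmark data):

import numpy as np
import xarray as xr

# Two toy variables archived with different step sets (12-hourly vs 6-hourly).
coarse = xr.Dataset(
    {"t": ("step", np.array([280.0, 281.0, 282.0]))},
    coords={"step": [0, 12, 24]},
)
fine = xr.Dataset(
    {"u": ("step", np.array([1.0, 2.0, 3.0, 4.0, 5.0]))},
    coords={"step": [0, 6, 12, 18, 24]},
)

# xr.merge takes the union of the "step" coordinate and fills the
# steps that "t" does not cover (6 and 18) with NaN.
merged = xr.merge([coarse, fine])
print(merged["t"].values)  # [280.  nan 281.  nan 282.]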

Thank you in advance,

Jonathan

jodemaey commented Mar 28 '22 09:03

What happens is that cfgrib sees only one file with inconsistent values for the dimension "step".

The error is raised by cfgrib. Making cfgrib create NaN values when data is missing would be an issue for the cfgrib project, but it is not likely to be implemented there, as it would open a path to silently ignoring missing data and creating NaNs. There must be a more robust way to achieve this.
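(One such pattern, sketched here with a placeholder filename and variable names: open each variable separately with cfgrib's filter_by_keys, so that the conflicting step sets never meet inside a single cfgrib index, and then merge explicitly.)

import xarray as xr

# "reforecasts.grib" is a placeholder for the GRIB file on disk; "u",
# "v" and "q" stand for the variables whose step sets disagree.
parts = [
    xr.open_dataset(
        "reforecasts.grib",
        engine="cfgrib",
        backend_kwargs={"filter_by_keys": {"shortName": name}},
    )
    for name in ("u", "v", "q")
]
# Merging explicitly makes the NaN filling a deliberate choice rather
# than something cfgrib would do silently.
merged = xr.merge(parts)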

Do you have a pointer/snippet of your xarray workaround?

floriankrb commented Mar 30 '22 07:03

Thinking further about this, it would need some coding, but CliMetLab could help there. What about the following API?

To add NaN to make sure all variables have the requested shape:

ds.to_xarray(ensure_dims=dict(step=[0., ..., 240.]), fill_with_nan=True)

To add NaN to make sure all variables have the same shape:

ds.to_xarray(fill_with_nan=True)

To check the sizes and crash if they are not ok:

ds.to_xarray(ensure_dims=dict(step=[0., ..., 240.]))
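For what it is worth, here is a minimal sketch of what these semantics could do under the hood on the xarray side (ensure_dims is a hypothetical standalone helper here, not an existing CliMetLab function):

import numpy as np
import xarray as xr

def ensure_dims(ds, dims, fill_with_nan=False):
    # Hypothetical helper illustrating the proposed API.
    for name, values in dims.items():
        if fill_with_nan:
            # reindex inserts NaN wherever a requested coordinate
            # value is missing from the dataset.
            ds = ds.reindex({name: list(values)})
        elif not np.array_equal(ds[name].values, np.asarray(values)):
            # Check only: crash if the dimension is not as requested.
            raise ValueError(f"dimension {name!r} does not match the requested values")
    return ds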

floriankrb commented Mar 31 '22 13:03

Hi @floriankrb,

Sorry for the late answer. Here is a code snippet using the EUMETNET benchmark plugin; look at the special case handled in the if branch below:

import climetlab as cml
import xarray as xr

# `levels` and `dates` are defined earlier in the notebook.
for level in levels:

    if level == 700:
        # Special case: loading all parameters at this level triggers the
        # step mismatch, so load the groups separately and merge them.
        ds = cml.load_dataset(
            "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
            date=dates[0],
            parameter=['u', 'v'],
            level=level
        )

        fcs_wind = ds.to_xarray()
        obs_wind = ds.get_observations_as_xarray()

        ds = cml.load_dataset(
            "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
            date=dates[0],
            parameter='q',
            level=level
        )

        fcs_q = ds.to_xarray()
        obs_q = ds.get_observations_as_xarray()

        # xr.merge aligns the differing "step" coordinates and fills the
        # missing steps with NaN instead of raising DatasetBuildError.
        fcs = xr.merge([fcs_wind, fcs_q])
        obs = xr.merge([obs_wind, obs_q])

    else:
        ds = cml.load_dataset(
            "eumetnet-postprocessing-benchmark-training-data-gridded-reforecasts-pressure",
            date=dates[0],
            parameter="all",
            level=level
        )

        fcs = ds.to_xarray()
        obs = ds.get_observations_as_xarray()
...

So you see that I simply merge with xr.merge and it does not complain; it just fills the "holes" with NaNs. I think your proposed API would be very good indeed, so it should be tested!

jodemaey commented Aug 03 '22 10:08