
ValueError: conflicting sizes for dimension in xr.open_dataset("reference://"...) vs. no error in xr.open_dataset(direct_file_path) for h5

ksharonin opened this issue on Jan 5, 2024

What is your issue?

Hi all, on a project I am attempting to read a dataset through a JSON reference file (fsspec's "reference://" filesystem) with xarray's zarr engine. Metadata for this file (an ATL03 h5 file) can be found here: https://nsidc.org/sites/default/files/icesat2_atl03_data_dict_v005.pdf

  1. Reading a group whose variables have two dimensions or fewer works without issue, e.g. the group "gt1l/heights" (documented as /gtx/heights in the PDF):
ds = xr.open_dataset("reference://", engine="zarr", backend_kwargs={
    "consolidated": False,
    "storage_options": {"fo": JSON_PATH},
    "group": "gt1l/heights",
})
  2. Reading a group containing a variable with three or more dimensions raises the error below. The group "ancillary_data/calibrations/dead_time_radiometric_signal_loss/gt1l" contains the variable rad_corr, which has three dimensions:
ds = xr.open_dataset("reference://", engine="zarr", backend_kwargs={
    "consolidated": False,
    "storage_options": {"fo": JSON_PATH},
    "group": "ancillary_data/calibrations/dead_time_radiometric_signal_loss/gt1l",
})
{
	"name": "ValueError",
	"message": "conflicting sizes for dimension 'phony_dim_1': length 498 on 'width' and length 160 on {'phony_dim_0': 'dead_time', 'phony_dim_1': 'rad_corr', 'phony_dim_2': 'rad_corr'}",
	"stack": "---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 1
----> 1 ds = xr.open_dataset(\"reference://\", engine=\"zarr\", backend_kwargs={
      2                     \"consolidated\": False,
      3                     \"storage_options\": {\"fo\": JSON_PATH},
      4                     \"group\": group_path
      5                     })

File ~/opt/anaconda3/envs/kerchunkc/lib/python3.8/site-packages/xarray/backends/api.py:539, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)
    527 decoders = _resolve_decoders_kwargs(
    528     decode_cf,
    529     open_backend_dataset_parameters=backend.open_dataset_parameters,
   (...)
    535     decode_coords=decode_coords,
    536 )
    538 overwrite_encoded_chunks = kwargs.pop(\"overwrite_encoded_chunks\", None)
--> 539 backend_ds = backend.open_dataset(
    540     filename_or_obj,
    541     drop_variables=drop_variables,
    542     **decoders,
    543     **kwargs,
    544 )
    545 ds = _dataset_from_backend_dataset(
    546     backend_ds,
    547     filename_or_obj,
   (...)
    555     **kwargs,
    556 )
    557 return ds

File ~/opt/anaconda3/envs/kerchunkc/lib/python3.8/site-packages/xarray/backends/zarr.py:862, in ZarrBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, stacklevel)
    860 store_entrypoint = StoreBackendEntrypoint()
    861 with close_on_error(store):
--> 862     ds = store_entrypoint.open_dataset(
    863         store,
    864         mask_and_scale=mask_and_scale,
    865         decode_times=decode_times,
    866         concat_characters=concat_characters,
    867         decode_coords=decode_coords,
    868         drop_variables=drop_variables,
    869         use_cftime=use_cftime,
    870         decode_timedelta=decode_timedelta,
    871     )
    872 return ds

File ~/opt/anaconda3/envs/kerchunkc/lib/python3.8/site-packages/xarray/backends/store.py:43, in StoreBackendEntrypoint.open_dataset(self, store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)
     29 encoding = store.get_encoding()
     31 vars, attrs, coord_names = conventions.decode_cf_variables(
     32     vars,
     33     attrs,
   (...)
     40     decode_timedelta=decode_timedelta,
     41 )
---> 43 ds = Dataset(vars, attrs=attrs)
     44 ds = ds.set_coords(coord_names.intersection(vars))
     45 ds.set_close(store.close)

File ~/opt/anaconda3/envs/kerchunkc/lib/python3.8/site-packages/xarray/core/dataset.py:604, in Dataset.__init__(self, data_vars, coords, attrs)
    601 if isinstance(coords, Dataset):
    602     coords = coords.variables
--> 604 variables, coord_names, dims, indexes, _ = merge_data_and_coords(
    605     data_vars, coords, compat=\"broadcast_equals\"
    606 )
    608 self._attrs = dict(attrs) if attrs is not None else None
    609 self._close = None

File ~/opt/anaconda3/envs/kerchunkc/lib/python3.8/site-packages/xarray/core/merge.py:575, in merge_data_and_coords(data_vars, coords, compat, join)
    573 objects = [data_vars, coords]
    574 explicit_coords = coords.keys()
--> 575 return merge_core(
    576     objects,
    577     compat,
    578     join,
    579     explicit_coords=explicit_coords,
    580     indexes=Indexes(indexes, coords),
    581 )

File ~/opt/anaconda3/envs/kerchunkc/lib/python3.8/site-packages/xarray/core/merge.py:761, in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value)
    756 prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
    757 variables, out_indexes = merge_collected(
    758     collected, prioritized, compat=compat, combine_attrs=combine_attrs
    759 )
--> 761 dims = calculate_dimensions(variables)
    763 coord_names, noncoord_names = determine_coords(coerced)
    764 if explicit_coords is not None:

File ~/opt/anaconda3/envs/kerchunkc/lib/python3.8/site-packages/xarray/core/variable.py:3208, in calculate_dimensions(variables)
   3206             last_used[dim] = k
   3207         elif dims[dim] != size:
-> 3208             raise ValueError(
   3209                 f\"conflicting sizes for dimension {dim!r}: \"
   3210                 f\"length {size} on {k!r} and length {dims[dim]} on {last_used!r}\"
   3211             )
   3212 return dims

ValueError: conflicting sizes for dimension 'phony_dim_1': length 498 on 'width' and length 160 on {'phony_dim_0': 'dead_time', 'phony_dim_1': 'rad_corr', 'phony_dim_2': 'rad_corr'}"
}

  3. Now, contrast with reading the same group (including the 3+ dimension variable) via the direct h5 file path. This does not produce an error:
ds = xr.open_dataset("/Users/katrinasharonin/Downloads/ATL03_20230816235231_08822014_006_01.h5", group="ancillary_data/calibrations/dead_time_radiometric_signal_loss/gt1l")

The JSON reference file is attached for reference: ATL03_REF_NONUTM.json
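
For completeness, the shapes behind the clash can be listed directly with h5py (a minimal sketch, assuming a local copy of the granule at the path used above):

import h5py

path = "/Users/katrinasharonin/Downloads/ATL03_20230816235231_08822014_006_01.h5"
group = "ancillary_data/calibrations/dead_time_radiometric_signal_loss/gt1l"

# Print each dataset's name and shape; two datasets that share a phony
# dimension slot but differ in size are what the merge trips over.
with h5py.File(path, "r") as f:
    for name, node in f[group].items():
        if isinstance(node, h5py.Dataset):
            print(name, node.shape)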

ksharonin — Jan 5, 2024

I am also seeing this when trying to open a multiscale OME-Zarr dataset: the scale levels share the same dimension names (t, c, z, y, x) but have different sizes at each level.

  File "C:\Users\cameron.arshadi\repos\aind-morphology-utils\src\aind_morphology_utils\movie_maker.py", line 592, in <module>
    main()
  File "C:\Users\cameron.arshadi\repos\aind-morphology-utils\src\aind_morphology_utils\movie_maker.py", line 572, in main
    ds = xarray.open_zarr(config.zarr_path, chunks=None, consolidated=False)
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\backends\zarr.py", line 825, in open_zarr
    ds = open_dataset(
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\backends\api.py", line 541, in open_dataset
    backend_ds = backend.open_dataset(
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\backends\zarr.py", line 903, in open_dataset
    ds = store_entrypoint.open_dataset(
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\backends\store.py", line 47, in open_dataset
    ds = Dataset(vars, attrs=attrs)
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\core\dataset.py", line 613, in __init__
    variables, coord_names, dims, indexes, _ = merge_data_and_coords(
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\core\merge.py", line 575, in merge_data_and_coords
    return merge_core(
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\core\merge.py", line 761, in merge_core
    dims = calculate_dimensions(variables)
  File "C:\Users\cameron.arshadi\AppData\Local\miniconda3\envs\amu\lib\site-packages\xarray\core\variable.py", line 3216, in calculate_dimensions
    raise ValueError(
ValueError: conflicting sizes for dimension 'z': length 14421 on '1' and length 28842 on {'t': '0', 'c': '0', 'z': '0', 'y': '0', 'x': '0'}
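
In case it is useful, one way to sidestep the clash might be to keep only a single pyramid level at open time via drop_variables (an untested sketch; only levels "0" and "1" appear in my traceback, so any further level names here are assumptions):

import xarray as xr

# Keep only the full-resolution level "0"; drop the downsampled levels
# so their shared t/c/z/y/x dimension names cannot conflict.
ds0 = xr.open_zarr(
    config.zarr_path,
    chunks=None,
    consolidated=False,
    drop_variables=["1", "2", "3"],  # hypothetical level names beyond "1"
)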

carshadi — Jan 9, 2024

Not an expert here, but presumably if a file uses the same name for differently sized dimensions, xarray isn't going to be able to open it?

One workaround may be to open the variables separately with drop_variables, rename the conflicting dimensions, and then combine them into a dataset. But I'm not confident; a rough sketch is below.
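
Something like this (untested; the variable names width, dead_time, and rad_corr are taken from the error message above, and the group may contain others):

import xarray as xr

kwargs = dict(
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {"fo": JSON_PATH},
        "group": "ancillary_data/calibrations/dead_time_radiometric_signal_loss/gt1l",
    },
)

# drop_variables is applied before the Dataset is assembled, so each
# partial open avoids the conflicting-sizes check entirely.
ds_rad = xr.open_dataset("reference://", drop_variables=["width", "dead_time"], **kwargs)
ds_rest = xr.open_dataset("reference://", drop_variables=["rad_corr"], **kwargs)

# Rename the clashing phony dimension on one side, then merge.
ds = xr.merge([ds_rad.rename({"phony_dim_1": "phony_dim_1_rad"}), ds_rest])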

Does anyone have other thoughts or should we close?

max-sixty — Feb 26, 2024