intake-esm icon indicating copy to clipboard operation
intake-esm copied to clipboard

Error using `to_dataset_dict()` with opendap url as path

Open tlogan2000 opened this issue 1 year ago • 3 comments

I am getting errors when loading data via `to_dataset_dict()` when the path field contains an OPeNDAP URL. Installed intake_esm version: 2021.8.17

This works fine:

import xarray as xr
import intake
cat = intake.open_esm_datastore("https://pavics.ouranos.ca/catalog/climex.json")  # TEST_USE_PROD_DATA
print(cat.df.head())

url = cat.df.path[0]
ds = xr.open_dataset(url, chunks=dict(realization=1, time=365))

However, trying to use `to_dataset_dict()` results in an error:

dsdict = cat.to_dataset_dict(cdf_kwargs=dict(chunks=dict(realization=1, time=365)))

traceback:

---------------------------------------------------------------------------
ClientResponseError                       Traceback (most recent call last)
File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:383, in HTTPFileSystem._info(self, url, **kwargs)
    381 try:
    382     info.update(
--> 383         await _file_info(
    384             url,
    385             size_policy=policy,
    386             session=session,
    387             **self.kwargs,
    388             **kwargs,
    389         )
    390     )
    391     if info.get("size") is not None:

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:734, in _file_info(url, session, size_policy, **kwargs)
    733 async with r:
--> 734     r.raise_for_status()
    736     # TODO:
    737     #  recognise lack of 'Accept-Ranges',
    738     #                 or 'Accept-Ranges': 'none' (not 'bytes')
    739     #  to mean streaming only, no random access => return None

File /opt/conda/envs/birdy/lib/python3.8/site-packages/aiohttp/client_reqrep.py:1004, in ClientResponse.raise_for_status(self)
   1003 self.release()
-> 1004 raise ClientResponseError(
   1005     self.request_info,
   1006     self.history,
   1007     status=self.status,
   1008     message=self.reason,
   1009     headers=self.headers,
   1010 )

ClientResponseError: 400, message='Bad Request', url=URL('https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/datasets/simulations/climex/day_climex-crcm5_historical+rcp85.ncml')

The above exception was the direct cause of the following exception:

FileNotFoundError                         Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 dsdict = cat.to_dataset_dict(cdf_kwargs=dict(chunks=dict(realization=1, time=365)))

File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/core.py:922, in esm_datastore.to_dataset_dict(self, zarr_kwargs, cdf_kwargs, preprocess, storage_options, progressbar, aggregate)
    918 future_tasks = [
    919     executor.submit(_load_source, key, source) for key, source in sources.items()
    920 ]
    921 for i, task in enumerate(concurrent.futures.as_completed(future_tasks)):
--> 922     key, ds = task.result()
    923     self._datasets[key] = ds
    924     if self.progressbar:

File /opt/conda/envs/birdy/lib/python3.8/concurrent/futures/_base.py:437, in Future.result(self, timeout)
    435     raise CancelledError()
    436 elif self._state == FINISHED:
--> 437     return self.__get_result()
    439 self._condition.wait(timeout)
    441 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File /opt/conda/envs/birdy/lib/python3.8/concurrent/futures/_base.py:389, in Future.__get_result(self)
    387 if self._exception:
    388     try:
--> 389         raise self._exception
    390     finally:
    391         # Break a reference cycle with the exception in self._exception
    392         self = None

File /opt/conda/envs/birdy/lib/python3.8/concurrent/futures/thread.py:57, in _WorkItem.run(self)
     54     return
     56 try:
---> 57     result = self.fn(*self.args, **self.kwargs)
     58 except BaseException as exc:
     59     self.future.set_exception(exc)

File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/core.py:908, in esm_datastore.to_dataset_dict.<locals>._load_source(key, source)
    907 def _load_source(key, source):
--> 908     return key, source.to_dask()

File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/source.py:89, in ESMDataSource.to_dask(self)
     87 def to_dask(self):
     88     """Return xarray object (which will have chunks)"""
---> 89     self._load_metadata()
     90     return self._ds

File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake/source/base.py:236, in DataSourceBase._load_metadata(self)
    234 """load metadata only if needed"""
    235 if self._schema is None:
--> 236     self._schema = self._get_schema()
    237     self.dtype = self._schema.dtype
    238     self.shape = self._schema.shape

File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/source.py:57, in ESMDataSource._get_schema(self)
     54 def _get_schema(self):
     56     if self._ds is None:
---> 57         self._open_dataset()
     59         metadata = {
     60             'dims': dict(self._ds.dims),
     61             'data_vars': {k: list(self._ds[k].coords) for k in self._ds.data_vars.keys()},
     62             'coords': tuple(self._ds.coords.keys()),
     63         }
     64         self._schema = Schema(
     65             datashape=None,
     66             dtype=None,
   (...)
     69             extra_metadata=metadata,
     70         )

File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/source.py:75, in ESMDataSource._open_dataset(self)
     73 def _open_dataset(self):
     74     mapper = _path_to_mapper(self.row[self.path_column], self.storage_options, self.data_format)
---> 75     ds = _open_asset(
     76         mapper,
     77         data_format=self.data_format,
     78         zarr_kwargs=self.zarr_kwargs,
     79         cdf_kwargs=self.cdf_kwargs,
     80         preprocess=self.preprocess,
     81         requested_variables=self.requested_variables,
     82     )
     83     ds.attrs['intake_esm_dataset_key'] = self.key
     84     self._ds = ds

File /opt/conda/envs/birdy/lib/python3.8/site-packages/intake_esm/merge_util.py:266, in _open_asset(path, data_format, zarr_kwargs, cdf_kwargs, preprocess, varname, requested_variables)
    264     protocol = normalize_protocol(path.fs.protocol)
    265     root = path.path
--> 266     path = path.open()
    268 if data_format == 'zarr':
    269     try:

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/core.py:140, in OpenFile.open(self)
    132 def open(self):
    133     """Materialise this as a real open file without context
    134 
    135     The file should be explicitly closed to avoid enclosed file
   (...)
    138     been deleted; but a with-context is better style.
    139     """
--> 140     out = self.__enter__()
    141     closer = out.close
    142     fobjects = self.fobjects.copy()[:-1]

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/core.py:103, in OpenFile.__enter__(self)
    100 def __enter__(self):
    101     mode = self.mode.replace("t", "").replace("b", "") + "b"
--> 103     f = self.fs.open(self.path, mode=mode)
    105     self.fobjects = [f]
    107     if self.compression is not None:

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/spec.py:1009, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
   1007 else:
   1008     ac = kwargs.pop("autocommit", not self._intrans)
-> 1009     f = self._open(
   1010         path,
   1011         mode=mode,
   1012         block_size=block_size,
   1013         autocommit=ac,
   1014         cache_options=cache_options,
   1015         **kwargs,
   1016     )
   1017     if compression is not None:
   1018         from fsspec.compression import compr

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:343, in HTTPFileSystem._open(self, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)
    341 kw["asynchronous"] = self.asynchronous
    342 kw.update(kwargs)
--> 343 size = size or self.info(path, **kwargs)["size"]
    344 session = sync(self.loop, self.set_session)
    345 if block_size and size:

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/asyn.py:85, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
     82 @functools.wraps(func)
     83 def wrapper(*args, **kwargs):
     84     self = obj or args[0]
---> 85     return sync(self.loop, func, *args, **kwargs)

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/asyn.py:65, in sync(loop, func, timeout, *args, **kwargs)
     63     raise FSTimeoutError from return_result
     64 elif isinstance(return_result, BaseException):
---> 65     raise return_result
     66 else:
     67     return return_result

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/asyn.py:25, in _runner(event, coro, result, timeout)
     23     coro = asyncio.wait_for(coro, timeout=timeout)
     24 try:
---> 25     result[0] = await coro
     26 except Exception as ex:
     27     result[0] = ex

File /opt/conda/envs/birdy/lib/python3.8/site-packages/fsspec/implementations/http.py:396, in HTTPFileSystem._info(self, url, **kwargs)
    393     except Exception as exc:
    394         if policy == "get":
    395             # If get failed, then raise a FileNotFoundError
--> 396             raise FileNotFoundError(url) from exc
    397         logger.debug(str(exc))
    399 return {"name": url, "size": None, **info, "type": "file"}

FileNotFoundError: https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/datasets/simulations/climex/day_climex-crcm5_historical+rcp85.ncml

tlogan2000 avatar Sep 09 '22 13:09 tlogan2000

Note: I have also tried forcing `engine='pydap'` via `dsdict = cat.to_dataset_dict(cdf_kwargs=dict(chunks=dict(realization=1, time=365), engine='pydap'))`, but I end up with the same error.

tlogan2000 avatar Sep 09 '22 13:09 tlogan2000

@tlogan2000, I tried reading in the catalog and received the following error:

ValidationError                           Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 catalog = intake.open_esm_datastore("https://pavics.ouranos.ca/catalog/climex.json")

File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/intake_esm/core.py:94, in esm_datastore.__init__(self, obj, progressbar, sep, registry, read_csv_kwargs, storage_options, intake_kwargs)
     92     self.esmcat = ESMCatalogModel.from_dict(obj)
     93 else:
---> 94     self.esmcat = ESMCatalogModel.load(
     95         obj, storage_options=self.storage_options, read_csv_kwargs=read_csv_kwargs
     96     )
     98 self.derivedcat = registry or default_registry
     99 self._entries = {}

File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/intake_esm/cat.py:226, in ESMCatalogModel.load(cls, json_file, storage_options, read_csv_kwargs)
    224 if 'last_updated' not in data:
    225     data['last_updated'] = None
--> 226 cat = cls.parse_obj(data)
    227 if cat.catalog_file:
    228     if _mapper.fs.exists(cat.catalog_file):

File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/pydantic/main.py:521, in pydantic.main.BaseModel.parse_obj()

File ~/miniforge3/envs/pyart-docs/lib/python3.10/site-packages/pydantic/main.py:341, in pydantic.main.BaseModel.__init__()

ValidationError: 1 validation error for ESMCatalogModel
aggregation_control
  field required (type=value_error.missing)

Did you set the aggregation control in the catalog?

mgrover1 avatar Sep 14 '22 18:09 mgrover1

No, the specs mention it is optional.

huard avatar Sep 19 '22 14:09 huard