xarray
xarray copied to clipboard
Saving a DataArray of datetime objects as zarr is not a lazy operation despite compute=False
What happened?
Trying to save a lazy xr.DataArray of datetime objects as zarr forces a dask.compute operation and pulls the data into the local notebook. This is generally not a problem for indexes of datetime objects, as those are already stored locally and are generally small in size.
However, if the whole underlying array consists of datetime objects, this can be a serious problem. In my case it simply crashed the scheduler when it attempted to retrieve the data persisted on the workers.
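I managed to isolate the problem to the call stack shown below. The issue is in the encode_cf_datetime function, which calls np.asarray(dates) on the lazy array and thereby forces it to be computed and loaded into local memory.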
What did you expect to happen?
I expected the data to be written to zarr directly by the dask workers, bypassing the scheduler/Client, if compute=True, and the operation to be completely lazy if compute=False.
Minimal Complete Verifiable Example
import numpy as np
import xarray as xr
import dask.array as da
test = xr.DataArray(
data = da.full((20000, 20000), np.datetime64('2005-02-25T03:30', 'ns')),
coords = {'x': range(20000), 'y': range(20000)}
).to_dataset(name='test')
print(test.test.dtype)
# dtype('<M8[ns]')
test.to_zarr('test.zarr', compute=False)
# this will take a while and trigger the computation of the array. No data will be actually saved though
MVCE confirmation
- [X] Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
- [x] Complete example — the example is self-contained, including all data and the text of any traceback.
- [x] Verifiable example — the example copy & pastes into an IPython prompt or Binder notebook, returning the result.
- [X] New issue — a search of GitHub Issues suggests this is not a duplicate.
Relevant log output
File /env/lib/python3.8/site-packages/xarray/core/dataset.py:2036, in Dataset.to_zarr(self, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region, safe_chunks, storage_options)
2033 if encoding is None:
2034 encoding = {}
-> 2036 return to_zarr(
2037 self,
2038 store=store,
2039 chunk_store=chunk_store,
2040 storage_options=storage_options,
2041 mode=mode,
2042 synchronizer=synchronizer,
2043 group=group,
2044 encoding=encoding,
2045 compute=compute,
2046 consolidated=consolidated,
2047 append_dim=append_dim,
2048 region=region,
2049 safe_chunks=safe_chunks,
2050 )
File /env/lib/python3.8/site-packages/xarray/backends/api.py:1431, in to_zarr(dataset, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region, safe_chunks, storage_options)
1429 writer = ArrayWriter()
1430 # TODO: figure out how to properly handle unlimited_dims
-> 1431 dump_to_store(dataset, zstore, writer, encoding=encoding)
1432 writes = writer.sync(compute=compute)
1434 if compute:
File /env/lib/python3.8/site-packages/xarray/backends/api.py:1119, in dump_to_store(dataset, store, writer, encoder, encoding, unlimited_dims)
1116 if encoder:
1117 variables, attrs = encoder(variables, attrs)
-> 1119 store.store(variables, attrs, check_encoding, writer, unlimited_dims=unlimited_dims)
File /env/lib/python3.8/site-packages/xarray/backends/zarr.py:500, in ZarrStore.store(self, variables, attributes, check_encoding_set, writer, unlimited_dims)
498 new_variables = set(variables) - existing_variable_names
499 variables_without_encoding = {vn: variables[vn] for vn in new_variables}
--> 500 variables_encoded, attributes = self.encode(
501 variables_without_encoding, attributes
502 )
504 if existing_variable_names:
505 # Decode variables directly, without going via xarray.Dataset to
506 # avoid needing to load index variables into memory.
507 # TODO: consider making loading indexes lazy again?
508 existing_vars, _, _ = conventions.decode_cf_variables(
509 self.get_variables(), self.get_attrs()
510 )
File /env/lib/python3.8/site-packages/xarray/backends/common.py:200, in AbstractWritableDataStore.encode(self, variables, attributes)
183 def encode(self, variables, attributes):
184 """
185 Encode the variables and attributes in this store
186
(...)
198
199 """
--> 200 variables = {k: self.encode_variable(v) for k, v in variables.items()}
201 attributes = {k: self.encode_attribute(v) for k, v in attributes.items()}
202 return variables, attributes
File /env/lib/python3.8/site-packages/xarray/backends/common.py:200, in <dictcomp>(.0)
183 def encode(self, variables, attributes):
184 """
185 Encode the variables and attributes in this store
186
(...)
198
199 """
--> 200 variables = {k: self.encode_variable(v) for k, v in variables.items()}
201 attributes = {k: self.encode_attribute(v) for k, v in attributes.items()}
202 return variables, attributes
File /env/lib/python3.8/site-packages/xarray/backends/zarr.py:459, in ZarrStore.encode_variable(self, variable)
458 def encode_variable(self, variable):
--> 459 variable = encode_zarr_variable(variable)
460 return variable
File /env/lib/python3.8/site-packages/xarray/backends/zarr.py:258, in encode_zarr_variable(var, needs_copy, name)
237 def encode_zarr_variable(var, needs_copy=True, name=None):
238 """
239 Converts an Variable into an Variable which follows some
240 of the CF conventions:
(...)
255 A variable which has been encoded as described above.
256 """
--> 258 var = conventions.encode_cf_variable(var, name=name)
260 # zarr allows unicode, but not variable-length strings, so it's both
261 # simpler and more compact to always encode as UTF-8 explicitly.
262 # TODO: allow toggling this explicitly via dtype in encoding.
263 coder = coding.strings.EncodedStringCoder(allows_unicode=True)
File /env/lib/python3.8/site-packages/xarray/conventions.py:273, in encode_cf_variable(var, needs_copy, name)
264 ensure_not_multiindex(var, name=name)
266 for coder in [
267 times.CFDatetimeCoder(),
268 times.CFTimedeltaCoder(),
(...)
271 variables.UnsignedIntegerCoder(),
272 ]:
--> 273 var = coder.encode(var, name=name)
275 # TODO(shoyer): convert all of these to use coders, too:
276 var = maybe_encode_nonstring_dtype(var, name=name)
File /env/lib/python3.8/site-packages/xarray/coding/times.py:659, in CFDatetimeCoder.encode(self, variable, name)
655 dims, data, attrs, encoding = unpack_for_encoding(variable)
656 if np.issubdtype(data.dtype, np.datetime64) or contains_cftime_datetimes(
657 variable
658 ):
--> 659 (data, units, calendar) = encode_cf_datetime(
660 data, encoding.pop("units", None), encoding.pop("calendar", None)
661 )
662 safe_setitem(attrs, "units", units, name=name)
663 safe_setitem(attrs, "calendar", calendar, name=name)
File /env/lib/python3.8/site-packages/xarray/coding/times.py:592, in encode_cf_datetime(dates, units, calendar)
582 def encode_cf_datetime(dates, units=None, calendar=None):
583 """Given an array of datetime objects, returns the tuple `(num, units,
584 calendar)` suitable for a CF compliant time variable.
585
(...)
590 cftime.date2num
591 """
--> 592 dates = np.asarray(dates)
594 if units is None:
595 units = infer_datetime_units(dates)
Anything else we need to know?
Our system uses dask_gateway on AWS infrastructure (S3 for storage).
Environment
xarray: 2022.3.0 pandas: 1.5.0 numpy: 1.22.4 scipy: 1.9.1 netCDF4: 1.6.1 pydap: installed h5netcdf: 1.0.2 h5py: 3.7.0 Nio: None zarr: 2.13.2 cftime: 1.6.2 nc_time_axis: None PseudoNetCDF: None rasterio: 1.3.2 cfgrib: None iris: None bottleneck: 1.3.5 dask: 2022.9.2 distributed: 2022.9.2 matplotlib: 3.6.0 cartopy: 0.20.2 seaborn: 0.12.0 numbagg: None fsspec: 2022.8.2 cupy: None pint: None sparse: 0.13.0 setuptools: 65.4.1 pip: 22.2.2 conda: None pytest: 7.1.3 IPython: 8.5.0 sphinx: None