netCDF encoding and decoding issues.

Thomas-Z opened this issue 2 months ago · 7 comments

What happened?

Reading or writing netCDF variables that use scale_factor and/or fill_value can raise the following error:

UFuncTypeError: Cannot cast ufunc 'multiply' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'

This problem might be related to the changes in #7654.
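
For reference, the core of the traceback seems reproducible with plain NumPy (a minimal sketch of what the decoding step apparently does in place, based on my reading of the traceback, not the actual xarray code):

import numpy as np

packed = np.array([1, 2, 3], dtype=np.int64)
try:
    # In-place multiply: the float64 result cannot be cast back into the
    # int64 array under the default "same_kind" casting rule.
    packed *= 1e-3
except TypeError as err:  # UFuncTypeError is a subclass of TypeError
    print(err)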

What did you expect to happen?

I expected it to work as it did before xarray 2024.03.0!

Minimal Complete Verifiable Example

# Example 1, decoding problem.

import netCDF4 as nc
import numpy as np
import xarray as xr

with nc.Dataset("test1.nc", mode="w") as ncds:
    ncds.createDimension(dimname="d")
    ncx = ncds.createVariable(
        varname="x",
        datatype=np.int64,
        dimensions=("d",),
        fill_value=-1,
    )

    ncx.scale_factor = 1e-3
    ncx.units = "seconds"

    # netCDF4 applies the scale_factor on write, so 1, 2, 3 are stored on disk.
    ncx[:] = np.array([0.001, 0.002, 0.003])

# This will raise the error
xr.load_dataset("test1.nc")
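
# A possible workaround (a sketch, not a fix): skipping mask-and-scale
# decoding avoids the failing code path and returns the raw packed integers
# with their attributes untouched.
xr.open_dataset("test1.nc", mask_and_scale=False)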


# Example 2, encoding problem.

import netCDF4 as nc
import numpy as np
import xarray as xr

with nc.Dataset("test2.nc", mode="w") as ncds:
    ncds.createDimension(dimname="d")
    ncx = ncds.createVariable(varname="x", datatype=np.int8, dimensions=("d",))

    ncx.scale_factor = 1000

    # With auto-scaling, netCDF4 stores 1, 2, 3 as int8 on disk.
    ncx[:] = np.array([1000, 2000, 3000])

# Reading the file works.
data = xr.load_dataset("test2.nc")

# Writing the data we just read back to disk does not work.
data.to_netcdf("text2x.nc")
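
# A possible workaround for the encoding case (a sketch, untested against
# 2024.3.0): cast the decoded variable to float before writing, so the
# in-place divide by scale_factor does not happen on integer data.
# Note: I have not checked whether the rewritten file keeps the int8 packing.
data["x"] = data["x"].astype("float64")
data.to_netcdf("test2_float.nc")  # hypothetical output name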

MVCE confirmation

  • [X] Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
  • [X] Complete example — the example is self-contained, including all data and the text of any traceback.
  • [X] Verifiable example — the example copy & pastes into an IPython prompt or Binder notebook, returning the result.
  • [X] New issue — a search of GitHub Issues suggests this is not a duplicate.
  • [X] Recent environment — the issue occurs with the latest version of xarray and its dependencies.

Relevant log output

# Example 1 error
---------------------------------------------------------------------------
UFuncTypeError                            Traceback (most recent call last)
Cell In[38], line 1
----> 1 xr.load_dataset("test2.nc")

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/backends/api.py:280, in load_dataset(filename_or_obj, **kwargs)
    277     raise TypeError("cache has no effect in this context")
    279 with open_dataset(filename_or_obj, **kwargs) as ds:
--> 280     return ds.load()

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/core/dataset.py:855, in Dataset.load(self, **kwargs)
    853 for k, v in self.variables.items():
    854     if k not in lazy_data:
--> 855         v.load()
    857 return self

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/core/variable.py:961, in Variable.load(self, **kwargs)
    944 def load(self, **kwargs):
    945     """Manually trigger loading of this variable's data from disk or a
    946     remote source into memory and return this variable.
    947 
   (...)
    959     dask.array.compute
    960     """
--> 961     self._data = to_duck_array(self._data, **kwargs)
    962     return self

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/namedarray/pycompat.py:134, in to_duck_array(data, **kwargs)
    131     return loaded_data
    133 if isinstance(data, ExplicitlyIndexed):
--> 134     return data.get_duck_array()  # type: ignore[no-untyped-call, no-any-return]
    135 elif is_duck_array(data):
    136     return data

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/core/indexing.py:809, in MemoryCachedArray.get_duck_array(self)
    808 def get_duck_array(self):
--> 809     self._ensure_cached()
    810     return self.array.get_duck_array()

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/core/indexing.py:803, in MemoryCachedArray._ensure_cached(self)
    802 def _ensure_cached(self):
--> 803     self.array = as_indexable(self.array.get_duck_array())

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/core/indexing.py:760, in CopyOnWriteArray.get_duck_array(self)
    759 def get_duck_array(self):
--> 760     return self.array.get_duck_array()

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/core/indexing.py:630, in LazilyIndexedArray.get_duck_array(self)
    625 # self.array[self.key] is now a numpy array when
    626 # self.array is a BackendArray subclass
    627 # and self.key is BasicIndexer((slice(None, None, None),))
    628 # so we need the explicit check for ExplicitlyIndexed
    629 if isinstance(array, ExplicitlyIndexed):
--> 630     array = array.get_duck_array()
    631 return _wrap_numpy_scalars(array)

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/coding/variables.py:81, in _ElementwiseFunctionArray.get_duck_array(self)
     80 def get_duck_array(self):
---> 81     return self.func(self.array.get_duck_array())

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/coding/variables.py:81, in _ElementwiseFunctionArray.get_duck_array(self)
     80 def get_duck_array(self):
---> 81     return self.func(self.array.get_duck_array())

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/coding/variables.py:399, in _scale_offset_decoding(data, scale_factor, add_offset, dtype)
    397 data = data.astype(dtype=dtype, copy=True)
    398 if scale_factor is not None:
--> 399     data *= scale_factor
    400 if add_offset is not None:
    401     data += add_offset

UFuncTypeError: Cannot cast ufunc 'multiply' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'

# Example 2 error
---------------------------------------------------------------------------
UFuncTypeError                            Traceback (most recent call last)
Cell In[42], line 1
----> 1 data.to_netcdf("text1x.nc")

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/core/dataset.py:2298, in Dataset.to_netcdf(self, path, mode, format, group, engine, encoding, unlimited_dims, compute, invalid_netcdf)
   2295     encoding = {}
   2296 from xarray.backends.api import to_netcdf
-> 2298 return to_netcdf(  # type: ignore  # mypy cannot resolve the overloads:(
   2299     self,
   2300     path,
   2301     mode=mode,
   2302     format=format,
   2303     group=group,
   2304     engine=engine,
   2305     encoding=encoding,
   2306     unlimited_dims=unlimited_dims,
   2307     compute=compute,
   2308     multifile=False,
   2309     invalid_netcdf=invalid_netcdf,
   2310 )

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/backends/api.py:1339, in to_netcdf(dataset, path_or_file, mode, format, group, engine, encoding, unlimited_dims, compute, multifile, invalid_netcdf)
   1334 # TODO: figure out how to refactor this logic (here and in save_mfdataset)
   1335 # to avoid this mess of conditionals
   1336 try:
   1337     # TODO: allow this work (setting up the file for writing array data)
   1338     # to be parallelized with dask
-> 1339     dump_to_store(
   1340         dataset, store, writer, encoding=encoding, unlimited_dims=unlimited_dims
   1341     )
   1342     if autoclose:
   1343         store.close()

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/backends/api.py:1386, in dump_to_store(dataset, store, writer, encoder, encoding, unlimited_dims)
   1383 if encoder:
   1384     variables, attrs = encoder(variables, attrs)
-> 1386 store.store(variables, attrs, check_encoding, writer, unlimited_dims=unlimited_dims)

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/backends/common.py:393, in AbstractWritableDataStore.store(self, variables, attributes, check_encoding_set, writer, unlimited_dims)
    390 if writer is None:
    391     writer = ArrayWriter()
--> 393 variables, attributes = self.encode(variables, attributes)
    395 self.set_attributes(attributes)
    396 self.set_dimensions(variables, unlimited_dims=unlimited_dims)

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/backends/common.py:482, in WritableCFDataStore.encode(self, variables, attributes)
    479 def encode(self, variables, attributes):
    480     # All NetCDF files get CF encoded by default, without this attempting
    481     # to write times, for example, would fail.
--> 482     variables, attributes = cf_encoder(variables, attributes)
    483     variables = {k: self.encode_variable(v) for k, v in variables.items()}
    484     attributes = {k: self.encode_attribute(v) for k, v in attributes.items()}

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/conventions.py:795, in cf_encoder(variables, attributes)
    792 # add encoding for time bounds variables if present.
    793 _update_bounds_encoding(variables)
--> 795 new_vars = {k: encode_cf_variable(v, name=k) for k, v in variables.items()}
    797 # Remove attrs from bounds variables (issue #2921)
    798 for var in new_vars.values():

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/conventions.py:196, in encode_cf_variable(var, needs_copy, name)
    183 ensure_not_multiindex(var, name=name)
    185 for coder in [
    186     times.CFDatetimeCoder(),
    187     times.CFTimedeltaCoder(),
   (...)
    194     variables.BooleanCoder(),
    195 ]:
--> 196     var = coder.encode(var, name=name)
    198 # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends:
    199 var = ensure_dtype_not_object(var, name=name)

File .../conda/envs/ong312_local/lib/python3.12/site-packages/xarray/coding/variables.py:476, in CFScaleOffsetCoder.encode(self, variable, name)
    474     data -= pop_to(encoding, attrs, "add_offset", name=name)
    475 if "scale_factor" in encoding:
--> 476     data /= pop_to(encoding, attrs, "scale_factor", name=name)
    478 return Variable(dims, data, attrs, encoding, fastpath=True)

UFuncTypeError: Cannot cast ufunc 'divide' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'
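
Both tracebacks end at an in-place operation on integer-typed data (data *= scale_factor while decoding, data /= scale_factor while encoding). The same arithmetic done out of place, with a cast afterwards, goes through without error, which makes this look like a pure dtype/casting problem (a sketch of my reading of the tracebacks, not the actual xarray code):

import numpy as np

# Decoding side: unpack int64 data with a float scale factor, out of place.
packed = np.array([1, 2, 3], dtype=np.int64)
unpacked = packed * 1e-3                      # float64, no error

# Encoding side: divide out of place, then cast to the on-disk dtype.
values = np.array([1000, 2000, 3000], dtype=np.int64)
repacked = (values / 1000).astype(np.int8)    # array([1, 2, 3], dtype=int8)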

Anything else we need to know?

No response

Environment

INSTALLED VERSIONS

commit: None
python: 3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:38:13) [GCC 12.3.0]
python-bits: 64
OS: Linux
OS-release: 5.15.0-92-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: fr_FR.UTF-8
LOCALE: ('fr_FR', 'UTF-8')
libhdf5: 1.14.3
libnetcdf: 4.9.2

xarray: 2024.3.0
pandas: 2.2.2
numpy: 1.26.4
scipy: 1.13.0
netCDF4: 1.6.5
pydap: None
h5netcdf: 1.3.0
h5py: 3.11.0
Nio: None
zarr: 2.17.2
cftime: 1.6.3
nc_time_axis: None
iris: None
bottleneck: None
dask: 2024.4.1
distributed: 2024.4.1
matplotlib: 3.8.4
cartopy: 0.23.0
seaborn: None
numbagg: None
fsspec: 2024.3.1
cupy: None
pint: 0.23
sparse: None
flox: None
numpy_groupies: None
setuptools: 69.5.1
pip: 24.0
conda: 24.3.0
pytest: 8.1.1
mypy: 1.9.0
IPython: 8.22.2
sphinx: 7.3.5

Thomas-Z · Apr 18 '24 13:04