Observations with wide index can overflow storage
After refactor of summary data, got reports of:
File ".../ert/callbacks.py", line 68, in _write_responses_to_storage
run_arg.ensemble_storage.save_response(config.name, ds, run_arg.iens)
File ".../ert/storage/local_ensemble.py", line 578, in save_response
data.to_netcdf(output_path / f"{group}.nc", engine="scipy")
File ".../xarray/core/dataset.py", line 1912, in to_netcdf
return to_netcdf( # type: ignore # mypy cannot resolve the overloads:(
File ".../xarray/backends/api.py", line 1232, in to_netcdf
dump_to_store(
File ".../xarray/backends/api.py", line 1279, in dump_to_store
store.store(variables, attrs, check_encoding, writer, unlimited_dims=unlimited_dims)
File ".../xarray/backends/common.py", line 266, in store
variables, attributes = self.encode(variables, attributes)
File ".../xarray/backends/common.py", line 356, in encode
variables = {k: self.encode_variable(v) for k, v in variables.items()}
File ".../xarray/backends/common.py", line 356, in <dictcomp>
variables = {k: self.encode_variable(v) for k, v in variables.items()}
File ".../xarray/backends/scipy_.py", line 206, in encode_variable
variable = encode_nc3_variable(variable)
File ".../xarray/backends/netcdf3.py", line 97, in encode_nc3_variable
data = coerce_nc3_dtype(var.data)
File ".../xarray/backends/netcdf3.py", line 68, in coerce_nc3_dtype
raise ValueError(
ValueError: could not safely cast array from dtype int64 to int32
Reproducible with Drogon DST
Can be reproduced like this:
import datetime
import numpy as np
from ert.config import EnkfObs
from ert.config.enkf_observation_implementation_type import (
EnkfObservationImplementationType,
)
from ert.config.observation_vector import GenObservation, ObsVector
from ert.storage import open_storage
def test_other_failure(tmpdir):
storage = open_storage(tmpdir, "w")
_ = storage.create_experiment(
[],
observations=EnkfObs(
obs_vectors={
"A": ObsVector(
observation_type=EnkfObservationImplementationType.GEN_OBS,
observation_key="A",
data_key="A",
observations={
datetime.datetime(2000, 1, 1, 0, 0): GenObservation(
values=np.array([0.0]),
stds=np.array([0.1]),
indices=np.array([0], dtype=np.int32),
std_scaling=np.array([1.0]),
),
datetime.datetime(2001, 1, 1, 1, 0, 0, 1): GenObservation(
values=np.array([0.0]),
stds=np.array([1.0]),
indices=np.array([0], dtype=np.int32),
std_scaling=np.array([0.0]),
),
},
)
},
obs_time=[datetime.datetime(2001, 1, 1, 0, 0)],
).datasets,
)
Problem seems to be related to there both being dates with millisecond time and second time present.
The issue can be provoked with the following
dataset = xr.combine_by_coords(
[
xr.Dataset(
{"value": (["step"], [0.0])},
coords={
"step": np.array(
[datetime.datetime(2000, 1, 1, 0, 0)], dtype="datetime64[ns]"
),
},
),
xr.Dataset(
{"value": (["step"], [0.0])},
coords={
"step": np.array(
[datetime.datetime(2000, 1, 1, 1, 0, 0, 1)],
dtype="datetime64[ns]",
),
},
),
]
)
dataset.to_netcdf(tmp_path / "netcdf", engine="scipy")
There is no problem when using netcdf4:
dataset.to_netcdf(tmp_path / "netcdf")
The same problem occurrs with 1second precision after ~70 years:
def test_netcdf_second_precision(tmp_path):
dataset = xr.combine_by_coords(
[
xr.Dataset(
{"value": (["step"], [0.0])},
coords={
"step": np.array(
[datetime.datetime(1930, 1, 1, 0, 0, 0)],
dtype="datetime64[ns]",
),
},
),
xr.Dataset(
{"value": (["step"], [0.0])},
coords={
"step": np.array(
[datetime.datetime(2000, 1, 1, 1, 0, 1)],
dtype="datetime64[ns]",
),
},
),
]
)
dataset.to_netcdf(tmp_path / "netcdf", engine="scipy")
The same can happen when the index is large (mind still within int32 so 2gb worth of 32 bit floats):
storage = open_storage(tmpdir, "w")
_ = storage.create_experiment(
[],
observations=EnkfObs(
obs_vectors={'A': ObsVector(
observation_type=EnkfObservationImplementationType.GEN_OBS,
observation_key='A',
data_key='A',
observations={0: GenObservation(
values=array([0., 0.]),
stds=array([1., 1.]),
indices=[0, 2147483648],
std_scaling=array([0., 0.]),
)},
)},
obs_time=[],
), parameters=[])
).datasets,
)
Blocked by #7015
This has been fixed