datasets
datasets copied to clipboard
Formatted map + with_format(None) changes array dtype for iterable datasets
Describe the bug
When applying with_format -> map -> with_format(None), array dtypes seem to change, even if features are passed
Steps to reproduce the bug
features=Features(**{"array0": Array3D((None, 10, 10), dtype="float32")})
dataset = Dataset.from_dict({f"array0": [np.zeros((100,10,10), dtype=np.float32)]*25}, features=features)
ds = dataset.to_iterable_dataset().with_format("numpy").map(lambda x: x, features=features)
ex_0 = next(iter(ds))
ds = dataset.to_iterable_dataset().with_format("numpy").map(lambda x: x, features=features).with_format(None)
ex_1 = next(iter(ds))
assert ex_1["array0"].dtype == ex_0["array0"].dtype, f"{ex_1['array0'].dtype} {ex_0['array0'].dtype}"
Expected behavior
Dtypes should be preserved.
Environment info
3.0.2
possibly due to this logic:
def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
if isinstance(pa_array, pa.ChunkedArray):
if isinstance(pa_array.type, _ArrayXDExtensionType):
# don't call to_pylist() to preserve dtype of the fixed-size array
zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
array: List = [
row for chunk in pa_array.chunks for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
]
else:
zero_copy_only = _is_zero_copy_only(pa_array.type) and all(
not _is_array_with_nulls(chunk) for chunk in pa_array.chunks
)
array: List = [
row for chunk in pa_array.chunks for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
]
else:
if isinstance(pa_array.type, _ArrayXDExtensionType):
# don't call to_pylist() to preserve dtype of the fixed-size array
zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only)
else:
zero_copy_only = _is_zero_copy_only(pa_array.type) and not _is_array_with_nulls(pa_array)
array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()