datasets
datasets copied to clipboard
[Apple M1] MemoryError: Cannot allocate write+execute memory for ffi.callback()
Describe the bug
I'm trying to run `cast_column("audio", Audio())` on an Apple M1 Pro, but it doesn't seem to work.
Steps to reproduce the bug
import datasets
dataset = load_dataset("csv", data_files="./train.csv")["train"]
dataset = dataset.map(lambda x: {"audio": str(DATA_DIR / "audio" / x["audio"])})
dataset = dataset.cast_column("audio", Audio())
dataset[0]
Expected results
{'audio': {'bytes': None,
'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav'},
'english_transcription': 'I would like to set up a joint account with my partner',
'intent_class': 11,
'lang_id': 4,
'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
'transcription': 'I would like to set up a joint account with my partner'}
Actual results
MemoryError Traceback (most recent call last)
Input In [6], in <cell line: 1>()
----> 1 dataset[0]
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/arrow_dataset.py:2165, in Dataset.__getitem__(self, key)
2163 def __getitem__(self, key): # noqa: F811
2164 """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 2165 return self._getitem(
2166 key,
2167 )
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/arrow_dataset.py:2150, in Dataset._getitem(self, key, decoded, **kwargs)
2148 formatter = get_formatter(format_type, features=self.features, decoded=decoded, **format_kwargs)
2149 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
-> 2150 formatted_output = format_table(
2151 pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
2152 )
2153 return formatted_output
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/formatting/formatting.py:532, in format_table(table, key, formatter, format_columns, output_all_columns)
530 python_formatter = PythonFormatter(features=None)
531 if format_columns is None:
--> 532 return formatter(pa_table, query_type=query_type)
533 elif query_type == "column":
534 if key in format_columns:
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/formatting/formatting.py:281, in Formatter.__call__(self, pa_table, query_type)
279 def __call__(self, pa_table: pa.Table, query_type: str) -> Union[RowFormat, ColumnFormat, BatchFormat]:
280 if query_type == "row":
--> 281 return self.format_row(pa_table)
282 elif query_type == "column":
283 return self.format_column(pa_table)
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/formatting/formatting.py:312, in PythonFormatter.format_row(self, pa_table)
310 row = self.python_arrow_extractor().extract_row(pa_table)
311 if self.decoded:
--> 312 row = self.python_features_decoder.decode_row(row)
313 return row
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/formatting/formatting.py:221, in PythonFeaturesDecoder.decode_row(self, row)
220 def decode_row(self, row: dict) -> dict:
--> 221 return self.features.decode_example(row) if self.features else row
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/features/features.py:1647, in Features.decode_example(self, example, token_per_repo_id)
1634 def decode_example(self, example: dict, token_per_repo_id: Optional[Dict[str, Union[str, bool, None]]] = None):
1635 """Decode example with custom feature decoding.
1636
1637 Args:
(...)
1644 :obj:`dict[str, Any]`
1645 """
-> 1647 return {
1648 column_name: decode_nested_example(feature, value, token_per_repo_id=token_per_repo_id)
1649 if self._column_requires_decoding[column_name]
1650 else value
1651 for column_name, (feature, value) in zip_dict(
1652 {key: value for key, value in self.items() if key in example}, example
1653 )
1654 }
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/features/features.py:1648, in <dictcomp>(.0)
1634 def decode_example(self, example: dict, token_per_repo_id: Optional[Dict[str, Union[str, bool, None]]] = None):
1635 """Decode example with custom feature decoding.
1636
1637 Args:
(...)
1644 :obj:`dict[str, Any]`
1645 """
1647 return {
-> 1648 column_name: decode_nested_example(feature, value, token_per_repo_id=token_per_repo_id)
1649 if self._column_requires_decoding[column_name]
1650 else value
1651 for column_name, (feature, value) in zip_dict(
1652 {key: value for key, value in self.items() if key in example}, example
1653 )
1654 }
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/features/features.py:1260, in decode_nested_example(schema, obj, token_per_repo_id)
1257 # Object with special decoding:
1258 elif isinstance(schema, (Audio, Image)):
1259 # we pass the token to read and decode files from private repositories in streaming mode
-> 1260 return schema.decode_example(obj, token_per_repo_id=token_per_repo_id) if obj is not None else None
1261 return obj
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/features/audio.py:156, in Audio.decode_example(self, value, token_per_repo_id)
154 array, sampling_rate = self._decode_non_mp3_file_like(file)
155 else:
--> 156 array, sampling_rate = self._decode_non_mp3_path_like(path, token_per_repo_id=token_per_repo_id)
157 return {"path": path, "array": array, "sampling_rate": sampling_rate}
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/datasets/features/audio.py:257, in Audio._decode_non_mp3_path_like(self, path, format, token_per_repo_id)
254 use_auth_token = None
256 with xopen(path, "rb", use_auth_token=use_auth_token) as f:
--> 257 array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono)
258 return array, sampling_rate
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/librosa/util/decorators.py:88, in deprecate_positional_args.<locals>._inner_deprecate_positional_args.<locals>.inner_f(*args, **kwargs)
86 extra_args = len(args) - len(all_args)
87 if extra_args <= 0:
---> 88 return f(*args, **kwargs)
90 # extra_args > 0
91 args_msg = [
92 "{}={}".format(name, arg)
93 for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
94 ]
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/librosa/core/audio.py:164, in load(path, sr, mono, offset, duration, dtype, res_type)
161 else:
162 # Otherwise try soundfile first, and then fall back if necessary
163 try:
--> 164 y, sr_native = __soundfile_load(path, offset, duration, dtype)
166 except RuntimeError as exc:
167 # If soundfile failed, try audioread instead
168 if isinstance(path, (str, pathlib.PurePath)):
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/librosa/core/audio.py:195, in __soundfile_load(path, offset, duration, dtype)
192 context = path
193 else:
194 # Otherwise, create the soundfile object
--> 195 context = sf.SoundFile(path)
197 with context as sf_desc:
198 sr_native = sf_desc.samplerate
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/soundfile.py:629, in SoundFile.__init__(self, file, mode, samplerate, channels, subtype, endian, format, closefd)
626 self._mode = mode
627 self._info = _create_info_struct(file, mode, samplerate, channels,
628 format, subtype, endian)
--> 629 self._file = self._open(file, mode_int, closefd)
630 if set(mode).issuperset('r+') and self.seekable():
631 # Move write position to 0 (like in Python file objects)
632 self.seek(0)
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/soundfile.py:1179, in SoundFile._open(self, file, mode_int, closefd)
1177 file_ptr = _snd.sf_open_fd(file, mode_int, self._info, closefd)
1178 elif _has_virtual_io_attrs(file, mode_int):
-> 1179 file_ptr = _snd.sf_open_virtual(self._init_virtual_io(file),
1180 mode_int, self._info, _ffi.NULL)
1181 else:
1182 raise TypeError("Invalid file: {0!r}".format(self.name))
File ~/miniconda3/envs/rodan/lib/python3.8/site-packages/soundfile.py:1197, in SoundFile._init_virtual_io(self, file)
1194 def _init_virtual_io(self, file):
1195 """Initialize callback functions for sf_open_virtual()."""
1196 @_ffi.callback("sf_vio_get_filelen")
-> 1197 def vio_get_filelen(user_data):
1198 curr = file.tell()
1199 file.seek(0, SEEK_END)
MemoryError: Cannot allocate write+execute memory for ffi.callback(). You might be running on a system that prevents this. For more information, see https://cffi.readthedocs.io/en/latest/using.html#callbacks
## Environment info
- `datasets` version: 2.4.0
- Platform: macOS-12.5.1-arm64-arm-64bit
- Python version: 3.8.13
- PyArrow version: 9.0.0
- Pandas version: 1.4.4
Hi! This seems like a bug in `soundfile`. Could you please open an issue in their repo? `soundfile` works without any issues on my M1, so I'm not sure we can help.
Hi @mariosasko, can you share how you installed `soundfile` on your M1 Mac?
Hi @hoangtnm - I upgraded to python 3.10 and it fixed the problem for me. I was also running 3.8 on an M1 mac.
Same here, but upgrading Python didn't work for me:
`MemoryError: Cannot allocate write+execute memory for ffi.callback()`
Any ideas?
This is a `soundfile` issue, so there isn't much we can do about it. Hopefully, it gets fixed soon.
> Hi @hoangtnm - I upgraded to python 3.10 and it fixed the problem for me. I was also running 3.8 on an M1 mac.

It worked for me too.