datasets
datasets copied to clipboard
Error when processing speech_commands dataset
/!\ PLEASE INCLUDE THE FULL STACKTRACE AND CODE SNIPPET
Short description An error occurs when processing the speech_commands dataset.
Environment information
- Operating System: macOS
- Python version: Python 3.10
-
tensorflow-datasets
/tfds-nightly
version: 4.9.4 -
tensorflow
/tf-nightly
version: 2.16.1
Reproduction instructions
import tensorflow_datasets as tfds
dataset_name = "speech_commands"
tfds_path = "/Users/glo/tensorflow-datasets" # Change the path
builder = tfds.builder(dataset_name, data_dir=tfds_path)
builder.download_and_prepare()
Link to logs
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/dataset_builder.py:1584, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, download_config) 1572 for split_name, generator in utils.tqdm( 1573 split_generators.items(), 1574 desc="Generating splits...", 1575 unit=" splits", 1576 leave=False, 1577 ): 1578 filename_template = naming.ShardedFileTemplate( 1579 split=split_name, 1580 dataset_name=self.name, 1581 data_dir=self.data_path, 1582 filetype_suffix=path_suffix, 1583 ) -> 1584 future = split_builder.submit_split_generation( 1585 split_name=split_name, 1586 generator=generator, 1587 filename_template=filename_template, 1588 disable_shuffling=self.info.disable_shuffling, 1589 ) 1590 split_info_futures.append(future) 1592 # Process the result of the beam pipeline.
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/split_builder.py:341, in SplitBuilder.submit_split_generation(self, split_name, generator, filename_template, disable_shuffling)
338 # Depending on the type of generator, we use the corresponding
339 # _build_from_xyz
method.
340 if isinstance(generator, collections.abc.Iterable):
--> 341 return self._build_from_generator(**build_kwargs)
342 else: # Otherwise, beam required
343 unknown_generator_type = TypeError(
344 f'Invalid split generator value for split {split_name}
. '
345 'Expected generator or apache_beam object. Got: '
346 f'{type(generator)}'
347 )
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/split_builder.py:406, in SplitBuilder._build_from_generator(self, split_name, generator, filename_template, disable_shuffling) 396 serialized_info = self._features.get_serialized_info() 397 writer = writer_lib.Writer( 398 serializer=example_serializer.ExampleSerializer(serialized_info), 399 filename_template=filename_template, (...) 404 shard_config=self._shard_config, 405 ) --> 406 for key, example in utils.tqdm( 407 generator, 408 desc=f'Generating {split_name} examples...', 409 unit=' examples', 410 total=total_num_examples, 411 leave=False, 412 mininterval=1.0, 413 ): 414 try: 415 example = self._features.encode_example(example)
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tqdm/notebook.py:249, in tqdm_notebook.__iter__(self) 247 try: 248 it = super(tqdm_notebook, self).__iter__() --> 249 for obj in it: 250 # return super(tqdm...) will not catch exception 251 yield obj 252 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tqdm/std.py:1182, in tqdm.__iter__(self) 1179 time = self._time 1181 try: -> 1182 for obj in iterable: 1183 yield obj 1184 # Update and possibly print the progressbar. 1185 # Note: does not call self.update(1) for speed optimisation.
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/datasets/speech_commands/speech_commands_dataset_builder.py:138, in Builder._generate_examples(self, archive, file_list) 134 else: 135 try: 136 example = { 137 'audio': np.array( --> 138 lazy_imports_lib.lazy_imports.pydub.AudioSegment.from_file( 139 file_obj, format='wav' 140 ).get_array_of_samples() 141 ), 142 'label': label, 143 } 144 yield example_id, example 145 except ( 146 lazy_imports_lib.lazy_imports.pydub.exceptions.CouldntDecodeError 147 ):
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/pydub/audio_segment.py:728, in AudioSegment.from_file(cls, file, format, codec, parameters, start_second, duration, **kwargs) 726 info = None 727 else: --> 728 info = mediainfo_json(orig_file, read_ahead_limit=read_ahead_limit) 729 if info: 730 audio_streams = [x for x in info['streams'] 731 if x['codec_type'] == 'audio']
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/pydub/utils.py:279, in mediainfo_json(filepath, read_ahead_limit) 276 output = output.decode("utf-8", 'ignore') 277 stderr = stderr.decode("utf-8", 'ignore') --> 279 info = json.loads(output) 281 if not info: 282 # If ffprobe didn't give any information, just return it 283 # (for example, because the file doesn't exist) 284 return info
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 341 s = s.decode(detect_encoding(s), 'surrogatepass') 343 if (cls is None and object_hook is None and 344 parse_int is None and parse_float is None and 345 parse_constant is None and object_pairs_hook is None and not kw): --> 346 return _default_decoder.decode(s) 347 if cls is None: 348 cls = JSONDecoder
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
332 def decode(self, s, _w=WHITESPACE.match):
333 """Return the Python representation of s
(a str
instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx) 353 obj, end = self.scan_once(s, idx) 354 except StopIteration as err: --> 355 raise JSONDecodeError("Expecting value", s, err.value) from None 356 return obj, end
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Expected behavior No error during the processing of the speech_commands dataset.
Did you also get this error without a custom data dir? I cannot reproduce the error running in Colab the following:
import tensorflow_datasets as tfds
dataset_name = "speech_commands"
builder = tfds.builder(dataset_name)
builder.download_and_prepare()
I reproduced the same problem. I was able to fix it by downgrading pydub to version 0.23.1 instead of 0.25.1