datasets icon indicating copy to clipboard operation
datasets copied to clipboard

Error when processing speech_commands dataset

Open guillaumelorre28 opened this issue 10 months ago • 2 comments

/!\ PLEASE INCLUDE THE FULL STACKTRACE AND CODE SNIPPET

Short description An error occurs when processing the speech_commands dataset.

Environment information

  • Operating System: macOS
  • Python version: Python 3.10
  • tensorflow-datasets/tfds-nightly version: 4.9.4
  • tensorflow/tf-nightly version: 2.16.1

Reproduction instructions

import tensorflow_datasets as tfds

dataset_name = "speech_commands"
tfds_path = "/Users/glo/tensorflow-datasets"  # Change the path

builder = tfds.builder(dataset_name, data_dir=tfds_path)
builder.download_and_prepare()

Link to logs

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/dataset_builder.py:1584, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, download_config) 1572 for split_name, generator in utils.tqdm( 1573 split_generators.items(), 1574 desc="Generating splits...", 1575 unit=" splits", 1576 leave=False, 1577 ): 1578 filename_template = naming.ShardedFileTemplate( 1579 split=split_name, 1580 dataset_name=self.name, 1581 data_dir=self.data_path, 1582 filetype_suffix=path_suffix, 1583 ) -> 1584 future = split_builder.submit_split_generation( 1585 split_name=split_name, 1586 generator=generator, 1587 filename_template=filename_template, 1588 disable_shuffling=self.info.disable_shuffling, 1589 ) 1590 split_info_futures.append(future) 1592 # Process the result of the beam pipeline.

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/split_builder.py:341, in SplitBuilder.submit_split_generation(self, split_name, generator, filename_template, disable_shuffling) 338 # Depending on the type of generator, we use the corresponding 339 # _build_from_xyz method. 340 if isinstance(generator, collections.abc.Iterable): --> 341 return self._build_from_generator(**build_kwargs) 342 else: # Otherwise, beam required 343 unknown_generator_type = TypeError( 344 f'Invalid split generator value for split {split_name}. ' 345 'Expected generator or apache_beam object. Got: ' 346 f'{type(generator)}' 347 )

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/split_builder.py:406, in SplitBuilder._build_from_generator(self, split_name, generator, filename_template, disable_shuffling) 396 serialized_info = self._features.get_serialized_info() 397 writer = writer_lib.Writer( 398 serializer=example_serializer.ExampleSerializer(serialized_info), 399 filename_template=filename_template, (...) 404 shard_config=self._shard_config, 405 ) --> 406 for key, example in utils.tqdm( 407 generator, 408 desc=f'Generating {split_name} examples...', 409 unit=' examples', 410 total=total_num_examples, 411 leave=False, 412 mininterval=1.0, 413 ): 414 try: 415 example = self._features.encode_example(example)

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tqdm/notebook.py:249, in tqdm_notebook.__iter__(self) 247 try: 248 it = super(tqdm_notebook, self).__iter__() --> 249 for obj in it: 250 # return super(tqdm...) will not catch exception 251 yield obj 252 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tqdm/std.py:1182, in tqdm.__iter__(self) 1179 time = self._time 1181 try: -> 1182 for obj in iterable: 1183 yield obj 1184 # Update and possibly print the progressbar. 1185 # Note: does not call self.update(1) for speed optimisation.

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/datasets/speech_commands/speech_commands_dataset_builder.py:138, in Builder._generate_examples(self, archive, file_list) 134 else: 135 try: 136 example = { 137 'audio': np.array( --> 138 lazy_imports_lib.lazy_imports.pydub.AudioSegment.from_file( 139 file_obj, format='wav' 140 ).get_array_of_samples() 141 ), 142 'label': label, 143 } 144 yield example_id, example 145 except ( 146 lazy_imports_lib.lazy_imports.pydub.exceptions.CouldntDecodeError 147 ):

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/pydub/audio_segment.py:728, in AudioSegment.from_file(cls, file, format, codec, parameters, start_second, duration, **kwargs) 726 info = None 727 else: --> 728 info = mediainfo_json(orig_file, read_ahead_limit=read_ahead_limit) 729 if info: 730 audio_streams = [x for x in info['streams'] 731 if x['codec_type'] == 'audio']

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/pydub/utils.py:279, in mediainfo_json(filepath, read_ahead_limit) 276 output = output.decode("utf-8", 'ignore') 277 stderr = stderr.decode("utf-8", 'ignore') --> 279 info = json.loads(output) 281 if not info: 282 # If ffprobe didn't give any information, just return it 283 # (for example, because the file doesn't exist) 284 return info

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 341 s = s.decode(detect_encoding(s), 'surrogatepass') 343 if (cls is None and object_hook is None and 344 parse_int is None and parse_float is None and 345 parse_constant is None and object_pairs_hook is None and not kw): --> 346 return _default_decoder.decode(s) 347 if cls is None: 348 cls = JSONDecoder

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/decoder.py:337, in JSONDecoder.decode(self, s, _w) 332 def decode(self, s, _w=WHITESPACE.match): 333 """Return the Python representation of s (a str instance 334 containing a JSON document). 335 336 """ --> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 338 end = _w(s, end).end() 339 if end != len(s):

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx) 353 obj, end = self.scan_once(s, idx) 354 except StopIteration as err: --> 355 raise JSONDecodeError("Expecting value", s, err.value) from None 356 return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Expected behavior: No error during the processing of the speech_commands dataset.

guillaumelorre28 avatar Apr 24 '24 11:04 guillaumelorre28

Did you also get this error without a custom data dir? I cannot reproduce the error when running the following in Colab:

import tensorflow_datasets as tfds

dataset_name = "speech_commands"

builder = tfds.builder(dataset_name)
builder.download_and_prepare()

fylux avatar Apr 29 '24 07:04 fylux

I reproduced the same problem. I was able to fix it by downgrading pydub from version 0.25.1 to version 0.23.1.

Toby1218 avatar Jul 27 '24 09:07 Toby1218