
Dataset scripts are no longer supported, but found superb.py

Open edwinzajac opened this issue 5 months ago • 19 comments

Describe the bug

Hello,

I'm trying to follow the Hugging Face Pipelines tutorial, but it seems to work only with older versions of datasets.

I then get the following error:

--------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[65], line 1
----> 1 dataset = datasets.load_dataset("superb", name="asr", split="test")
      3 # KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
      4 # as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
      5 for out in tqdm(pipe(KeyDataset(dataset, "file"))):

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1392, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)
   1387 verification_mode = VerificationMode(
   1388     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1389 )
   1391 # Create a dataset builder
-> 1392 builder_instance = load_dataset_builder(
   1393     path=path,
   1394     name=name,
   1395     data_dir=data_dir,
   1396     data_files=data_files,
   1397     cache_dir=cache_dir,
   1398     features=features,
   1399     download_config=download_config,
   1400     download_mode=download_mode,
   1401     revision=revision,
   1402     token=token,
   1403     storage_options=storage_options,
   1404     **config_kwargs,
   1405 )
   1407 # Return iterable dataset in case of streaming
   1408 if streaming:

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1132, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, **config_kwargs)
   1130 if features is not None:
   1131     features = _fix_for_backward_compatible_features(features)
-> 1132 dataset_module = dataset_module_factory(
   1133     path,
   1134     revision=revision,
   1135     download_config=download_config,
   1136     download_mode=download_mode,
   1137     data_dir=data_dir,
   1138     data_files=data_files,
   1139     cache_dir=cache_dir,
   1140 )
   1141 # Get dataset builder class
   1142 builder_kwargs = dataset_module.builder_kwargs

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:1031, in dataset_module_factory(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)
   1026             if isinstance(e1, FileNotFoundError):
   1027                 raise FileNotFoundError(
   1028                     f"Couldn't find any data file at {relative_to_absolute_path(path)}. "
   1029                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1030                 ) from None
-> 1031             raise e1 from None
   1032 else:
   1033     raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")

File ~/Desktop/debug/llm_course/.venv/lib/python3.11/site-packages/datasets/load.py:989, in dataset_module_factory(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)
    981 try:
    982     api.hf_hub_download(
    983         repo_id=path,
    984         filename=filename,
   (...)    987         proxies=download_config.proxies,
    988     )
--> 989     raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
    990 except EntryNotFoundError:
    991     # Use the infos from the parquet export except in some cases:
    992     if data_dir or data_files or (revision and revision != "main"):

RuntimeError: Dataset scripts are no longer supported, but found superb.py

NB: I tried replacing "superb" with "anton-l/superb_demo", but then I get a 'torchcodec' import error. Maybe I misunderstood something.
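For context, datasets 4.x decodes audio columns with torchcodec, so that import error probably just means the optional dependency is missing. A minimal, untested sketch (assuming anton-l/superb_demo exposes an "asr" config):

# pip install torchcodec   (also needs torch and FFmpeg installed)
import datasets

dataset = datasets.load_dataset("anton-l/superb_demo", "asr", split="test")
print(dataset[0]["audio"])  # on datasets 4.x this is a torchcodec AudioDecoder object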

Steps to reproduce the bug

import datasets
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
dataset = datasets.load_dataset("superb", name="asr", split="test")

# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
for out in tqdm(pipe(KeyDataset(dataset, "file"))):
    print(out)
    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
    # {"text": ....}
    # ....

Expected behavior

Get the results expected by the tutorial.

Environment info

--- SYSTEM INFO ---
Operating System: Ubuntu 24.10
Kernel: Linux 6.11.0-29-generic
Architecture: x86-64

--- PYTHON ---
Python 3.11.13

--- VENV INFO ---
datasets=4.0.0
transformers=4.53
tqdm=4.67.1

edwinzajac avatar Jul 20 '25 13:07 edwinzajac

I got a pretty similar issue when trying to load the bigbio/neurotrial_ner dataset: Dataset scripts are no longer supported, but found neurotrial_ner.py

dejokz avatar Jul 21 '25 14:07 dejokz

Same here. I was running this tutorial and got a similar error: https://github.com/openai/whisper/discussions/654 (I'm a first-time transformers library user)

RuntimeError: Dataset scripts are no longer supported, but found librispeech_asr.py

What am I supposed to do at this point?

Thanks

gMontoyaSpeech avatar Jul 22 '25 10:07 gMontoyaSpeech

Hey, I got the same error. I downgraded to version 3.6.0 and it works: pip install datasets==3.6.0
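A minimal sketch of the full workaround (note that on datasets 3.x, script-based datasets also need trust_remote_code=True):

# pip install "datasets==3.6.0"
import datasets

# 3.x still runs the superb.py script, but only if you explicitly trust it
dataset = datasets.load_dataset("superb", name="asr", split="test", trust_remote_code=True)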

Tin-viAct avatar Jul 22 '25 15:07 Tin-viAct

Thank you very much @Tin-viAct. That indeed did the trick for me :) Now the code continues its normal flow.

gMontoyaSpeech avatar Jul 22 '25 17:07 gMontoyaSpeech

Thanks @Tin-viAct, Works!

johnbarb71 avatar Jul 24 '25 14:07 johnbarb71

I converted openslr/librispeech_asr to Parquet - thanks for reporting.

It's now compatible with datasets 4.0!

I'll try to ping the authors of the other datasets like s3prl/superb and espnet/yodas2
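A quick way to check the conversion, assuming the config and split names from the original script ("clean", "test"):

import datasets

# Resolves to the Parquet export now, no librispeech_asr.py involved
ds = datasets.load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
print(next(iter(ds)))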

lhoestq avatar Jul 25 '25 15:07 lhoestq

How come a breaking change was allowed and now requires extra work from individual authors for things to be usable?

https://en.wikipedia.org/wiki/Backward_compatibility

pgzmnk avatar Jul 29 '25 10:07 pgzmnk

We follow semantic versioning so that breaking changes only occur in major releases. Also note that dataset scripts have been legacy for some time now, with a message on the dataset pages to ask authors to update their datasets.

It's ok to pin older versions of datasets, but imo a few remaining datasets need to be converted since they are valuable to the community.

lhoestq avatar Jul 30 '25 15:07 lhoestq

I was facing the same issue with a lesser-known dataset on the Hugging Face Hub. Downgrading the datasets version worked ❤️. Thank you @Tin-viAct.

rimshidali avatar Aug 05 '25 11:08 rimshidali

Thank you so much, @Tin-viAct ! I’ve been struggling with this issue for about 3 hours, and your suggestion to downgrade datasets worked perfectly. I really appreciate the help—you saved me!

ZahraDehghani99 avatar Aug 08 '25 07:08 ZahraDehghani99

hey I got the same error and I have tried to downgrade version to 3.6.0 and it works. pip install datasets==3.6.0

Thank you so much! I was following the quickstart and the very first sample fails. Not a good way to get started....

Elissen avatar Aug 08 '25 09:08 Elissen

hey I got the same error and I have tried to downgrade version to 3.6.0 and it works. pip install datasets==3.6.0

Thank you! I got it working.

nev8r avatar Aug 08 '25 14:08 nev8r

I updated hotpot_qa and pinged the PolyAI folks to update the dataset used in the quickstart as well: https://huggingface.co/datasets/PolyAI/minds14/discussions/35 edit: merged! edit2: the quickstart dataset is also fixed!

lhoestq avatar Aug 11 '25 10:08 lhoestq

LegalBench is downloaded 10k times a month and is now broken. Would be great to have this fixed.

umarbutler avatar Aug 18 '25 08:08 umarbutler

I opened a PR to convert LegalBench to Parquet and reached out to the author: https://huggingface.co/datasets/nguha/legalbench/discussions/34

lhoestq avatar Aug 18 '25 16:08 lhoestq

Thank you very much @Tin-viAct! I’d been looking everywhere for a fix, and your reply saved me :)

Hamidreza-Gandomi avatar Aug 20 '25 15:08 Hamidreza-Gandomi

I tried downgrading the datasets version, but that led to compatibility issues, other breaking changes, and more errors in other parts of my code.

sa-abdullah avatar Sep 03 '25 18:09 sa-abdullah

I opened a few more PRs and reached out to the authors:

  • https://huggingface.co/datasets/Skylion007/openwebtext/discussions/22
  • https://huggingface.co/datasets/stas/openwebtext-10k/discussions/2

Btw if you want to open a PR to a dataset to convert it to Parquet here is the command:

uv run --with "datasets==3.6.0" datasets-cli convert_to_parquet <username/dataset-name> --trust_remote_code

(just replace the <username/dataset-name> with the dataset repository name)
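If you'd rather do it from Python, a rough equivalent (assuming you have write access to the dataset repo or a duplicate of it) is to load it with the old library version and push it back, which uploads the data as Parquet:

# pip install "datasets==3.6.0"
from datasets import load_dataset

ds = load_dataset("username/dataset-name", trust_remote_code=True)
ds.push_to_hub("username/dataset-name")  # re-uploads the splits as Parquet files
# for datasets with several configs, repeat per config and pass config_name=... to push_to_hub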

lhoestq avatar Sep 04 '25 10:09 lhoestq

datasets.load_dataset("<username/dataset-name>", revision='refs/convert/parquet') can also be used to load the auto-converted Parquet branch directly.
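For example (a hedged sketch; the refs/convert/parquet branch only exists if the Hub's Parquet conversion bot has run on that repository):

import datasets

# Loads the auto-converted Parquet files instead of the dataset script
ds = datasets.load_dataset("username/dataset-name", revision="refs/convert/parquet", split="train")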

Dxee-e avatar Dec 02 '25 05:12 Dxee-e

Hello, I have a similar message: "RuntimeError: Dataset scripts are no longer supported, but found code_search_net.py" while following the LLM Course (https://huggingface.co/learn/llm-course/chapter6/2).

Downgrading via !pip install datasets==3.6.0 works as a workaround.

@lhoestq I tried to update the dataset, but ran into an exception; any suggestions on how to solve it? Thanks!

(dataset3) D:\code_search_net>uv run --with "datasets==3.6.0" datasets-cli convert_to_parquet code-search-net/code_search_net --trust_remote_code
code-search-net/code_search_net
configs = ['all', 'java', 'go', 'python', 'javascript', 'ruby', 'php']
default_config = 'all'
config = 'all'
Loading Dataset Infos from C:....\cache\huggingface\modules\datasets_modules\datasets\code-search-net--code_search_net\8f2524e6b62f65af5f5d65c53715c654db7b08dc93e0b7bcce2ab2f286a75be1
Generating dataset code_search_net (C:/.../.cache/huggingface/datasets/code-search-net___code_search_net/all/1.0.0/8f2524e6b62f65af5f5d65c53715c654db7b08dc93e0b7bcce2ab2f286a75be1)
Downloading and preparing dataset code_search_net/all to C:/.../.cache/huggingface/datasets/code-search-net___code_search_net/all/1.0.0/8f2524e6b62f65af5f5d65c53715c654db7b08dc93e0b7bcce2ab2f286a75be1...
Downloading took 0.0 min
Checksum Computation took 0.0 min
Generating train split
Generating train split:   0%| | 1184/1880853 [00:00<13:30, 2318.73 examples/s]
BUILDER_EXCEPTION: Unterminated string starting at: line 1 column 3072 (char 3071)
0_1537 {'repository_name': 'pandas-dev/pandas', 'func_path_in_repository': 'pandas/io/msgpack/init.py', 'func_name': 'pack', 'whole_func_string': 'def pack(o, stream, **kwargs):\n """\n Pack object o and write it to stream\n\n See :class:Packer for options.\n """\n packer = Packer(**kwargs)\n stream.write(packer.pack(o))', 'language': 'python', 'func_code_string': 'def pack(o, stream, **kwargs):\n """\n Pack object o and write it to stream\n\n See :class:Packer for options.\n """\n packer = Packer(**kwargs)\n stream.write(packer.pack(o))', 'func_code_tokens': ['def', 'pack', '(', 'o', ',', 'stream', ',', '', '', 'kwargs', ')', ':', 'packer', '=', 'Packer', '(', '', '', 'kwargs', ')', 'stream', '.', 'write', '(', 'packer', '.', 'pack', '(', 'o', ')', ')'], 'func_documentation_string': 'Pack object o and write it to stream\n\n See :class:Packer for options.', 'func_documentation_tokens': ['Pack', 'object', 'o', 'and', 'write', 'it', 'to', 'stream'], 'split_name': 'train', 'func_code_url': 'https://github.com/pandas-dev/pandas/blob/9feb3ad92cc0397a04b665803a49299ee7aa1037/pandas/io/msgpack/init.py#L26-L33'}

Generating train split:   0%| | 1538/1880853 [00:00<12:33, 2493.92 examples/s]
Traceback (most recent call last):
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\builder.py", line 1608, in _prepare_split_single
    for key, record in generator:
  File "C:....cache\huggingface\modules\datasets_modules\datasets\code-search-net--code_search_net\8f2524e6b62f65af5f5d65c53715c654db7b08dc93e0b7bcce2ab2f286a75be1\code_search_net.py", line 206, in generate_examples
    data = json.loads(row)
  File "C:\Program Files\Python\Lib\json\__init__.py", line 347, in loads
    return _default_decoder.decode(s)
  File "C:\Program Files\Python\Lib\json\decoder.py", line 345, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Program Files\Python\Lib\json\decoder.py", line 362, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Unterminated string starting at: line 1 column 3072 (char 3071)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:...AppData\Local\uv\cache\builds-v0.tmpFG00JC\Scripts\datasets-cli.exe\__main__.py", line 10, in <module>
    sys.exit(main())
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\commands\datasets_cli.py", line 39, in main
    service.run()
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\commands\convert_to_parquet.py", line 44, in run
    _ = convert_to_parquet(
        self._dataset_id, revision=self._revision, token=self._token, trust_remote_code=self._trust_remote_code
    )
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\hub.py", line 71, in convert_to_parquet
    dataset = load_dataset(repo_id, config, revision=revision, trust_remote_code=trust_remote_code)
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\load.py", line 2084, in load_dataset
    builder_instance.download_and_prepare(
        download_config=download_config,
        ...<3 lines>...
        storage_options=storage_options,
    )
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\builder.py", line 925, in download_and_prepare
    self._download_and_prepare(
        dl_manager=dl_manager,
        ...<2 lines>...
        **download_and_prepare_kwargs,
    )
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\builder.py", line 1651, in _download_and_prepare
    super()._download_and_prepare(
        dl_manager,
        ...<3 lines>...
        **prepare_splits_kwargs,
    )
  File "C:...AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\builder.py", line 1001, in _download_and_prepare
    self._prepare_split(split_generator, **prepare_split_kwargs)
  File "C:...\AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\builder.py", line 1487, in _prepare_split
    for job_id, done, content in self._prepare_split_single(
        gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
    ):
  File "C:...\AppData\Local\uv\cache\archive-v0\PE425lPXpjbYG9_KUpXIJ\Lib\site-packages\datasets\builder.py", line 1646, in _prepare_split_single
    raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset

quilaztlia avatar Dec 18 '25 15:12 quilaztlia