
Chapter 6 - Issue Loading `cnn_dailymail` dataset

Open · hariravichandran opened this issue 1 year ago · 1 comment

Describe the bug

I am getting the error below when I try to run cell 4 of the Chapter 6 notebook:

dataset = load_dataset("ccdv/cnn_dailymail", version="3.0.0")

Error Message:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[4], line 4
      1 #hide_output
      2 from datasets import load_dataset
----> 4 dataset = load_dataset("ccdv/cnn_dailymail", version="3.0.0")
      7 # dataset = load_dataset("ccdv/cnn_dailymail", version="3.0.0", trust_remote_code=True)
      8 print(f"Features: {dataset['train'].column_names}")

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\load.py:2587, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
   2583 # Build dataset for splits
   2584 keep_in_memory = (
   2585     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   2586 )
-> 2587 ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
   2588 # Rename and cast features to match task schema
   2589 if task is not None:
   2590     # To avoid issuing the same warning twice

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\builder.py:1244, in DatasetBuilder.as_dataset(self, split, run_post_process, verification_mode, ignore_verifications, in_memory)
   1241 verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
   1243 # Create a dataset for each of the given splits
-> 1244 datasets = map_nested(
   1245     partial(
   1246         self._build_single_dataset,
   1247         run_post_process=run_post_process,
   1248         verification_mode=verification_mode,
   1249         in_memory=in_memory,
   1250     ),
   1251     split,
   1252     map_tuple=True,
   1253     disable_tqdm=True,
   1254 )
   1255 if isinstance(datasets, dict):
   1256     datasets = DatasetDict(datasets)

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\utils\py_utils.py:477, in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)
    466     mapped = [
    467         map_nested(
    468             function=function,
   (...)
    474         for obj in iterable
    475     ]
    476 elif num_proc != -1 and num_proc <= 1 or len(iterable) < parallel_min_length:
--> 477     mapped = [
    478         _single_map_nested((function, obj, types, None, True, None))
    479         for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
    480     ]
    481 else:
    482     with warnings.catch_warnings():

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\utils\py_utils.py:478, in <listcomp>(.0)
    466     mapped = [
    467         map_nested(
    468             function=function,
   (...)
    474         for obj in iterable
    475     ]
    476 elif num_proc != -1 and num_proc <= 1 or len(iterable) < parallel_min_length:
    477     mapped = [
--> 478         _single_map_nested((function, obj, types, None, True, None))
    479         for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
    480     ]
    481 else:
    482     with warnings.catch_warnings():

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\utils\py_utils.py:370, in _single_map_nested(args)
    368 # Singleton first to spare some computation
    369 if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
--> 370     return function(data_struct)
    372 # Reduce logging to keep things readable in multiprocessing with tqdm
    373 if rank is not None and logging.get_verbosity() < logging.WARNING:

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\builder.py:1274, in DatasetBuilder._build_single_dataset(self, split, run_post_process, verification_mode, in_memory)
   1271     split = Split(split)
   1273 # Build base dataset
-> 1274 ds = self._as_dataset(
   1275     split=split,
   1276     in_memory=in_memory,
   1277 )
   1278 if run_post_process:
   1279     for resource_file_name in self._post_processing_resources(split).values():

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\builder.py:1348, in DatasetBuilder._as_dataset(self, split, in_memory)
   1346 if self._check_legacy_cache():
   1347     dataset_name = self.name
-> 1348 dataset_kwargs = ArrowReader(cache_dir, self.info).read(
   1349     name=dataset_name,
   1350     instructions=split,
   1351     split_infos=self.info.splits.values(),
   1352     in_memory=in_memory,
   1353 )
   1354 fingerprint = self._get_dataset_fingerprint(split)
   1355 return Dataset(fingerprint=fingerprint, **dataset_kwargs)

File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\arrow_reader.py:254, in BaseReader.read(self, name, instructions, split_infos, in_memory)
    252 if not files:
    253     msg = f'Instruction "{instructions}" corresponds to no data!'
--> 254     raise ValueError(msg)
    255 return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)

**ValueError: Instruction "validation" corresponds to no data!**

Looks like the data is not being loaded. Any advice would be appreciated. Thanks!

Steps to reproduce the bug

Run all cells of the Chapter 6 notebook.

Expected behavior

The dataset should load without errors.

Environment info

  • datasets version: 2.17.0
  • Platform: Windows-10-10.0.19045-SP0
  • Python version: 3.9.18
  • huggingface_hub version: 0.20.3
  • PyArrow version: 15.0.0
  • Pandas version: 2.2.0
  • fsspec version: 2023.10.0

hariravichandran · Feb 16, 2024

Here's a snippet that works with current versions of datasets:

dataset = load_dataset("cnn_dailymail", "3.0.0")

Drop the version= keyword: "3.0.0" is the dataset's config name, so it is passed as the second positional argument (name) rather than as version=. Note the snippet also uses the canonical cnn_dailymail dataset instead of ccdv/cnn_dailymail.
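For a fuller version of the fixed cell, here is a minimal sketch (assuming the canonical cnn_dailymail dataset on the Hub; the Features printout mirrors the original notebook cell):

from datasets import load_dataset

# "3.0.0" is the config name, passed as the second positional argument
dataset = load_dataset("cnn_dailymail", "3.0.0")

# All three splits, including the "validation" split from the ValueError,
# should now be listed
print(dataset)
print(f"Features: {dataset['train'].column_names}")

If an earlier ccdv/cnn_dailymail run left a partial download behind, it may also help to clear the cached copy under ~/.cache/huggingface/datasets (or the equivalent cache directory on Windows) before re-running.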

jurca · Oct 24, 2025