datasets
datasets copied to clipboard
Chapter 6 - Issue Loading `cnn_dailymail` dataset
Describe the bug
So I am getting this bug when I try to run cell 4 of the Chapter 6 notebook code:
dataset = load_dataset("ccdv/cnn_dailymail", version="3.0.0")
Error Message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[4], line 4
1 #hide_output
2 from datasets import load_dataset
----> 4 dataset = load_dataset("ccdv/cnn_dailymail", version="3.0.0")
7 # dataset = load_dataset("ccdv/cnn_dailymail", version="3.0.0", trust_remote_code=True)
8 print(f"Features: {dataset['train'].column_names}")
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\load.py:2587, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2583 # Build dataset for splits
2584 keep_in_memory = (
2585 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2586 )
-> 2587 ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
2588 # Rename and cast features to match task schema
2589 if task is not None:
2590 # To avoid issuing the same warning twice
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\builder.py:1244, in DatasetBuilder.as_dataset(self, split, run_post_process, verification_mode, ignore_verifications, in_memory)
1241 verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)
1243 # Create a dataset for each of the given splits
-> 1244 datasets = map_nested(
1245 partial(
1246 self._build_single_dataset,
1247 run_post_process=run_post_process,
1248 verification_mode=verification_mode,
1249 in_memory=in_memory,
1250 ),
1251 split,
1252 map_tuple=True,
1253 disable_tqdm=True,
1254 )
1255 if isinstance(datasets, dict):
1256 datasets = DatasetDict(datasets)
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\utils\py_utils.py:477, in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)
466 mapped = [
467 map_nested(
468 function=function,
(...)
474 for obj in iterable
475 ]
476 elif num_proc != -1 and num_proc <= 1 or len(iterable) < parallel_min_length:
--> 477 mapped = [
478 _single_map_nested((function, obj, types, None, True, None))
479 for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
480 ]
481 else:
482 with warnings.catch_warnings():
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\utils\py_utils.py:478, in <listcomp>(.0)
466 mapped = [
467 map_nested(
468 function=function,
(...)
474 for obj in iterable
475 ]
476 elif num_proc != -1 and num_proc <= 1 or len(iterable) < parallel_min_length:
477 mapped = [
--> 478 _single_map_nested((function, obj, types, None, True, None))
479 for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
480 ]
481 else:
482 with warnings.catch_warnings():
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\utils\py_utils.py:370, in _single_map_nested(args)
368 # Singleton first to spare some computation
369 if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
--> 370 return function(data_struct)
372 # Reduce logging to keep things readable in multiprocessing with tqdm
373 if rank is not None and logging.get_verbosity() < logging.WARNING:
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\builder.py:1274, in DatasetBuilder._build_single_dataset(self, split, run_post_process, verification_mode, in_memory)
1271 split = Split(split)
1273 # Build base dataset
-> 1274 ds = self._as_dataset(
1275 split=split,
1276 in_memory=in_memory,
1277 )
1278 if run_post_process:
1279 for resource_file_name in self._post_processing_resources(split).values():
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\builder.py:1348, in DatasetBuilder._as_dataset(self, split, in_memory)
1346 if self._check_legacy_cache():
1347 dataset_name = self.name
-> 1348 dataset_kwargs = ArrowReader(cache_dir, self.info).read(
1349 name=dataset_name,
1350 instructions=split,
1351 split_infos=self.info.splits.values(),
1352 in_memory=in_memory,
1353 )
1354 fingerprint = self._get_dataset_fingerprint(split)
1355 return Dataset(fingerprint=fingerprint, **dataset_kwargs)
File ~\anaconda3\envs\nlp-transformers\lib\site-packages\datasets\arrow_reader.py:254, in BaseReader.read(self, name, instructions, split_infos, in_memory)
252 if not files:
253 msg = f'Instruction "{instructions}" corresponds to no data!'
--> 254 raise ValueError(msg)
255 return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)
**ValueError: Instruction "validation" corresponds to no data!**
Looks like the data is not being loaded. Any advice would be appreciated. Thanks!
Steps to reproduce the bug
Run all cells of Chapter 6 notebook.
Expected behavior
Data should load correctly without any errors.
Environment info
datasetsversion: 2.17.0- Platform: Windows-10-10.0.19045-SP0
- Python version: 3.9.18
huggingface_hubversion: 0.20.3- PyArrow version: 15.0.0
- Pandas version: 2.2.0
fsspecversion: 2023.10.0
Here's a snippet that works with the current datasets:
dataset = load_dataset("cnn_dailymail", "3.0.0")
(Drop the version= part.)