pytorch-transformer
Error: DatasetGenerationError: An error occurred while generating the dataset
What I have done:
- I cloned the project onto my MacBook Pro and, using Jupyter Notebook, ran the notebook https://github.com/hkproj/pytorch-transformer/blob/main/Local_Train.ipynb
How can I fix the code that produces the following error?
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:1873, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1866 writer = writer_class(
1867 features=writer._features,
1868 path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
(...)
1871 embed_local_files=embed_local_files,
1872 )
-> 1873 writer.write_table(table)
1874 num_examples_progress_update += len(table)
File ~/anaconda3/lib/python3.11/site-packages/datasets/arrow_writer.py:568, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
567 pa_table = pa_table.combine_chunks()
--> 568 pa_table = table_cast(pa_table, self._schema)
569 if self.embed_local_files:
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2290, in table_cast(table, schema)
2289 if table.schema != schema:
-> 2290 return cast_table_to_schema(table, schema)
2291 elif table.schema.metadata != schema.metadata:
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2249, in cast_table_to_schema(table, schema)
2248 raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2249 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
2250 return pa.Table.from_arrays(arrays, schema=schema)
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2249, in <listcomp>(.0)
2248 raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2249 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
2250 return pa.Table.from_arrays(arrays, schema=schema)
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:1817, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
1816 if isinstance(array, pa.ChunkedArray):
-> 1817 return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
1818 else:
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:1817, in <listcomp>(.0)
1816 if isinstance(array, pa.ChunkedArray):
-> 1817 return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
1818 else:
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2109, in cast_array_to_feature(array, feature, allow_number_to_str)
2108 elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2109 return array_cast(array, feature(), allow_number_to_str=allow_number_to_str)
2110 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:1819, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
1818 else:
-> 1819 return func(array, *args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:2000, in array_cast(array, pa_type, allow_number_to_str)
1999 return array.cast(pa_type)
-> 2000 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")
TypeError: Couldn't cast array of type
struct<ca: string, en: string>
to
struct<ca: string, de: string>
The above exception was the direct cause of the following exception:
DatasetGenerationError Traceback (most recent call last)
Cell In[1], line 9
5 cfg['num_epochs'] = 1
7 from train import train_model
----> 9 train_model(cfg)
File ~/git/github/pytorch-transformer/train.py:198, in train_model(config)
195 # Make sure the weights folder exists
196 Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)
--> 198 train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
199 model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
200 # Tensorboard
File ~/git/github/pytorch-transformer/train.py:143, in get_ds(config)
141 def get_ds(config):
142 # It only has the train split, so we divide it overselves
--> 143 ds_raw = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train')
145 # Build tokenizers
146 tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
File ~/anaconda3/lib/python3.11/site-packages/datasets/load.py:1797, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
1794 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
1796 # Download and prepare data
-> 1797 builder_instance.download_and_prepare(
1798 download_config=download_config,
1799 download_mode=download_mode,
1800 verification_mode=verification_mode,
1801 try_from_hf_gcs=try_from_hf_gcs,
1802 num_proc=num_proc,
1803 storage_options=storage_options,
1804 )
1806 # Build dataset for splits
1807 keep_in_memory = (
1808 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
1809 )
File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:890, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
888 if num_proc is not None:
889 prepare_split_kwargs["num_proc"] = num_proc
--> 890 self._download_and_prepare(
891 dl_manager=dl_manager,
892 verification_mode=verification_mode,
893 **prepare_split_kwargs,
894 **download_and_prepare_kwargs,
895 )
896 # Sync info
897 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:985, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
981 split_dict.add(split_generator.split_info)
983 try:
984 # Prepare split will record examples associated to the split
--> 985 self._prepare_split(split_generator, **prepare_split_kwargs)
986 except OSError as e:
987 raise OSError(
988 "Cannot find data file. "
989 + (self.manual_download_instructions or "")
990 + "\nOriginal error:\n"
991 + str(e)
992 ) from None
File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:1746, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
1744 job_id = 0
1745 with pbar:
-> 1746 for job_id, done, content in self._prepare_split_single(
1747 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
1748 ):
1749 if done:
1750 result = content
File ~/anaconda3/lib/python3.11/site-packages/datasets/builder.py:1891, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1889 if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
1890 e = e.__context__
-> 1891 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1893 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
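For reference, the failure originates in the `load_dataset` call inside `get_ds()` (train.py, line 143 of the traceback). Below is a minimal sketch that reproduces it in a single notebook cell, assuming the repo's default `opus_books` datasource and an `en`/`it` language pair (substitute your own `lang_src`/`lang_tgt`):

```python
# Minimal sketch of the failing call, isolated from train.py's get_ds().
# Assumed defaults: datasource='opus_books', lang_src='en', lang_tgt='it'.
from datasets import get_dataset_config_names, load_dataset

# On affected datasets versions this reports only ['ca-de'] instead of all pairs.
print(get_dataset_config_names("opus_books"))

# On affected versions this raises DatasetGenerationError while trying to cast
# each language pair's files to the single visible 'ca-de' schema (see the
# "Couldn't cast struct<ca, en> to struct<ca, de>" error above).
ds_raw = load_dataset("opus_books", "en-it", split="train")
```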
My `!pip list` output in Jupyter Notebook:
```
Package                       Version
----------------------------- ------------
absl-py 2.0.0
accelerate 0.28.0
aiobotocore 2.5.0
aiofiles 22.1.0
aiohttp 3.8.5
aioitertools 0.7.1
aiosignal 1.2.0
aiosqlite 0.18.0
alabaster 0.7.12
alembic 1.13.1
anaconda-anon-usage 0.4.2
anaconda-catalogs 0.2.0
anaconda-client 1.12.1
anaconda-cloud-auth 0.1.3
anaconda-navigator 2.5.0
anaconda-project 0.11.1
anyio 3.5.0
appdirs 1.4.4
applaunchservices 0.3.0
appnope 0.1.2
appscript 1.1.2
argon2-cffi 21.3.0
argon2-cffi-bindings 21.2.0
arrow 1.2.3
astroid 2.14.2
astropy 5.1
asttokens 2.0.5
astunparse 1.6.3
async-timeout 4.0.2
atomicwrites 1.4.0
attrs 22.1.0
audioread 3.0.1
Automat 20.2.0
autopep8 1.6.0
Babel 2.11.0
backcall 0.2.0
backports.functools-lru-cache 1.6.4
backports.tempfile 1.0
backports.weakref 1.0.post1
bcrypt 3.2.0
beautifulsoup4 4.12.2
binaryornot 0.4.4
black 0.0
bleach 4.1.0
bokeh 3.2.1
boltons 23.0.0
botocore 1.29.76
Bottleneck 1.3.5
brotlipy 0.7.0
cachetools 5.3.1
certifi 2023.7.22
cffi 1.15.1
chardet 4.0.0
charset-normalizer 2.0.4
click 8.0.4
cloudpickle 2.2.1
clyent 1.2.2
colorama 0.4.6
colorcet 3.0.1
colorlog 6.8.2
comm 0.1.2
conda 23.7.4
conda-build 3.26.1
conda-content-trust 0.2.0
conda_index 0.3.0
conda-libmamba-solver 23.9.1
conda-pack 0.6.0
conda-package-handling 2.2.0
conda_package_streaming 0.9.0
conda-repo-cli 1.0.75
conda-token 0.4.0
conda-verify 3.4.2
constantly 15.1.0
contourpy 1.0.5
cookiecutter 1.7.3
cryptography 41.0.3
cssselect 1.1.0
cycler 0.11.0
cytoolz 0.12.0
dask 2023.6.0
datasets 2.12.0
datashader 0.15.2
datashape 0.5.4
debugpy 1.6.7
decorator 5.1.1
defusedxml 0.7.1
diff-match-patch 20200713
diffuser 0.0.1
diffusers 0.27.0
dill 0.3.6
distributed 2023.6.0
docstring-to-markdown 0.11
docutils 0.18.1
entrypoints 0.4
et-xmlfile 1.1.0
executing 0.8.3
fastjsonschema 2.16.2
filelock 3.9.0
filterpy 1.4.5
flake8 6.0.0
Flask 2.2.2
flatbuffers 23.5.26
fonttools 4.25.0
frozenlist 1.3.3
fsspec 2023.5.0
future 0.18.3
gast 0.5.4
gensim 4.3.0
glob2 0.7
gmpy2 2.1.2
google-auth 2.23.3
google-auth-oauthlib 1.0.0
google-pasta 0.2.0
graphviz 0.20.3
greenlet 2.0.1
grpcio 1.59.0
h5py 3.9.0
HeapDict 1.0.1
holoviews 1.17.1
huggingface-hub 0.21.4
hvplot 0.8.4
hyperlink 21.0.0
idna 3.4
imagecodecs 2023.1.23
imageio 2.31.1
imagesize 1.4.1
imbalanced-learn 0.10.1
importlib-metadata 6.0.0
incremental 21.3.0
inflection 0.5.1
iniconfig 1.1.1
intake 0.6.8
intervaltree 3.1.0
ipykernel 6.25.0
ipython 8.15.0
ipython-genutils 0.2.0
ipywidgets 8.0.4
isort 5.9.3
itemadapter 0.3.0
itemloaders 1.0.4
itsdangerous 2.0.1
jaraco.classes 3.2.1
jedi 0.18.1
jellyfish 1.0.1
Jinja2 3.1.2
jinja2-time 0.2.0
jmespath 0.10.0
joblib 1.2.0
json5 0.9.6
jsonpatch 1.32
jsonpointer 2.1
jsonschema 4.17.3
jupyter 1.0.0
jupyter_client 7.4.9
jupyter-console 6.6.3
jupyter_core 5.3.0
jupyter-events 0.6.3
jupyter-server 1.23.4
jupyter_server_fileid 0.9.0
jupyter_server_ydoc 0.8.0
jupyter-ydoc 0.2.4
jupyterlab 3.6.3
jupyterlab-pygments 0.1.2
jupyterlab_server 2.22.0
jupyterlab-widgets 3.0.5
kaggle 1.5.16
kaleido 0.2.1
keras 2.14.0
keras-tuner 1.4.6
keyring 23.13.1
kiwisolver 1.4.4
kt-legacy 1.0.5
lazy_loader 0.2
lazy-object-proxy 1.6.0
libarchive-c 2.9
libclang 16.0.6
libmambapy 1.5.1
librosa 0.10.1
lightning-utilities 0.11.2
linkify-it-py 2.0.0
llvmlite 0.40.0
lmdb 1.4.1
locket 1.0.0
lxml 4.9.3
lz4 4.3.2
Mako 1.3.5
Markdown 3.4.1
markdown-it-py 2.2.0
MarkupSafe 2.1.1
matplotlib 3.7.2
matplotlib-inline 0.1.6
mccabe 0.7.0
mdit-py-plugins 0.3.0
mdurl 0.1.0
mistune 0.8.4
ml-dtypes 0.2.0
more-itertools 8.12.0
mpmath 1.3.0
msgpack 1.0.3
multidict 6.0.2
multipledispatch 0.6.0
multiprocess 0.70.14
munkres 1.1.4
mypy-extensions 1.0.0
navigator-updater 0.4.0
nbclassic 0.5.5
nbclient 0.5.13
nbconvert 6.5.4
nbformat 5.9.2
nest-asyncio 1.5.6
networkx 3.1
nltk 3.8.1
notebook 6.5.4
notebook_shim 0.2.2
numba 0.57.0
numexpr 2.8.4
numpy 1.24.3
numpydoc 1.5.0
oauthlib 3.2.2
openpyxl 3.0.10
opt-einsum 3.3.0
optuna 3.6.1
packaging 23.1
pandas 2.0.3
pandocfilters 1.5.0
panel 1.2.3
param 1.13.0
parsel 1.6.0
parso 0.8.3
partd 1.4.0
pathlib 1.0.1
pathspec 0.10.3
patsy 0.5.3
pep8 1.7.1
pexpect 4.8.0
pickleshare 0.7.5
Pillow 9.4.0
pip 23.2.1
pkce 1.0.3
pkginfo 1.9.6
platformdirs 3.10.0
plotly 5.9.0
pluggy 1.0.0
ply 3.11
pooch 1.8.0
portalocker 2.8.2
poyo 0.5.0
prometheus-client 0.14.1
prompt-toolkit 3.0.36
Protego 0.1.16
protobuf 4.24.4
psutil 5.9.0
ptyprocess 0.7.0
pure-eval 0.2.2
py-cpuinfo 8.0.0
pyarrow 11.0.0
pyasn1 0.4.8
pyasn1-modules 0.2.8
pycodestyle 2.10.0
pycosat 0.6.4
pycparser 2.21
pyct 0.5.0
pycurl 7.45.2
pydantic 1.10.8
PyDispatcher 2.0.5
pydocstyle 6.3.0
pyerfa 2.0.0
pyflakes 3.0.1
Pygments 2.15.1
PyJWT 2.4.0
pylint 2.16.2
pylint-venv 2.3.0
pyls-spyder 0.4.0
pyobjc-core 9.0
pyobjc-framework-Cocoa 9.0
pyobjc-framework-CoreServices 9.0
pyobjc-framework-FSEvents 9.0
pyodbc 4.0.34
pyOpenSSL 23.2.0
pyparsing 3.0.9
PyQt5-sip 12.11.0
pyrsistent 0.18.0
PySocks 1.7.1
pytest 7.4.0
python-dateutil 2.8.2
python-dotenv 0.21.0
python-json-logger 2.0.7
python-lsp-black 1.2.1
python-lsp-jsonrpc 1.0.0
python-lsp-server 1.7.2
python-slugify 5.0.2
python-snappy 0.6.1
pytoolconfig 1.2.5
pytz 2023.3.post1
pyviz-comms 2.3.0
PyWavelets 1.4.1
PyYAML 6.0
pyzmq 23.2.0
QDarkStyle 3.0.2
qstylizer 0.2.2
QtAwesome 1.2.2
qtconsole 5.4.2
QtPy 2.2.0
queuelib 1.5.0
regex 2022.7.9
requests 2.31.0
requests-file 1.5.1
requests-oauthlib 1.3.1
requests-toolbelt 1.0.0
resampy 0.4.2
responses 0.13.3
rfc3339-validator 0.1.4
rfc3986-validator 0.1.1
rope 1.7.0
rsa 4.9
Rtree 1.0.1
ruamel.yaml 0.17.21
ruamel-yaml-conda 0.17.21
s3fs 2023.4.0
safetensors 0.3.2
scikit-image 0.20.0
scikit-learn 1.3.0
scipy 1.11.1
Scrapy 2.8.0
seaborn 0.12.2
Send2Trash 1.8.0
service-identity 18.1.0
setuptools 68.0.0
sip 6.6.2
six 1.16.0
smart-open 5.2.1
sniffio 1.2.0
snowballstemmer 2.2.0
sortedcontainers 2.4.0
soundfile 0.12.1
soupsieve 2.4
soxr 0.3.7
Sphinx 5.0.2
sphinxcontrib-applehelp 1.0.2
sphinxcontrib-devhelp 1.0.2
sphinxcontrib-htmlhelp 2.0.0
sphinxcontrib-jsmath 1.0.1
sphinxcontrib-qthelp 1.0.3
sphinxcontrib-serializinghtml 1.1.5
spyder 5.4.3
spyder-kernels 2.4.4
SQLAlchemy 1.4.39
stack-data 0.2.0
statsmodels 0.14.0
sympy 1.11.1
tables 3.8.0
tabulate 0.8.10
tblib 1.7.0
tenacity 8.2.2
tensorboard 2.14.1
tensorboard-data-server 0.7.1
tensorflow 2.14.0
tensorflow-estimator 2.14.0
tensorflow-io 0.34.0
tensorflow-io-gcs-filesystem 0.33.0
termcolor 2.3.0
terminado 0.17.1
text-unidecode 1.3
textdistance 4.2.1
threadpoolctl 2.2.0
three-merge 0.1.1
tifffile 2023.4.12
tinycss2 1.2.1
tldextract 3.2.0
tokenizers 0.13.2
toml 0.10.2
tomlkit 0.11.1
toolz 0.12.0
torch 2.2.2
torchdata 0.7.1
torchmetrics 1.4.0.post0
torchtext 0.17.2
torchvision 0.17.2
torchviz 0.0.2
tornado 6.3.2
tqdm 4.65.0
traitlets 5.7.1
transformers 4.32.1
Twisted 22.10.0
typing_extensions 4.10.0
tzdata 2023.3
uc-micro-py 1.0.1
ujson 5.4.0
Unidecode 1.2.0
urllib3 1.26.16
w3lib 1.21.0
watchdog 2.1.6
wcwidth 0.2.5
webencodings 0.5.1
websocket-client 0.58.0
Werkzeug 2.2.3
whatthepatch 1.0.2
wheel 0.38.4
widgetsnbextension 4.0.5
wrapt 1.14.1
wurlitzer 3.0.2
xarray 2023.6.0
xlwings 0.29.1
xxhash 2.0.2
xyzservices 2022.9.0
y-py 0.5.9
yapf 0.31.0
yarl 1.8.1
ypy-websocket 0.8.2
zict 2.2.0
zipp 3.11.0
zope.interface 5.4.0
zstandard 0.19.0
```
Hi, if you run
get_dataset_config_names("opus_books")
and get only
['ca-de']
then this may be caused by an old version of the datasets package.
For me the code fails with datasets 2.13.2 but works with datasets 3.2.0, so consider upgrading your datasets package; from your pip list you have datasets 2.12.0.
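After upgrading (for example with `pip install -U datasets` in a terminal, or `!pip install -U datasets` in a notebook cell, then restarting the kernel), here is a quick sanity check, using `en-it` as a placeholder pair:

```python
# Verify that the full set of opus_books language pairs is visible again.
from datasets import get_dataset_config_names, load_dataset

configs = get_dataset_config_names("opus_books")
print(configs)  # should now list many pairs, e.g. 'ca-en', 'de-en', 'en-it', ...

# Load one pair to confirm dataset generation succeeds; swap in your own pair.
ds_raw = load_dataset("opus_books", "en-it", split="train")
print(ds_raw[0])  # expected shape: {'id': ..., 'translation': {'en': ..., 'it': ...}}
```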
You may also refer to this post, which is basically me asking the same question on the Hugging Face forum.
Hope this helps!