text
text copied to clipboard
IWSLT datasets are not properly unpacked from `tgz` file
🐛 Bug
IWSLT datasets are not properly unpacked from the downloaded `tgz` file when using `torchtext.datasets.IWSLT*`.
When I unpack the nested `tgz` files by hand, everything works as expected.
To Reproduce
E.g., for IWSLT2017, download the `.tgz` file from here and put it under `./data/datasets/IWSLT2017`:
import torchtext
a = iter(torchtext.datasets.IWSLT2017(root="./data", split="train"))
next(a)
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
Input In [9], in <cell line: 4>()
1 import torchtext
3 a = iter(torchtext.datasets.IWSLT2016(root="./data", split="train"))
----> 4 next(a)
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/grouping.py:41, in ShardingFilterIterDataPipe.__iter__(self)
40 def __iter__(self):
---> 41 for i, item in enumerate(self.source_datapipe):
42 if i % self.num_of_instances == self.instance_id:
43 yield item
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/combinatorics.py:122, in ShufflerIterDataPipe.__iter__(self)
120 def __iter__(self) -> Iterator[T_co]:
121 if not self._enabled:
--> 122 for x in self.datapipe:
123 yield x
124 else:
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/combining.py:513, in ZipperIterDataPipe.__iter__(self)
512 def __iter__(self) -> Iterator[Tuple[T_co]]:
--> 513 for data in zip(*self.datapipes):
514 yield data
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torchdata/datapipes/iter/util/plain_text_reader.py:121, in LineReaderIterDataPipe.__iter__(self)
120 def __iter__(self) -> Iterator[Union[Str_Or_Bytes, Tuple[str, Str_Or_Bytes]]]:
--> 121 for path, file in self.source_datapipe:
122 stream = self._helper.skip_lines(file)
123 stream = self._helper.strip_newline(stream)
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/fileopener.py:68, in FileOpenerIterDataPipe.__iter__(self)
67 def __iter__(self):
---> 68 yield from get_file_binaries_from_pathnames(self.datapipe, self.mode, self.encoding)
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/utils/common.py:86, in get_file_binaries_from_pathnames(pathnames, mode, encoding)
83 if mode in ('b', 't'):
84 mode = 'r' + mode
---> 86 for pathname in pathnames:
87 if not isinstance(pathname, str):
88 raise TypeError("Expected string type for pathname, but got {}"
89 .format(type(pathname)))
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/combining.py:51, in ConcaterIterDataPipe.__iter__(self)
49 def __iter__(self) -> Iterator:
50 for dp in self.datapipes:
---> 51 for data in dp:
52 yield data
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/filelister.py:58, in FileListerIterDataPipe.__iter__(self)
57 def __iter__(self) -> Iterator[str] :
---> 58 for path in self.datapipe:
59 yield from get_file_pathnames_from_root(path, self.masks, self.recursive, self.abspath, self.non_deterministic)
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/_typing.py:514, in hook_iterator.<locals>.wrap_generator(*args, **kwargs)
512 response = gen.send(None)
513 else:
--> 514 response = gen.send(None)
516 while True:
517 request = yield response
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/callable.py:116, in MapperIterDataPipe.__iter__(self)
114 def __iter__(self) -> Iterator[T_co]:
115 for data in self.datapipe:
--> 116 yield self._apply_fn(data)
File /user/conda/envs/storch/lib/python3.9/site-packages/torch/utils/data/datapipes/iter/callable.py:81, in MapperIterDataPipe._apply_fn(self, data)
79 def _apply_fn(self, data):
80 if self.input_col is None and self.output_col is None:
---> 81 return self.fn(data)
83 if self.input_col is None:
84 res = self.fn(data)
File /user/conda/envs/storch/lib/python3.9/site-packages/torchdata/datapipes/iter/util/cacheholder.py:300, in _wait_promise_fn(timeout, filename)
298 time.sleep(0.01)
299 if time.time() - start > timeout:
--> 300 raise Exception(
301 f"OnDiskCache Exception: {filename} expected to be written by different process, "
302 + f"but file is not ready in {timeout} seconds."
303 )
304 return filename
Exception: OnDiskCache Exception: ./data/datasets/IWSLT2016/2016-01/texts/de/en/de-en/train.de-en.de expected to be written by different process, but file is not ready in 300 seconds.
This exception is thrown by __iter__ of MapperIterDataPipe(datapipe=UnBatcherIterDataPipe, fn=functools.partial(<function _wait_promise_fn at 0x7ff9b89dfee0>, 300), input_col=None, output_col=None)
Expected behavior
The dataset is properly unpacked from the downloaded `tgz` file when using `torchtext.datasets.IWSLT*`.
Environment
Collecting environment information...
PyTorch version: 1.12.1
Is debug build: False
CUDA used to build PyTorch: 11.6
ROCM used to build PyTorch: N/A
OS: Ubuntu 22.04.1 LTS (x86_64)
GCC version: (Ubuntu 11.2.0-19ubuntu1) 11.2.0
Clang version: Could not collect
CMake version: Could not collect
Libc version: glibc-2.35
Python version: 3.9.12 (main, Jun 1 2022, 11:38:51) [GCC 7.5.0] (64-bit runtime)
Python platform: Linux-5.15.0-46-generic-x86_64-with-glibc2.35
Is CUDA available: True
CUDA runtime version: Could not collect
GPU models and configuration:
GPU 0: NVIDIA A100-SXM4-80GB
GPU 1: NVIDIA A100-SXM4-80GB
GPU 2: NVIDIA A100-SXM4-80GB
GPU 3: NVIDIA A100-SXM4-80GB
Nvidia driver version: 470.141.03
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
Versions of relevant libraries:
[pip3] mypy==0.971
[pip3] mypy-extensions==0.4.3
[pip3] numpy==1.22.4
[pip3] pytest-mypy==0.9.1
[pip3] pytorch-lightning==1.6.5
[pip3] torch==1.12.1
[pip3] torch-fidelity==0.3.0
[pip3] torchdata==0.4.1
[pip3] torchmetrics==0.9.3
[pip3] torchtext==0.13.1
[pip3] torchvision==0.13.1
[conda] blas 1.0 mkl conda-forge
[conda] cudatoolkit 11.6.0 hecad31d_10 conda-forge
[conda] mkl 2022.0.1 h06a4308_117
[conda] numpy 1.22.4 pypi_0 pypi
[conda] pytorch 1.12.1 py3.9_cuda11.6_cudnn8.3.2_0 pytorch
[conda] pytorch-lightning 1.6.5 pypi_0 pypi
[conda] pytorch-mutex 1.0 cuda pytorch
[conda] torch-fidelity 0.3.0 pypi_0 pypi
[conda] torchdata 0.4.1 pypi_0 pypi
[conda] torchmetrics 0.9.3 pypi_0 pypi
[conda] torchtext 0.13.1 pypi_0 pypi
[conda] torchvision 0.13.1 pypi_0 pypi
cc @ejguan and @VitalyFedyunin as this seems to be an issue with the CacheHolder datapipe from torchdata
Will take a look
Likely related to not yet landed race condition fix
Likely related to not yet landed race condition fix
@VitalyFedyunin Can you link the Issue/PR?
Yep, it is on my TODO list for the first half of the week.
@VitalyFedyunin is there any update on this?
Working on it right now. Want to make sure it is part of the next official branch cut. So it requires intensive testing.
As a mitigation, cleaning the cache folder and restarting should help.
https://github.com/pytorch/data/pull/652 should fix it.
This is fixed by https://github.com/pytorch/text/pull/1942 and https://github.com/pytorch/data/pull/810