LAVIS
LAVIS copied to clipboard
Unable to download SNLI_VE and NLVR dataset
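Hi. I am trying to download and load the SNLI_VE dataset using load_dataset("snli_ve"), but I get the following error. The annotation URL in the dataset config appears to be incorrect: it is a local filesystem path rather than a downloadable URL. The same problem occurs for the NLVR dataset. Could you fix this? Thank you!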
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ <stdin>:1 in <module> │
│ │
│ ./LAVIS/lavis/datasets/builders/__init__.py:105 in load_dataset │
│ │
│ 102 │ │ │
│ 103 │ │ builder.config.build_info.get(data_type).storage = vis_path │
│ 104 │ │
│ ❱ 105 │ dataset = builder.build_datasets() │
│ 106 │ return dataset │
│ 107 │
│ 108 │
│ │
│ ./LAVIS/lavis/datasets/builders/base_dataset_builder.py:51 in build_datasets │
│ │
│ 48 │ │ # only called on 1 GPU/TPU in distributed │
│ 49 │ │ │
│ 50 │ │ if is_main_process(): │
│ ❱ 51 │ │ │ self._download_data() │
│ 52 │ │ │
│ 53 │ │ if is_dist_avail_and_initialized(): │
│ 54 │ │ │ dist.barrier() │
│ │
│ ./LAVIS/lavis/datasets/builders/base_dataset_builder.py:98 in _download_data │
│ │
│ 95 │ │ return utils.get_abs_path(cls.DATASET_CONFIG_DICT[type]) │
│ 96 │ │
│ 97 │ def _download_data(self): │
│ ❱ 98 │ │ self._download_ann() │
│ 99 │ │ self._download_vis() │
│ 100 │ │
│ 101 │ def _download_ann(self): │
│ │
│ ./LAVIS/lavis/datasets/builders/base_dataset_builder.py:156 in _download_ann │
│ │
│ 153 │ │ │ │ │ else: │
│ 154 │ │ │ │ │ │ filename = os.path.basename(storage_path) │
│ 155 │ │ │ │ │ │
│ ❱ 156 │ │ │ │ │ breakpoint() │
│ 157 │ │ │ │ │ download_url(url=url_or_filename, root=dirname, filename=filename) │
│ 158 │ │
│ 159 │ def _download_vis(self): │
│ │
│ /home/user/miniconda3/envs/lavis/lib/python3.8/site-packages/torchvision/datasets/utils.py:134 in │
│ download_url │
│ │
│ 131 │ │ _download_file_from_remote_location(fpath, url) │
│ 132 │ else: │
│ 133 │ │ # expand redirect chain if needed │
│ ❱ 134 │ │ url = _get_redirect_url(url, max_hops=max_redirect_hops) │
│ 135 │ │ │
│ 136 │ │ # check if file is located on Google Drive │
│ 137 │ │ file_id = _get_google_drive_file_id(url) │
│ │
│ /home/user/miniconda3/envs/lavis/lib/python3.8/site-packages/torchvision/datasets/utils.py:82 in │
│ _get_redirect_url │
│ │
│ 79 │ headers = {"Method": "HEAD", "User-Agent": USER_AGENT} │
│ 80 │ │
│ 81 │ for _ in range(max_hops + 1): │
│ ❱ 82 │ │ with urllib.request.urlopen(urllib.request.Request(url, headers=headers)) as res │
│ 83 │ │ │ if response.url == url or response.url is None: │
│ 84 │ │ │ │ return url │
│ 85 │
│ │
│ /home/user/miniconda3/envs/lavis/lib/python3.8/urllib/request.py:328 in __init__ │
│ │
│ 325 │ def __init__(self, url, data=None, headers={}, │
│ 326 │ │ │ │ origin_req_host=None, unverifiable=False, │
│ 327 │ │ │ │ method=None): │
│ ❱ 328 │ │ self.full_url = url │
│ 329 │ │ self.headers = {} │
│ 330 │ │ self.unredirected_hdrs = {} │
│ 331 │ │ self._data = None │
│ │
│ /home/user/miniconda3/envs/lavis/lib/python3.8/urllib/request.py:354 in full_url │
│ │
│ 351 │ │ # unwrap('<URL:type://host/path>') --> 'type://host/path' │
│ 352 │ │ self._full_url = unwrap(url) │
│ 353 │ │ self._full_url, self.fragment = _splittag(self._full_url) │
│ ❱ 354 │ │ self._parse() │
│ 355 │ │
│ 356 │ @full_url.deleter │
│ 357 │ def full_url(self): │
│ │
│ /home/user/miniconda3/envs/lavis/lib/python3.8/urllib/request.py:383 in _parse │
│ │
│ 380 │ def _parse(self): │
│ 381 │ │ self.type, rest = _splittype(self._full_url) │
│ 382 │ │ if self.type is None: │
│ ❱ 383 │ │ │ raise ValueError("unknown url type: %r" % self.full_url) │
│ 384 │ │ self.host, self.selector = _splithost(rest) │
│ 385 │ │ if self.host: │
│ 386 │ │ │ self.host = unquote(self.host) │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ValueError: unknown url type: '/export/share/dongxuli/data/lavis/snli/annotation/ve_train.json'
You can go to the salesforce/ALBEF repo, which links to the GitHub repo for the snli_ve dataset. Follow the instructions in the snli_ve repo to generate the snli_ve train/dev/test JSON files. After that, change the 'url' arguments in lavis/configs/datasets/snli_ve/defaults.json to the paths of the snli_ve files you generated. Hope this helps.