libsvmdata
FIX - bug download SUSY dataset
closes #33
There seems to be an error in handling the .xz file that prevents this from working (see also https://github.com/mathurinm/libsvmdata/issues/33#issuecomment-1159092937):
In [3]: libsvmdata.fetch_libsvm("SUSY", verbose=True)
Dataset: SUSY
Downloading data from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/SUSY.xz (585.8 MB)
file_sizes: 100%|████████████████████████████| 614M/614M [01:30<00:00, 6.82MB/s]
Successfully downloaded file to /home/mathurin/data/libsvm/binary/SUSY.xz
Loading svmlight file...
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [3], line 1
----> 1 libsvmdata.fetch_libsvm("SUSY", verbose=True)
File ~/workspace/libsvmdata/libsvmdata/datasets.py:440, in fetch_libsvm(dataset, replace, normalize, min_nnz, verbose)
438 if verbose:
439 print("Dataset: %s" % dataset)
--> 440 X, y = _get_X_y(dataset, multilabel, replace=replace, verbose=verbose)
442 # removing columns with to few non zero entries when using sparse X
443 if sparse.issparse(X) and min_nnz != 0:
File ~/workspace/libsvmdata/libsvmdata/datasets.py:353, in _get_X_y(dataset, multilabel, replace, verbose)
351 print("Loading svmlight file...")
352 with open(tmp_path, 'rb') as f:
--> 353 X, y = load_svmlight_file(
354 f, n_features=n_features_total, multilabel=multilabel)
356 tmp_path.unlink()
357 # if X's density is more than 0.5, store it in dense format:
File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:172, in load_svmlight_file(f, n_features, dtype, multilabel, zero_based, query_id, offset, length)
45 def load_svmlight_file(
46 f,
47 *,
(...)
54 length=-1,
55 ):
56 """Load datasets in the svmlight / libsvm format into sparse CSR matrix.
57
58 This format is a text-based format, with one sample per line. It does
(...)
169 X, y = get_data()
170 """
171 return tuple(
--> 172 load_svmlight_files(
173 [f],
174 n_features=n_features,
175 dtype=dtype,
176 multilabel=multilabel,
177 zero_based=zero_based,
178 query_id=query_id,
179 offset=offset,
180 length=length,
181 )
182 )
File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:334, in load_svmlight_files(files, n_features, dtype, multilabel, zero_based, query_id, offset, length)
331 if (offset != 0 or length > 0) and n_features is None:
332 raise ValueError("n_features is required when offset or length is specified.")
--> 334 r = [
335 _open_and_load(
336 f,
337 dtype,
338 multilabel,
339 bool(zero_based),
340 bool(query_id),
341 offset=offset,
342 length=length,
343 )
344 for f in files
345 ]
347 if (
348 zero_based is False
349 or zero_based == "auto"
350 and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r)
351 ):
352 for _, indices, _, _, _ in r:
File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:335, in <listcomp>(.0)
331 if (offset != 0 or length > 0) and n_features is None:
332 raise ValueError("n_features is required when offset or length is specified.")
334 r = [
--> 335 _open_and_load(
336 f,
337 dtype,
338 multilabel,
339 bool(zero_based),
340 bool(query_id),
341 offset=offset,
342 length=length,
343 )
344 for f in files
345 ]
347 if (
348 zero_based is False
349 or zero_based == "auto"
350 and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r)
351 ):
352 for _, indices, _, _, _ in r:
File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:208, in _open_and_load(f, dtype, multilabel, zero_based, query_id, offset, length)
206 def _open_and_load(f, dtype, multilabel, zero_based, query_id, offset=0, length=-1):
207 if hasattr(f, "read"):
--> 208 actual_dtype, data, ind, indptr, labels, query = _load_svmlight_file(
209 f, dtype, multilabel, zero_based, query_id, offset, length
210 )
211 else:
212 with closing(_gen_open(f)) as f:
File sklearn/datasets/_svmlight_format_fast.pyx:76, in sklearn.datasets._svmlight_format_fast._load_svmlight_file()
ValueError: could not convert string to float: b'\xfd7zXZ\x00\x00\x04\xe6\xd6\xb4F\x02\x00!\x01\x16\x00\x00\x00t/\xe5\xa3\xe2\xd4\xb6\xef\xfe]\x00\x18\x08\x02\x88]S\x0e\\\x92\xf1B\xf1\x89\x1c\x18\xc3k^\x85\xe5\x91y\xd4\xdfK"\xe4\xfd\xe6\x16\xf1D\xc1e\xf5>\x80U\xd6\xfe\x18\x96,P\xb2\x96\xe3U\xc2\xc2\xdd\x17)\xdbm\xc3N\xa4\x1eC\xb8^\xcc~\xde\xef\xa7\x11Z\xc9\x81\xb8\xa6u\xddw\xb0\x8d\xdc;\xcb\xbdq~\x8d|C\x9f\xb6'
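For context on what goes wrong: the bytes in the ValueError start with `\xfd7zXZ`, which is the xz magic number, so the still-compressed SUSY.xz archive is being handed straight to load_svmlight_file. Below is a minimal sketch of the kind of fix, assuming the standard-library lzma module and a hypothetical local path; this is for illustration, not the exact code in datasets.py:

```python
import lzma
import shutil
from pathlib import Path

from sklearn.datasets import load_svmlight_file

# Hypothetical location, for illustration only.
archive_path = Path.home() / "data" / "libsvm" / "binary" / "SUSY.xz"
tmp_path = archive_path.with_suffix("")  # e.g. .../SUSY

# Decompress the xz archive before parsing; passing the raw .xz bytes to
# load_svmlight_file is what raises the "could not convert string to float"
# error on the xz magic number above.
with lzma.open(archive_path, "rb") as f_in, open(tmp_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

with open(tmp_path, "rb") as f:
    X, y = load_svmlight_file(f)
```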
I looked up the SUSY dataset but can't see the error
You may have a cached version already; try forcing the download or deleting the file to reproduce
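To reproduce, something along these lines should force a fresh download; the `replace` argument appears in the fetch_libsvm signature shown in the traceback above, and treating it as a re-download flag is an assumption here:

```python
import libsvmdata

# Force a fresh download instead of reusing a cached SUSY.xz
# (assumes replace=True overwrites the cached file).
X, y = libsvmdata.fetch_libsvm("SUSY", replace=True, verbose=True)
```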