libsvmdata FIX - bug download SUSY dataset

closes #33

Jun 17 '22 16:06 Badr-MOUFAD

There seems to be an error in the .xz file that prevents this from working (see also https://github.com/mathurinm/libsvmdata/issues/33#issuecomment-1159092937)

In [3]: libsvmdata.fetch_libsvm("SUSY", verbose=True)
Dataset: SUSY
Downloading data from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/SUSY.xz (585.8 MB)

file_sizes: 100%|████████████████████████████| 614M/614M [01:30<00:00, 6.82MB/s]
Successfully downloaded file to /home/mathurin/data/libsvm/binary/SUSY.xz
Loading svmlight file...
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [3], line 1
----> 1 libsvmdata.fetch_libsvm("SUSY", verbose=True)

File ~/workspace/libsvmdata/libsvmdata/datasets.py:440, in fetch_libsvm(dataset, replace, normalize, min_nnz, verbose)
    438 if verbose:
    439     print("Dataset: %s" % dataset)
--> 440 X, y = _get_X_y(dataset, multilabel, replace=replace, verbose=verbose)
    442 # removing columns with to few non zero entries when using sparse X
    443 if sparse.issparse(X) and min_nnz != 0:

File ~/workspace/libsvmdata/libsvmdata/datasets.py:353, in _get_X_y(dataset, multilabel, replace, verbose)
    351     print("Loading svmlight file...")
    352 with open(tmp_path, 'rb') as f:
--> 353     X, y = load_svmlight_file(
    354         f, n_features=n_features_total, multilabel=multilabel)
    356 tmp_path.unlink()
    357 # if X's density is more than 0.5, store it in dense format:

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:172, in load_svmlight_file(f, n_features, dtype, multilabel, zero_based, query_id, offset, length)
     45 def load_svmlight_file(
     46     f,
     47     *,
   (...)
     54     length=-1,
     55 ):
     56     """Load datasets in the svmlight / libsvm format into sparse CSR matrix.
     57 
     58     This format is a text-based format, with one sample per line. It does
   (...)
    169         X, y = get_data()
    170     """
    171     return tuple(
--> 172         load_svmlight_files(
    173             [f],
    174             n_features=n_features,
    175             dtype=dtype,
    176             multilabel=multilabel,
    177             zero_based=zero_based,
    178             query_id=query_id,
    179             offset=offset,
    180             length=length,
    181         )
    182     )

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:334, in load_svmlight_files(files, n_features, dtype, multilabel, zero_based, query_id, offset, length)
    331 if (offset != 0 or length > 0) and n_features is None:
    332     raise ValueError("n_features is required when offset or length is specified.")
--> 334 r = [
    335     _open_and_load(
    336         f,
    337         dtype,
    338         multilabel,
    339         bool(zero_based),
    340         bool(query_id),
    341         offset=offset,
    342         length=length,
    343     )
    344     for f in files
    345 ]
    347 if (
    348     zero_based is False
    349     or zero_based == "auto"
    350     and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r)
    351 ):
    352     for _, indices, _, _, _ in r:

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:335, in <listcomp>(.0)
    331 if (offset != 0 or length > 0) and n_features is None:
    332     raise ValueError("n_features is required when offset or length is specified.")
    334 r = [
--> 335     _open_and_load(
    336         f,
    337         dtype,
    338         multilabel,
    339         bool(zero_based),
    340         bool(query_id),
    341         offset=offset,
    342         length=length,
    343     )
    344     for f in files
    345 ]
    347 if (
    348     zero_based is False
    349     or zero_based == "auto"
    350     and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r)
    351 ):
    352     for _, indices, _, _, _ in r:

File ~/mambaforge/lib/python3.10/site-packages/sklearn/datasets/_svmlight_format_io.py:208, in _open_and_load(f, dtype, multilabel, zero_based, query_id, offset, length)
    206 def _open_and_load(f, dtype, multilabel, zero_based, query_id, offset=0, length=-1):
    207     if hasattr(f, "read"):
--> 208         actual_dtype, data, ind, indptr, labels, query = _load_svmlight_file(
    209             f, dtype, multilabel, zero_based, query_id, offset, length
    210         )
    211     else:
    212         with closing(_gen_open(f)) as f:

File sklearn/datasets/_svmlight_format_fast.pyx:76, in sklearn.datasets._svmlight_format_fast._load_svmlight_file()

ValueError: could not convert string to float: b'\xfd7zXZ\x00\x00\x04\xe6\xd6\xb4F\x02\x00!\x01\x16\x00\x00\x00t/\xe5\xa3\xe2\xd4\xb6\xef\xfe]\x00\x18\x08\x02\x88]S\x0e\\\x92\xf1B\xf1\x89\x1c\x18\xc3k^\x85\xe5\x91y\xd4\xdfK"\xe4\xfd\xe6\x16\xf1D\xc1e\xf5>\x80U\xd6\xfe\x18\x96,P\xb2\x96\xe3U\xc2\xc2\xdd\x17)\xdbm\xc3N\xa4\x1eC\xb8^\xcc~\xde\xef\xa7\x11Z\xc9\x81\xb8\xa6u\xddw\xb0\x8d\xdc;\xcb\xbdq~\x8d|C\x9f\xb6'

Feb 16 '23 13:02 mathurinm

I looked up the SUSY dataset but can't see the error

Feb 16 '23 19:02 Badr-MOUFAD

You may have a cached version already; try forcing the download or deleting the file to reproduce

Le jeu. 16 févr. 2023 à 20:26, Badr MOUFAD @.***> a écrit :

I looked up the SUSY dataset but can't see the error

— Reply to this email directly, view it on GitHub https://github.com/mathurinm/libsvmdata/pull/34#issuecomment-1433601893, or unsubscribe https://github.com/notifications/unsubscribe-auth/ACETTQTB4Z2TS7W4ECNMMXDWXZ5QDANCNFSM5ZC3M2GQ . You are receiving this because you commented.Message ID: @.***>

Feb 16 '23 19:02 mathurinm

libsvmdata libsvmdata copied to clipboard

FIX - bug download SUSY dataset

libsvmdata
libsvmdata copied to clipboard