textvec
textvec copied to clipboard
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Code:
...
# Count missing values in the 'title' column before vectorizing.
train['title'].isnull().sum()
# Out: 0
# Build raw counts of 1- to 3-grams over the titles (vocabulary capped at 300000 features).
title_countvec = CountVectorizer(ngram_range=(1,3), max_features=300000, lowercase=True)
title_countvec.fit(train['title'], y_train)
train_title_countvec = title_countvec.transform(train['title'])
# Fit the TF-ICF weighting on the count matrix together with the labels.
title_vectorizer = TfIcfVectorizer(norm='l2', sublinear_tf=True)
title_vectorizer.fit(train_title_countvec, y_train)
# NOTE(review): this repeats the transform a few lines up with the same input — looks redundant; confirm.
train_title_countvec = title_countvec.transform(train['title'])
# Check the stored values of the count matrix: all finite, none infinite,
# i.e. the input handed to title_vectorizer.transform is clean.
np.isfinite(train_title_countvec.data).all(), np.isinf(train_title_countvec.data).any()
# Out: (True, False)
train_transformed['title'] = title_vectorizer.transform(train_title_countvec)
# Error
Traceback:
ValueError Traceback (most recent call last)
<ipython-input-72-5fdf9718ecba> in <module>()
----> 1 train_transformed['title'] = title_vectorizer.transform(train_title_countvec)
/usr/local/lib/python3.5/dist-packages/textvec/vectorizers.py in transform(self, X, min_freq)
45 X = X * sp.spdiags(self.k, 0, f, f)
46 if self.norm:
---> 47 X = normalize(X, self.norm)
48 return X
49
~/.local/lib/python3.5/site-packages/sklearn/preprocessing/data.py in normalize(X, norm, axis, copy, return_norm)
1410
1411 X = check_array(X, sparse_format, copy=copy,
-> 1412 estimator='the normalize function', dtype=FLOAT_DTYPES)
1413 if axis == 0:
1414 X = X.T
~/.local/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
429 if sp.issparse(array):
430 array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
--> 431 force_all_finite)
432 else:
433 array = np.array(array, dtype=dtype, order=order, copy=copy)
~/.local/lib/python3.5/site-packages/sklearn/utils/validation.py in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite)
304 % spmatrix.format)
305 else:
--> 306 _assert_all_finite(spmatrix.data)
307 return spmatrix
308
~/.local/lib/python3.5/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
42 and not np.isfinite(X).all()):
43 raise ValueError("Input contains NaN, infinity"
---> 44 " or a value too large for %r." % X.dtype)
45
46
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Same issue
Is it OK to fix it like this?
from textvec.vectorizers import TfIcfVectorizer
class MyTficf(TfIcfVectorizer):
    """TfIcfVectorizer subclass whose ``fit`` guards against division by zero.

    Proposed workaround for the "Input contains NaN, infinity ..." error:
    zero entries of ``corpus_occurence`` are replaced with 1 before the
    per-feature weights ``k`` are computed, so the division cannot yield
    infinity.
    """

    def fit(self, X, y):
        """Fit the per-feature weights ``k`` from ``X`` and the labels ``y``.

        Args:
            X: 2-D matrix; only its ``shape`` is read in the visible code.
            y: Labels. Not referenced in the visible code — presumably used
                by the elided section; confirm against the parent class.

        Returns:
            self, so that calls can be chained.
        """
        n_samples, n_features = X.shape
        # Elided in the original post. This section must set
        # self.corpus_occurence and self.number_of_classes before the lines
        # below run — presumably copied from TfIcfVectorizer.fit; TODO confirm.
        ...
        # Replace zero counts with 1 so the division below never produces inf.
        self.corpus_occurence = np.where(self.corpus_occurence == 0, 1, self.corpus_occurence)
        self.k = np.log2(1 + (self.number_of_classes / self.corpus_occurence ))
        self._n_features = n_features
        return self
Was anyone able to find a solution for this?
So... does that fix it?