skrub
skrub copied to clipboard
Deduplicate fails when there are not enough variations
Describe the bug
Using `deduplicate` on a list with too few unique values makes it fail in an uninformative way.
Steps/Code to Reproduce
from skrub import deduplicate
deduplicate(["black", "white", "black", "black", "blac"])
deduplicate(["black", "white", "black", "black"])
Expected Results
Either no error is thrown, or an informative error is raised explaining that there are too few unique values to deduplicate.
Actual Results
for the first example
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[19], line 3
1 from skrub import deduplicate
----> 3 deduplicate(["black", "white", "black", "black", "blac"])
File [~/VSCProjects/skrub/skrub/_deduplicate.py:248](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/VSCProjects/skrub/skrub/_deduplicate.py:248), in deduplicate(data, n_clusters, ngram_range, analyzer, method, n_jobs)
246 Z = linkage(distance_mat, method=method, optimal_ordering=True)
247 if n_clusters is None:
--> 248 n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
249 clusters = fcluster(Z, n_clusters, criterion="maxclust")
251 translation_table = _create_spelling_correction(unique_words, counts, clusters)
File [~/VSCProjects/skrub/skrub/_deduplicate.py:87](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/VSCProjects/skrub/skrub/_deduplicate.py:87), in _guess_clusters(Z, distance_mat, n_jobs)
85 # silhouette score needs a redundant distance matrix
86 redundant_dist = squareform(distance_mat)
---> 87 silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")(
88 delayed(_get_silhouette_avg)(Z, n_clust, redundant_dist)
89 for n_clust in n_clusters
90 )
91 return n_clusters[np.argmax(silhouette_scores)]
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:1085](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:1085), in Parallel.__call__(self, iterable)
1076 try:
1077 # Only set self._iterating to True if at least a batch
1078 # was dispatched. In particular this covers the edge
(...)
1082 # was very quick and its callback already dispatched all the
1083 # remaining jobs.
1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
1086 self._iterating = self._original_iterator is not None
1088 while self.dispatch_one_batch(iterator):
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:901](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:901), in Parallel.dispatch_one_batch(self, iterator)
899 return False
900 else:
--> 901 self._dispatch(tasks)
902 return True
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:819](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:819), in Parallel._dispatch(self, batch)
817 with self._lock:
818 job_idx = len(self._jobs)
--> 819 job = self._backend.apply_async(batch, callback=cb)
820 # A job can complete so quickly than its callback is
821 # called before we get here, causing self._jobs to
822 # grow. To ensure correct results ordering, .insert is
823 # used (rather than .append) in the following line
824 self._jobs.insert(job_idx, job)
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/_parallel_backends.py:208](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/_parallel_backends.py:208), in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/_parallel_backends.py:597](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/_parallel_backends.py:597), in ImmediateResult.__init__(self, batch)
594 def __init__(self, batch):
595 # Don't delay the application, to avoid keeping the input
596 # arguments in memory
--> 597 self.results = batch()
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:288](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:288), in BatchedCalls.__call__(self)
284 def __call__(self):
285 # Set the default nested backend to self._backend but do not set the
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:288](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/joblib/parallel.py:288), in (.0)
284 def __call__(self):
285 # Set the default nested backend to self._backend but do not set the
286 # change the default number of processes to -1
287 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288 return [func(*args, **kwargs)
289 for func, args, kwargs in self.items]
File [~/VSCProjects/skrub/skrub/_deduplicate.py:61](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/VSCProjects/skrub/skrub/_deduplicate.py:61), in _get_silhouette_avg(Z, n_clust, redundant_dist)
59 def _get_silhouette_avg(Z: NDArray, n_clust: int, redundant_dist: NDArray) -> float:
60 labels = fcluster(Z, n_clust, criterion="maxclust")
---> 61 silhouette_avg = silhouette_score(redundant_dist, labels, metric="precomputed")
62 return silhouette_avg
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/sklearn/metrics/cluster/_unsupervised.py:117](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/sklearn/metrics/cluster/_unsupervised.py:117), in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
115 else:
116 X, labels = X[indices], labels[indices]
--> 117 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/sklearn/metrics/cluster/_unsupervised.py:231](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/sklearn/metrics/cluster/_unsupervised.py:231), in silhouette_samples(X, labels, metric, **kwds)
229 n_samples = len(labels)
230 label_freqs = np.bincount(labels)
--> 231 check_number_of_labels(len(le.classes_), n_samples)
233 kwds["metric"] = metric
234 reduce_func = functools.partial(
235 _silhouette_reduce, labels=labels, label_freqs=label_freqs
236 )
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/sklearn/metrics/cluster/_unsupervised.py:33](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/sklearn/metrics/cluster/_unsupervised.py:33), in check_number_of_labels(n_labels, n_samples)
22 """Check that number of labels are valid.
23
24 Parameters
(...)
30 Number of samples.
31 """
32 if not 1 < n_labels < n_samples:
---> 33 raise ValueError(
34 "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
35 % n_labels
36 )
ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
for the second example
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[18], line 3
1 from skrub import deduplicate
----> 3 deduplicate(["black", "white", "black", "black"])
File [~/VSCProjects/skrub/skrub/_deduplicate.py:248](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/VSCProjects/skrub/skrub/_deduplicate.py:248), in deduplicate(data, n_clusters, ngram_range, analyzer, method, n_jobs)
246 Z = linkage(distance_mat, method=method, optimal_ordering=True)
247 if n_clusters is None:
--> 248 n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
249 clusters = fcluster(Z, n_clusters, criterion="maxclust")
251 translation_table = _create_spelling_correction(unique_words, counts, clusters)
File [~/VSCProjects/skrub/skrub/_deduplicate.py:91](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/VSCProjects/skrub/skrub/_deduplicate.py:91), in _guess_clusters(Z, distance_mat, n_jobs)
86 redundant_dist = squareform(distance_mat)
87 silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")(
88 delayed(_get_silhouette_avg)(Z, n_clust, redundant_dist)
89 for n_clust in n_clusters
90 )
---> 91 return n_clusters[np.argmax(silhouette_scores)]
File <__array_function__ internals>:200, in argmax(*args, **kwargs)
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/numpy/core/fromnumeric.py:1242](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/numpy/core/fromnumeric.py:1242), in argmax(a, axis, out, keepdims)
1155 """
1156 Returns the indices of the maximum values along an axis.
1157
(...)
1239 (2, 1, 4)
1240 """
1241 kwds = {'keepdims': keepdims} if keepdims is not np._NoValue else {}
-> 1242 return _wrapfunc(a, 'argmax', axis=axis, out=out, **kwds)
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/numpy/core/fromnumeric.py:54](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/numpy/core/fromnumeric.py:54), in _wrapfunc(obj, method, *args, **kwds)
52 bound = getattr(obj, method, None)
53 if bound is None:
---> 54 return _wrapit(obj, method, *args, **kwds)
56 try:
57 return bound(*args, **kwds)
File [~/mambaforge/envs/skrub/lib/python3.10/site-packages/numpy/core/fromnumeric.py:43](https://file+.vscode-resource.vscode-cdn.net/Users/leo/VSCProjects/skrub/benchmarks/~/mambaforge/envs/skrub/lib/python3.10/site-packages/numpy/core/fromnumeric.py:43), in _wrapit(obj, method, *args, **kwds)
41 except AttributeError:
42 wrap = None
---> 43 result = getattr(asarray(obj), method)(*args, **kwds)
44 if wrap:
45 if not isinstance(result, mu.ndarray):
ValueError: attempt to get argmax of an empty sequence
Versions
System:
python: 3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:01:19) [Clang 14.0.6 ]
executable: /Users/leo/mambaforge/envs/skrub/bin/python
machine: macOS-12.6.5-arm64-arm-64bit
Python dependencies:
sklearn: 1.2.2
pip: 23.1.2
setuptools: 67.7.2
numpy: 1.24.3
scipy: 1.10.1
Cython: None
pandas: 1.5.3
matplotlib: 3.7.1
joblib: 1.2.0
threadpoolctl: 3.1.0
Built with OpenMP: True
threadpoolctl info:
user_api: blas
internal_api: openblas
prefix: libopenblas
filepath: /Users/leo/mambaforge/envs/skrub/lib/python3.10/site-packages/numpy/.dylibs/libopenblas64_.0.dylib
version: 0.3.21
threading_layer: pthreads
architecture: armv8
num_threads: 8
user_api: blas
internal_api: openblas
prefix: libopenblas
filepath: /Users/leo/mambaforge/envs/skrub/lib/python3.10/site-packages/scipy/.dylibs/libopenblas.0.dylib
version: 0.3.18
threading_layer: pthreads
architecture: armv8
num_threads: 8
user_api: openmp
internal_api: openmp
prefix: libomp
filepath: /Users/leo/mambaforge/envs/skrub/lib/python3.10/site-packages/sklearn/.dylibs/libomp.dylib
version: None
num_threads: 8
0.0.1.dev0