scikit-learn-intelex
scikit-learn-intelex copied to clipboard
Clustering performance decrease
Describe the bug: some clustering algorithms have speedup < 0.95 when sklearnex is used. Reported by @psmgeelen at https://github.com/intel/scikit-learn-intelex/issues/932. Partially caused by the `assert_all_finite` validation function.
To Reproduce Reproducer (modified code from https://github.com/intel/scikit-learn-intelex/issues/932):
# enable DEBUG logging level to track sklearnex usage
import logging
logging.getLogger().setLevel(logging.DEBUG)
import time
import warnings
import numpy as np
# NOTE(review): cycle/islice are unused here — leftover from the sklearn
# clustering-comparison gallery example this reproducer is based on.
from itertools import cycle, islice
def clustering(assume_finite=False):
    """Fit a suite of sklearn clustering algorithms on toy datasets and
    record per-(algorithm, dataset) fit times.

    Timings are appended to the module-level ``perf_data`` dict (keyed by
    ``"<algorithm> - <dataset>"``), which must be initialized by the caller
    (see ``run_comparison``).

    Parameters
    ----------
    assume_finite : bool, default=False
        Forwarded to ``sklearn.set_config``; when True, sklearn skips the
        finiteness validation of input arrays.
    """
    import sklearn as skl
    from sklearn import cluster, datasets, mixture
    from sklearn.neighbors import kneighbors_graph
    from sklearn.preprocessing import StandardScaler

    skl.set_config(assume_finite=assume_finite)
    np.random.seed(0)

    # ============
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    # ============
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropically distributed data
    random_state = 170
    X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = datasets.make_blobs(
        n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
    )

    # ============
    # Set up cluster parameters
    # ============
    default_base = {
        "quantile": 0.3,
        "eps": 0.3,
        "damping": 0.9,
        "preference": -200,
        "n_neighbors": 10,
        "n_clusters": 3,
        "min_samples": 20,
        "xi": 0.05,
        "min_cluster_size": 0.1,
    }
    clst_datasets = [
        (
            noisy_circles,
            {
                "damping": 0.77,
                "preference": -240,
                "quantile": 0.2,
                "n_clusters": 2,
                "min_samples": 20,
                "xi": 0.25,
            },
        ),
        (noisy_moons, {"damping": 0.75, "preference": -220, "n_clusters": 2}),
        (
            varied,
            {
                "eps": 0.18,
                "n_neighbors": 2,
                "min_samples": 5,
                "xi": 0.035,
                "min_cluster_size": 0.2,
            },
        ),
        (
            aniso,
            {
                "eps": 0.15,
                "n_neighbors": 2,
                "min_samples": 20,
                "xi": 0.1,
                "min_cluster_size": 0.2,
            },
        ),
        (blobs, {}),
        (no_structure, {}),
    ]
    # Human-readable names, index-aligned with clst_datasets above.
    clst_datasets_names = ['noisy_circles', 'noisy_moons', 'varied', 'aniso', 'blobs', 'no_structure']

    for i_dataset, (dataset, algo_params) in enumerate(clst_datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"])

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(
            X, n_neighbors=params["n_neighbors"], include_self=False
        )
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
        ward = cluster.AgglomerativeClustering(
            n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
        )
        spectral = cluster.SpectralClustering(
            n_clusters=params["n_clusters"],
            eigen_solver="arpack",
            affinity="nearest_neighbors",
        )
        dbscan = cluster.DBSCAN(eps=params["eps"])
        optics = cluster.OPTICS(
            min_samples=params["min_samples"],
            xi=params["xi"],
            min_cluster_size=params["min_cluster_size"],
        )
        affinity_propagation = cluster.AffinityPropagation(
            damping=params["damping"], preference=params["preference"], random_state=0
        )
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=params["n_clusters"],
            connectivity=connectivity,
        )
        birch = cluster.Birch(n_clusters=params["n_clusters"])
        gmm = mixture.GaussianMixture(
            n_components=params["n_clusters"], covariance_type="full"
        )

        clustering_algorithms = (
            ("MiniBatch\nKMeans", two_means),
            ("Affinity\nPropagation", affinity_propagation),
            ("MeanShift", ms),
            ("Spectral\nClustering", spectral),
            ("Ward", ward),
            ("Agglomerative\nClustering", average_linkage),
            ("DBSCAN", dbscan),
            ("OPTICS", optics),
            ("BIRCH", birch),
            ("Gaussian\nMixture", gmm),
        )

        for name, algorithm in clustering_algorithms:
            t0 = time.time()
            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                    + "connectivity matrix is [0-9]{1,2}"
                    + " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning,
                )
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning,
                )
                algorithm.fit(X)
            t1 = time.time()

            # f-strings (pre-3.12) cannot contain a backslash, hence the variable.
            line_break = '\n'
            case_name = f"{name.replace(line_break, ' ')} - {clst_datasets_names[i_dataset]}"
            if case_name not in perf_data:
                perf_data[case_name] = [t1 - t0, ]
            else:
                perf_data[case_name].append(t1 - t0)
def run_comparison(assume_finite=False, n_runs=5):
    """Benchmark stock sklearn vs. sklearnex-patched sklearn and print a
    markdown table of outlier speedups.

    Runs ``clustering`` ``n_runs`` times unpatched, then ``n_runs`` times
    with sklearnex patching, averages the recorded fit times, and prints
    only the (algorithm, dataset) pairs whose speedup is outside
    [0.9, 1.5].

    Parameters
    ----------
    assume_finite : bool, default=False
        Forwarded to ``clustering`` / ``sklearn.set_config``.
    n_runs : int, default=5
        Number of repetitions per configuration for averaging.
    """
    global perf_data

    # Baseline: make sure stock sklearn is in effect.
    from sklearnex import unpatch_sklearn
    unpatch_sklearn()
    perf_data = {}
    for _ in range(n_runs):
        clustering(assume_finite=assume_finite)
    def_perf_data = perf_data.copy()

    # Optimized: patch sklearn with sklearnex and repeat.
    from sklearnex import patch_sklearn
    patch_sklearn()
    perf_data = {}
    for _ in range(n_runs):
        clustering(assume_finite=assume_finite)
    opt_perf_data = perf_data.copy()

    print('Alg | Dataset | Speedup\n--- | --- | ---')
    for key in def_perf_data:
        alg_name, data_name = key.split(' - ')
        def_time = sum(def_perf_data[key]) / len(def_perf_data[key])
        opt_time = sum(opt_perf_data[key]) / len(opt_perf_data[key])
        speedup = def_time / opt_time
        # Report only notable slowdowns (<0.9) or speedups (>1.5).
        if speedup < 0.9 or speedup > 1.5:
            print(alg_name, data_name, speedup, sep=' | ')
# Run the comparison both with and without sklearn's finite-input validation.
for finite_flag in (False, True):
    run_comparison(assume_finite=finite_flag)
Expected behavior: clustering algorithm speedups > 0.95.
Output/Screenshots With assume_finite=False:
| Alg | Dataset | Speedup |
|---|---|---|
| MiniBatch KMeans | noisy_circles | 1.5424084986061413 |
| MeanShift | noisy_circles | 0.8906212280161131 |
| Agglomerative Clustering | noisy_circles | 0.8477524914054186 |
| DBSCAN | noisy_circles | 0.7486776825618108 |
| OPTICS | noisy_circles | 0.8315285621860906 |
| BIRCH | noisy_circles | 2.9126767716285507 |
| Gaussian Mixture | noisy_circles | 1.5707238504295098 |
| MeanShift | noisy_moons | 0.873389332160128 |
| Spectral Clustering | noisy_moons | 0.8863623889720044 |
| DBSCAN | noisy_moons | 2.170077954357159 |
| OPTICS | noisy_moons | 0.8256592058631651 |
| Gaussian Mixture | noisy_moons | 1.981423040867879 |
| MiniBatch KMeans | varied | 0.8000331881443596 |
| MeanShift | varied | 0.8755708509517118 |
| Ward | varied | 0.7533115130790967 |
| Agglomerative Clustering | varied | 0.7173325047787925 |
| DBSCAN | varied | 2.6880284447454468 |
| OPTICS | varied | 0.8328274771473778 |
| Gaussian Mixture | varied | 1.7147550603957624 |
| MiniBatch KMeans | aniso | 0.8246916385036271 |
| MeanShift | aniso | 0.8702642526960361 |
| Ward | aniso | 0.748083536618873 |
| Agglomerative Clustering | aniso | 0.7445037265218657 |
| DBSCAN | aniso | 2.4418816912526515 |
| OPTICS | aniso | 0.8246993441165708 |
| MeanShift | blobs | 0.8640600646439754 |
| DBSCAN | blobs | 3.254734072915891 |
| OPTICS | blobs | 0.8132894811389668 |
| Gaussian Mixture | blobs | 1.7761056992498039 |
| MeanShift | no_structure | 0.8748425261959033 |
| DBSCAN | no_structure | 2.933828881796293 |
| OPTICS | no_structure | 0.8229861092286408 |
With assume_finite=True:
| Alg | Dataset | Speedup |
|---|---|---|
| MiniBatch KMeans | noisy_circles | 0.8383482798916301 |
| Ward | noisy_circles | 0.888451098496735 |
| Agglomerative Clustering | noisy_circles | 0.8544552767601262 |
| DBSCAN | noisy_circles | 2.4742260996332655 |
| Gaussian Mixture | noisy_circles | 1.840857669722611 |
| MiniBatch KMeans | noisy_moons | 0.865952675317415 |
| DBSCAN | noisy_moons | 2.5387578721693687 |
| Gaussian Mixture | noisy_moons | 1.9125839625192775 |
| DBSCAN | varied | 2.8764809052333806 |
| DBSCAN | aniso | 2.3317455162209955 |
| Gaussian Mixture | aniso | 1.532437695421681 |
| DBSCAN | blobs | 3.6175978247801193 |
| Gaussian Mixture | blobs | 1.5238006360772252 |
| DBSCAN | no_structure | 3.2661174275368046 |
| Gaussian Mixture | no_structure | 1.7410552737541178 |
Environment:
- OS: Red Hat Enterprise Linux Server 7.2
- Compiler: Intel(R) C++ Compiler 2021.3.0
- Version: compiled from https://github.com/intel/scikit-learn-intelex/commit/08d4f573471a2772ddad99cad87a6ce306b2256d
- CPU: Intel(R) Xeon(R) CPU E5-2680 v3