scikit-learn-intelex
scikit-learn-intelex copied to clipboard
Clustering performance decrease
Describe the bug: some clustering algorithms have speedup < 0.95 when sklearnex is used. Reported by @psmgeelen at https://github.com/intel/scikit-learn-intelex/issues/932. Partially caused by the `assert_all_finite` validation function.
To Reproduce Reproducer (modified code from https://github.com/intel/scikit-learn-intelex/issues/932):
# enable DEBUG logging level to track sklearnex usage
import logging
logging.getLogger().setLevel(logging.DEBUG)
import time
import warnings
import numpy as np
# NOTE(review): cycle/islice are unused here — leftover from the sklearn
# clustering-comparison gallery example this reproducer is based on.
from itertools import cycle, islice
def clustering(assume_finite=False):
    """Fit a suite of sklearn clustering algorithms on toy datasets and
    record per-(algorithm, dataset) fit times.

    Timings are appended to the module-level ``perf_data`` dict (keyed by
    ``"<algorithm> - <dataset>"``), which must be initialized by the caller
    (see ``run_comparison``).

    Parameters
    ----------
    assume_finite : bool, default=False
        Forwarded to ``sklearn.set_config``; when True, sklearn skips the
        finiteness validation of input arrays.
    """
    import sklearn as skl
    from sklearn import cluster, datasets, mixture
    from sklearn.neighbors import kneighbors_graph
    from sklearn.preprocessing import StandardScaler

    skl.set_config(assume_finite=assume_finite)
    np.random.seed(0)

    # ============
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    # ============
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropically distributed data
    random_state = 170
    X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = datasets.make_blobs(
        n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
    )

    # ============
    # Set up cluster parameters
    # ============
    default_base = {
        "quantile": 0.3,
        "eps": 0.3,
        "damping": 0.9,
        "preference": -200,
        "n_neighbors": 10,
        "n_clusters": 3,
        "min_samples": 20,
        "xi": 0.05,
        "min_cluster_size": 0.1,
    }
    clst_datasets = [
        (
            noisy_circles,
            {
                "damping": 0.77,
                "preference": -240,
                "quantile": 0.2,
                "n_clusters": 2,
                "min_samples": 20,
                "xi": 0.25,
            },
        ),
        (noisy_moons, {"damping": 0.75, "preference": -220, "n_clusters": 2}),
        (
            varied,
            {
                "eps": 0.18,
                "n_neighbors": 2,
                "min_samples": 5,
                "xi": 0.035,
                "min_cluster_size": 0.2,
            },
        ),
        (
            aniso,
            {
                "eps": 0.15,
                "n_neighbors": 2,
                "min_samples": 20,
                "xi": 0.1,
                "min_cluster_size": 0.2,
            },
        ),
        (blobs, {}),
        (no_structure, {}),
    ]
    # Human-readable names, index-aligned with clst_datasets above.
    clst_datasets_names = ['noisy_circles', 'noisy_moons', 'varied', 'aniso', 'blobs', 'no_structure']

    for i_dataset, (dataset, algo_params) in enumerate(clst_datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"])

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(
            X, n_neighbors=params["n_neighbors"], include_self=False
        )
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
        ward = cluster.AgglomerativeClustering(
            n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
        )
        spectral = cluster.SpectralClustering(
            n_clusters=params["n_clusters"],
            eigen_solver="arpack",
            affinity="nearest_neighbors",
        )
        dbscan = cluster.DBSCAN(eps=params["eps"])
        optics = cluster.OPTICS(
            min_samples=params["min_samples"],
            xi=params["xi"],
            min_cluster_size=params["min_cluster_size"],
        )
        affinity_propagation = cluster.AffinityPropagation(
            damping=params["damping"], preference=params["preference"], random_state=0
        )
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=params["n_clusters"],
            connectivity=connectivity,
        )
        birch = cluster.Birch(n_clusters=params["n_clusters"])
        gmm = mixture.GaussianMixture(
            n_components=params["n_clusters"], covariance_type="full"
        )

        clustering_algorithms = (
            ("MiniBatch\nKMeans", two_means),
            ("Affinity\nPropagation", affinity_propagation),
            ("MeanShift", ms),
            ("Spectral\nClustering", spectral),
            ("Ward", ward),
            ("Agglomerative\nClustering", average_linkage),
            ("DBSCAN", dbscan),
            ("OPTICS", optics),
            ("BIRCH", birch),
            ("Gaussian\nMixture", gmm),
        )

        for name, algorithm in clustering_algorithms:
            t0 = time.time()
            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                    + "connectivity matrix is [0-9]{1,2}"
                    + " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning,
                )
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning,
                )
                algorithm.fit(X)
            t1 = time.time()

            # f-strings (pre-3.12) cannot contain a backslash, hence the variable.
            line_break = '\n'
            case_name = f"{name.replace(line_break, ' ')} - {clst_datasets_names[i_dataset]}"
            if case_name not in perf_data:
                perf_data[case_name] = [t1 - t0, ]
            else:
                perf_data[case_name].append(t1 - t0)
def run_comparison(assume_finite=False, n_runs=5):
    """Benchmark stock sklearn vs. sklearnex-patched sklearn and print a
    markdown table of outlier speedups.

    Runs ``clustering`` ``n_runs`` times unpatched, then ``n_runs`` times
    with sklearnex patching, averages the recorded fit times, and prints
    only the (algorithm, dataset) pairs whose speedup is outside
    [0.9, 1.5].

    Parameters
    ----------
    assume_finite : bool, default=False
        Forwarded to ``clustering`` / ``sklearn.set_config``.
    n_runs : int, default=5
        Number of repetitions per configuration for averaging.
    """
    global perf_data

    # Baseline: make sure stock sklearn is in effect.
    from sklearnex import unpatch_sklearn
    unpatch_sklearn()
    perf_data = {}
    for _ in range(n_runs):
        clustering(assume_finite=assume_finite)
    def_perf_data = perf_data.copy()

    # Optimized: patch sklearn with sklearnex and repeat.
    from sklearnex import patch_sklearn
    patch_sklearn()
    perf_data = {}
    for _ in range(n_runs):
        clustering(assume_finite=assume_finite)
    opt_perf_data = perf_data.copy()

    print('Alg | Dataset | Speedup\n--- | --- | ---')
    for key in def_perf_data:
        alg_name, data_name = key.split(' - ')
        def_time = sum(def_perf_data[key]) / len(def_perf_data[key])
        opt_time = sum(opt_perf_data[key]) / len(opt_perf_data[key])
        speedup = def_time / opt_time
        # Report only notable slowdowns (<0.9) or speedups (>1.5).
        if speedup < 0.9 or speedup > 1.5:
            print(alg_name, data_name, speedup, sep=' | ')
# Run the comparison both with and without sklearn's finite-input validation.
for finite_flag in (False, True):
    run_comparison(assume_finite=finite_flag)
Expected behavior: clustering algorithm speedups > 0.95.
Output/Screenshots With assume_finite=False:
| Alg | Dataset | Speedup |
|---|---|---|
| MiniBatch KMeans | noisy_circles | 1.5424084986061413 |
| MeanShift | noisy_circles | 0.8906212280161131 |
| Agglomerative Clustering | noisy_circles | 0.8477524914054186 |
| DBSCAN | noisy_circles | 0.7486776825618108 |
| OPTICS | noisy_circles | 0.8315285621860906 |
| BIRCH | noisy_circles | 2.9126767716285507 |
| Gaussian Mixture | noisy_circles | 1.5707238504295098 |
| MeanShift | noisy_moons | 0.873389332160128 |
| Spectral Clustering | noisy_moons | 0.8863623889720044 |
| DBSCAN | noisy_moons | 2.170077954357159 |
| OPTICS | noisy_moons | 0.8256592058631651 |
| Gaussian Mixture | noisy_moons | 1.981423040867879 |
| MiniBatch KMeans | varied | 0.8000331881443596 |
| MeanShift | varied | 0.8755708509517118 |
| Ward | varied | 0.7533115130790967 |
| Agglomerative Clustering | varied | 0.7173325047787925 |
| DBSCAN | varied | 2.6880284447454468 |
| OPTICS | varied | 0.8328274771473778 |
| Gaussian Mixture | varied | 1.7147550603957624 |
| MiniBatch KMeans | aniso | 0.8246916385036271 |
| MeanShift | aniso | 0.8702642526960361 |
| Ward | aniso | 0.748083536618873 |
| Agglomerative Clustering | aniso | 0.7445037265218657 |
| DBSCAN | aniso | 2.4418816912526515 |
| OPTICS | aniso | 0.8246993441165708 |
| MeanShift | blobs | 0.8640600646439754 |
| DBSCAN | blobs | 3.254734072915891 |
| OPTICS | blobs | 0.8132894811389668 |
| Gaussian Mixture | blobs | 1.7761056992498039 |
| MeanShift | no_structure | 0.8748425261959033 |
| DBSCAN | no_structure | 2.933828881796293 |
| OPTICS | no_structure | 0.8229861092286408 |
With assume_finite=True:
| Alg | Dataset | Speedup |
|---|---|---|
| MiniBatch KMeans | noisy_circles | 0.8383482798916301 |
| Ward | noisy_circles | 0.888451098496735 |
| Agglomerative Clustering | noisy_circles | 0.8544552767601262 |
| DBSCAN | noisy_circles | 2.4742260996332655 |
| Gaussian Mixture | noisy_circles | 1.840857669722611 |
| MiniBatch KMeans | noisy_moons | 0.865952675317415 |
| DBSCAN | noisy_moons | 2.5387578721693687 |
| Gaussian Mixture | noisy_moons | 1.9125839625192775 |
| DBSCAN | varied | 2.8764809052333806 |
| DBSCAN | aniso | 2.3317455162209955 |
| Gaussian Mixture | aniso | 1.532437695421681 |
| DBSCAN | blobs | 3.6175978247801193 |
| Gaussian Mixture | blobs | 1.5238006360772252 |
| DBSCAN | no_structure | 3.2661174275368046 |
| Gaussian Mixture | no_structure | 1.7410552737541178 |
Environment:
- OS: Red Hat Enterprise Linux Server 7.2
- Compiler: Intel(R) C++ Compiler 2021.3.0
- Version: compiled from https://github.com/intel/scikit-learn-intelex/commit/08d4f573471a2772ddad99cad87a6ce306b2256d
- CPU: Intel(R) Xeon(R) CPU E5-2680 v3