scikit-learn-intelex icon indicating copy to clipboard operation
scikit-learn-intelex copied to clipboard

Clustering performance decrease

Open Alexsandruss opened this issue 3 years ago • 0 comments
trafficstars

Describe the bug Some clustering algorithms have speedup < 0.95 if sklearnex is used. Reported by @psmgeelen at https://github.com/intel/scikit-learn-intelex/issues/932. Partially caused by the `assert_all_finite` validation function.

To Reproduce Reproducer (modified code from https://github.com/intel/scikit-learn-intelex/issues/932):

# Turn on DEBUG-level logging so sklearnex reports which calls it accelerates.
import logging

logging.getLogger().setLevel(logging.DEBUG)

import time
import warnings
from itertools import cycle, islice

import numpy as np


def clustering(assume_finite=False):
    """Fit a battery of scikit-learn clustering algorithms on toy datasets.

    The elapsed fit time of each (algorithm, dataset) pair is appended to
    the module-global ``perf_data`` dict under the key
    ``"<algorithm> - <dataset>"``.  NOTE: ``perf_data`` must be initialized
    (e.g. to ``{}``) by the caller before this function runs — see
    ``run_comparison``.

    Parameters
    ----------
    assume_finite : bool, default=False
        Forwarded to ``sklearn.set_config``; when True, finiteness
        validation (``assert_all_finite``) is skipped.
    """
    import sklearn as skl
    from sklearn import cluster, datasets, mixture
    from sklearn.neighbors import kneighbors_graph
    from sklearn.preprocessing import StandardScaler
    skl.set_config(assume_finite=assume_finite)

    np.random.seed(0)

    # ============
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    # ============
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropicly distributed data
    random_state = 170
    X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = datasets.make_blobs(
        n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
    )

    # ============
    # Set up cluster parameters
    # ============
    default_base = {
        "quantile": 0.3,
        "eps": 0.3,
        "damping": 0.9,
        "preference": -200,
        "n_neighbors": 10,
        "n_clusters": 3,
        "min_samples": 20,
        "xi": 0.05,
        "min_cluster_size": 0.1,
    }

    clst_datasets = [
        (
            noisy_circles,
            {
                "damping": 0.77,
                "preference": -240,
                "quantile": 0.2,
                "n_clusters": 2,
                "min_samples": 20,
                "xi": 0.25,
            },
        ),
        (noisy_moons, {"damping": 0.75, "preference": -220, "n_clusters": 2}),
        (
            varied,
            {
                "eps": 0.18,
                "n_neighbors": 2,
                "min_samples": 5,
                "xi": 0.035,
                "min_cluster_size": 0.2,
            },
        ),
        (
            aniso,
            {
                "eps": 0.15,
                "n_neighbors": 2,
                "min_samples": 20,
                "xi": 0.1,
                "min_cluster_size": 0.2,
            },
        ),
        (blobs, {}),
        (no_structure, {}),
    ]
    # Must stay in the same order as clst_datasets above.
    clst_datasets_names = ['noisy_circles', 'noisy_moons', 'varied', 'aniso', 'blobs', 'no_structure']

    for i_dataset, (dataset, algo_params) in enumerate(clst_datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"])

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(
            X, n_neighbors=params["n_neighbors"], include_self=False
        )
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
        ward = cluster.AgglomerativeClustering(
            n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
        )
        spectral = cluster.SpectralClustering(
            n_clusters=params["n_clusters"],
            eigen_solver="arpack",
            affinity="nearest_neighbors",
        )
        dbscan = cluster.DBSCAN(eps=params["eps"])
        optics = cluster.OPTICS(
            min_samples=params["min_samples"],
            xi=params["xi"],
            min_cluster_size=params["min_cluster_size"],
        )
        affinity_propagation = cluster.AffinityPropagation(
            damping=params["damping"], preference=params["preference"], random_state=0
        )
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=params["n_clusters"],
            connectivity=connectivity,
        )
        birch = cluster.Birch(n_clusters=params["n_clusters"])
        gmm = mixture.GaussianMixture(
            n_components=params["n_clusters"], covariance_type="full"
        )

        clustering_algorithms = (
            ("MiniBatch\nKMeans", two_means),
            ("Affinity\nPropagation", affinity_propagation),
            ("MeanShift", ms),
            ("Spectral\nClustering", spectral),
            ("Ward", ward),
            ("Agglomerative\nClustering", average_linkage),
            ("DBSCAN", dbscan),
            ("OPTICS", optics),
            ("BIRCH", birch),
            ("Gaussian\nMixture", gmm),
        )

        for name, algorithm in clustering_algorithms:
            # perf_counter() is monotonic and high-resolution, which makes it
            # the right clock for benchmarking (time.time() can jump on NTP
            # adjustments and has coarser resolution on some platforms).
            t0 = time.perf_counter()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                    + "connectivity matrix is [0-9]{1,2}"
                    + " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning,
                )
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning,
                )
                algorithm.fit(X)

            t1 = time.perf_counter()

            line_break = '\n'
            case_name = f"{name.replace(line_break, ' ')} - {clst_datasets_names[i_dataset]}"
            # setdefault creates the list on first sight of this case and
            # appends on every subsequent run — replaces the manual
            # "not in ... keys()" if/else dance.
            perf_data.setdefault(case_name, []).append(t1 - t0)


def run_comparison(assume_finite=False, n_runs=5):
    """Compare stock scikit-learn vs sklearnex clustering fit times.

    Runs the ``clustering`` benchmark ``n_runs`` times with sklearn
    unpatched, then ``n_runs`` times with the sklearnex patch active, and
    prints a markdown table of (algorithm, dataset) pairs whose speedup
    falls outside the [0.9, 1.5] band (i.e. notable slowdowns and notable
    speedups only).

    Parameters
    ----------
    assume_finite : bool, default=False
        Forwarded to ``clustering`` (and from there to
        ``sklearn.set_config``).
    n_runs : int, default=5
        Repetitions per configuration; per-case times are averaged.
    """
    global perf_data

    def _collect():
        # Reset the shared accumulator that clustering() appends into,
        # run the benchmark n_runs times, and return a snapshot.
        global perf_data
        perf_data = {}
        for _ in range(n_runs):
            clustering(assume_finite=assume_finite)
        return perf_data.copy()

    # Baseline: make sure stock scikit-learn is in effect.
    from sklearnex import unpatch_sklearn
    unpatch_sklearn()
    def_perf_data = _collect()

    # Accelerated: activate the sklearnex patch and measure again.
    from sklearnex import patch_sklearn
    patch_sklearn()
    opt_perf_data = _collect()

    print('Alg | Dataset | Speedup\n--- | --- | ---')
    for key in def_perf_data:
        # Keys have the form "<algorithm> - <dataset>".
        alg_name, data_name = key.split(' - ')
        def_time = sum(def_perf_data[key]) / len(def_perf_data[key])
        opt_time = sum(opt_perf_data[key]) / len(opt_perf_data[key])
        speedup = def_time / opt_time
        # Report only outliers: slowdowns below 0.9x and speedups above 1.5x.
        if speedup < 0.9 or speedup > 1.5:
            print(alg_name, data_name, speedup, sep=' | ')


# Run the benchmark with and without finiteness validation to isolate the
# cost of assert_all_finite.  The __main__ guard keeps the (expensive)
# benchmark from running if this file is ever imported as a module.
if __name__ == "__main__":
    run_comparison(assume_finite=False)
    run_comparison(assume_finite=True)

Expected behavior Clustering algorithm speedups > 0.95

Output/Screenshots With assume_finite=False:

Alg Dataset Speedup
MiniBatch KMeans noisy_circles 1.5424084986061413
MeanShift noisy_circles 0.8906212280161131
Agglomerative Clustering noisy_circles 0.8477524914054186
DBSCAN noisy_circles 0.7486776825618108
OPTICS noisy_circles 0.8315285621860906
BIRCH noisy_circles 2.9126767716285507
Gaussian Mixture noisy_circles 1.5707238504295098
MeanShift noisy_moons 0.873389332160128
Spectral Clustering noisy_moons 0.8863623889720044
DBSCAN noisy_moons 2.170077954357159
OPTICS noisy_moons 0.8256592058631651
Gaussian Mixture noisy_moons 1.981423040867879
MiniBatch KMeans varied 0.8000331881443596
MeanShift varied 0.8755708509517118
Ward varied 0.7533115130790967
Agglomerative Clustering varied 0.7173325047787925
DBSCAN varied 2.6880284447454468
OPTICS varied 0.8328274771473778
Gaussian Mixture varied 1.7147550603957624
MiniBatch KMeans aniso 0.8246916385036271
MeanShift aniso 0.8702642526960361
Ward aniso 0.748083536618873
Agglomerative Clustering aniso 0.7445037265218657
DBSCAN aniso 2.4418816912526515
OPTICS aniso 0.8246993441165708
MeanShift blobs 0.8640600646439754
DBSCAN blobs 3.254734072915891
OPTICS blobs 0.8132894811389668
Gaussian Mixture blobs 1.7761056992498039
MeanShift no_structure 0.8748425261959033
DBSCAN no_structure 2.933828881796293
OPTICS no_structure 0.8229861092286408

With assume_finite=True:

Alg Dataset Speedup
MiniBatch KMeans noisy_circles 0.8383482798916301
Ward noisy_circles 0.888451098496735
Agglomerative Clustering noisy_circles 0.8544552767601262
DBSCAN noisy_circles 2.4742260996332655
Gaussian Mixture noisy_circles 1.840857669722611
MiniBatch KMeans noisy_moons 0.865952675317415
DBSCAN noisy_moons 2.5387578721693687
Gaussian Mixture noisy_moons 1.9125839625192775
DBSCAN varied 2.8764809052333806
DBSCAN aniso 2.3317455162209955
Gaussian Mixture aniso 1.532437695421681
DBSCAN blobs 3.6175978247801193
Gaussian Mixture blobs 1.5238006360772252
DBSCAN no_structure 3.2661174275368046
Gaussian Mixture no_structure 1.7410552737541178

Environment:

  • OS: Red Hat Enterprise Linux Server 7.2
  • Compiler: Intel(R) C++ Compiler 2021.3.0
  • Version: compiled from https://github.com/intel/scikit-learn-intelex/commit/08d4f573471a2772ddad99cad87a6ce306b2256d
  • CPU: Intel(R) Xeon(R) CPU E5-2680 v3

Alexsandruss avatar Jan 12 '22 11:01 Alexsandruss