cuml
cuml copied to clipboard
[BUG] Pipeline of MinMaxScaler and KMeans with DataFrame input produces incorrect results
Describe the bug
When using a Pipeline([('minmax', MinMaxScaler()), ('kmeans', KMeans())]) with a cudf DataFrame as input, the results will be incorrect if the dataframe index is not sorted.
Steps/Code to reproduce bug
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import cudf
import numpy as np
import cupy as cp
import matplotlib.pyplot as plt
from cuml.cluster import KMeans
from cuml.pipeline import Pipeline
from cuml.preprocessing import MinMaxScaler
from cuml.datasets import make_blobs
# In[4]:
# Create dataset of random blobs
# (n_rows samples, n_cols features, two centers; random_state is fixed at 137).
n_rows = 1000
n_cols = 2
dataset, _ = make_blobs(n_samples=n_rows, n_features=n_cols, centers=2, cluster_std=1.0, center_box=(-10.0, 10.0), random_state=137, dtype='float32')
# Create dataframe with random index
# `idx` is a random permutation of range(n_rows); no seed is set for it in this
# script, so the index order differs from run to run.
idx = cp.random.permutation(n_rows)
# Columns "A" and "B" hold the two features; "I" holds the shuffled index values.
df = cudf.DataFrame(data={"I": idx, "A": dataset[:,0], "B": dataset[:,1]})
# Promote "I" to the index so the DataFrame's index is unsorted.
df = df.set_index("I")
print(df)
# In[5]:
# Baseline: clustering the cupy dataset directly works as expected
mms = MinMaxScaler()
kmeans = KMeans(n_clusters=2)
# Two-step pipeline: scale the features, then cluster into two groups.
pipeline = Pipeline([
('minmax', mms),
('kmeans', kmeans)
])
# Fit and predict on the same array (no DataFrame or index involved here).
pipeline.fit(dataset)
labels = pipeline.predict(dataset)
# Data and labels are passed through cp.asnumpy before being handed to matplotlib;
# points are colored by their predicted label.
plt.scatter(cp.asnumpy(dataset[:,0]), cp.asnumpy(dataset[:,1]), c=cp.asnumpy(labels))
# In[6]:
# Using the dataframe (with its shuffled, unsorted index) as input does not work
mms = MinMaxScaler()
kmeans = KMeans(n_clusters=2)
# Same two-step pipeline as in the cupy case, rebuilt from fresh estimators.
pipeline = Pipeline([
('minmax', mms),
('kmeans', kmeans)
])
# Fit and predict on the feature columns only; df still carries the shuffled index.
pipeline.fit(df[['A', 'B']])
prediction = pipeline.predict(df[['A', 'B']])
# NOTE(review): if `prediction` comes back with a fresh default index instead of
# df's index, this assignment would presumably align on index labels rather than
# row position and so attach labels to the wrong rows -- TODO confirm. That would
# be consistent with the sorted-index case working.
df['C'] = prediction
# Color each point by the label stored in column "C".
plt.scatter(df["A"].to_numpy(), df["B"].to_numpy(), c=df["C"].to_numpy())
# In[7]:
# If the dataframe is sorted then it works
# Sorting by index puts the rows in ascending index order before the pipeline runs.
df = df.sort_index()
mms = MinMaxScaler()
kmeans = KMeans(n_clusters=2)
# Same two-step pipeline as before, rebuilt from fresh estimators.
pipeline = Pipeline([
('minmax', mms),
('kmeans', kmeans)
])
pipeline.fit(df[['A', 'B']])
prediction = pipeline.predict(df[['A', 'B']])
# Overwrites the "C" column written by the previous cell with the new labels.
df['C'] = prediction
# tmp = df.reset_index()[['I', 'C']]
# Color each point by the label stored in column "C".
plt.scatter(df["A"].to_numpy(), df["B"].to_numpy(), c=df["C"].to_numpy())
Expected behavior
I expect to run the pipeline and get correct clustering regardless of whether the dataframe index is sorted.