hdbscan
hdbscan copied to clipboard
Strange seemingly random results when using high-dimensional features
Hi, I've been experimenting with insightface's feature vectors and have been noticing some strange inconsistencies when clustering with HDBSCAN. Sometimes the results will be very high quality, such as this (example 1):
But then other times, I get results like this (example 2):
I've dumped the features into a *.npz file
here (in case anyone wants to look at them).
I've also written a quick script to replicate this projection here:
import hdbscan
import numpy as np
from bokeh.plotting import figure, output_file, show
from bokeh.resources import CDN
from bokeh.colors.rgb import RGB
from bokeh.palettes import Turbo256, linear_palette
from bokeh.models import ColumnDataSource
from bokeh.models.tools import WheelZoomTool, PanTool, ResetTool, BoxZoomTool
from umap import UMAP
from webcolors import hex_to_rgb
def project_cluster(features: np.ndarray, labels: np.ndarray) -> None:
    """Project features to 2-D with UMAP and show them coloured by cluster label.

    The scatter plot is written to ``cluster.html`` and opened with
    ``bokeh.plotting.show``.

    Args:
        features (np.ndarray): Feature vectors handed to
            ``UMAP.fit_transform`` (cosine metric, 2 components).
        labels (np.ndarray): One label per row of ``features``. A label of
            ``-1`` is drawn in nearly transparent black; every other label
            is used as an index into a Turbo palette, so labels are assumed
            to be small non-negative integers (as produced by HDBSCAN) --
            TODO confirm for other callers.

    Returns:
        None. The function's only effects are the printed progress message,
        the ``cluster.html`` output file and the displayed plot.
    """
    # The palette is sized from the number of distinct labels (plus one), so
    # every non-negative label below that count has a colour to index.
    color_map = linear_palette(Turbo256, len(set(labels)) + 1)

    color_matrix = []
    for label in labels:
        if label == -1:
            # Label -1 is rendered as faint black so it does not dominate.
            color_matrix.append(RGB(0, 0, 0, a=0.1))
        else:
            rgb = hex_to_rgb(color_map[int(label)])
            color_matrix.append(RGB(rgb[0], rgb[1], rgb[2], a=0.7))

    print('projecting clusters...')
    projection = UMAP(n_neighbors=15, n_components=2, metric="cosine").fit_transform(
        features
    )

    columndata = ColumnDataSource(
        {
            "x": projection[:, 0],
            "y": projection[:, 1],
            "colors": color_matrix,
        }
    )

    output_file(
        "cluster.html",
        title="cluster",
    )
    p = figure(
        sizing_mode="stretch_both",
        tools=[WheelZoomTool(), PanTool(), ResetTool(), BoxZoomTool()],
    )
    # scatter() draws circle markers by default; circle() with a screen-unit
    # ``size`` is deprecated in recent Bokeh releases.
    p.scatter(
        x="x",
        y="y",
        source=columndata,
        size=10,
        color="colors",
    )
    show(p)
if __name__ == "__main__":
    # Reproduction entry point: load features from an .npz archive, cluster
    # them with HDBSCAN and display the UMAP projection of the result.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("filename")
    cli_args = parser.parse_args()

    # The archive is expected to hold the feature matrix under "features".
    features = np.load(cli_args.filename)["features"]
    print(f'clustering {features.shape}...')

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=50,
        min_samples=50,
        core_dist_n_jobs=-1,
        prediction_data=True,
    )
    clusterer.fit(features)

    project_cluster(features, clusterer.labels_)
It's probably a problem with the features, but if it is, how can one detect something like this without having to cluster the data first? Is there an inherent way to remove bad features that could cause this kind of behaviour? What could cause HDBSCAN to cluster so strangely? There are clearly different clusters that UMAP has managed to detect, but for some reason they're all clumped together in example 2.