# %%
# Notebook-style demo: embed a few sentences, cluster them with HDBSCAN,
# project the embeddings to 2-D with UMAP, and plot the clusters.
#
# Dependencies (run once in an interactive cell / shell):
# !pip install -Uq sentence-transformers hdbscan
# NOTE: the script also imports `umap` (PyPI package `umap-learn`) and
# `seaborn`, which the command above does not install explicitly.

# %%
# NOTE: this binds the *function* `hdbscan`, but the later `import hdbscan`
# rebinds the name to the package, which is what the code below relies on.
from hdbscan.hdbscan_ import hdbscan
import numpy as np
from sentence_transformers import SentenceTransformer
import seaborn as sns
import matplotlib.pyplot as plt
from umap import UMAP
import hdbscan

# Colour used for points HDBSCAN labels as noise (label == -1).
NOISE_COLOR = (0.5, 0.5, 0.5)

sentences = [
    'this is about a cat',
    'but this one is not about animals',
    'this is about a dog',
    'but this one is about animals',
    'this one is about humans',
    'this one is about queen',
    'this one is about king',
]

model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
embeddings = model.encode(sentences)

# min_cluster_size=2 (first positional argument); prediction_data=True is
# needed for the soft-clustering membership vectors computed in the last cell.
clusterer = hdbscan.HDBSCAN(2, prediction_data=True).fit(embeddings)

# Last expression of the cell so an interactive runner displays it.
clusterer.labels_, clusterer.probabilities_

# %%
embeddings_2d = UMAP().fit_transform(embeddings)

# Keep at least the original 8 colours, but grow the palette if there are
# more clusters so indexing by label can never go out of range.
n_clusters = int(clusterer.labels_.max()) + 1
palette = sns.color_palette('deep', max(8, n_clusters))

# Noise points carry label -1; indexing the palette with -1 would silently
# pick the last colour, so give them an explicit neutral grey instead.
# Clustered points are desaturated according to their membership strength.
colors = [
    sns.desaturate(palette[col], sat) if col >= 0 else NOISE_COLOR
    for col, sat in zip(clusterer.labels_, clusterer.probabilities_)
]

plt.scatter(embeddings_2d.T[0], embeddings_2d.T[1], c=colors)

# %%
# Soft clustering: per-point membership vector over the found clusters.
hdbscan.prediction.all_points_membership_vectors(clusterer)