# %%
!pip install -Uq sentence-transformers hdbscan
# %%
from hdbscan.hdbscan_ import hdbscan
import numpy as np
from sentence_transformers import SentenceTransformer
import seaborn as sns
import matplotlib.pyplot as plt
from umap import UMAP
import hdbscan
# Demo: embed a handful of sentences, cluster the embeddings with HDBSCAN,
# and scatter-plot a 2-D UMAP projection coloured by cluster assignment.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
sentences = [
    'my cat is crazy',
    'this one is not about animals',
    'this is about a dog',
    'but this one is about animals',
    'this one is about humans',
    'this one is about queen',
    'this one is about king',
]
embeddings = model.encode(sentences)

# prediction_data=True makes the fitted clusterer keep the extra data that
# all_points_membership_vectors() (last line) relies on.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=2,
    prediction_data=True,
).fit(embeddings)
labels, probabilities = clusterer.labels_, clusterer.probabilities_

# Project the embeddings to 2-D purely for visualisation; the clustering
# above was done on the original embedding vectors.
embeddings_2d = UMAP().fit_transform(embeddings)

palette = sns.color_palette('deep', 8)
noise_color = (0.5, 0.5, 0.5)
# HDBSCAN labels noise points -1. Indexing the palette with -1 would silently
# pick the last palette colour and make noise look like a real cluster, so
# noise gets a neutral gray instead. Cluster colours are desaturated by the
# membership probability; the modulo keeps labels beyond the palette size
# from raising IndexError.
colors = [
    sns.desaturate(palette[label % len(palette)], probability)
    if label >= 0
    else noise_color
    for label, probability in zip(labels, probabilities)
]
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors)

# Soft clustering: per-point membership vector over the found clusters.
# Left as the final bare expression so a notebook cell displays the result.
hdbscan.prediction.all_points_membership_vectors(clusterer)