Spaces:
Sleeping
Sleeping
chore: remove filtering of single-element clusters in kmeans
Browse files
app.py
CHANGED
@@ -1,10 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
-
import pickle
|
5 |
import os
|
6 |
-
from sklearn.manifold import TSNE
|
7 |
-
import matplotlib.pyplot as plt
|
8 |
from utils import (plot_distances_tsne,
|
9 |
plot_distances_umap,
|
10 |
cluster_languages_hdbscan,
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
|
|
4 |
import os
|
|
|
|
|
5 |
from utils import (plot_distances_tsne,
|
6 |
plot_distances_umap,
|
7 |
cluster_languages_hdbscan,
|
utils.py
CHANGED
@@ -143,19 +143,21 @@ def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
|
|
143 |
kmeans_model = KMeans(n_clusters=n_clusters, random_state=23)
|
144 |
clusters = kmeans_model.fit_predict(dist_matrix)
|
145 |
|
146 |
-
# Count the number of elements in each cluster
|
147 |
-
cluster_counts = np.bincount(clusters)
|
148 |
|
149 |
-
# Identify clusters with more than 1 element
|
150 |
-
valid_clusters = np.where(cluster_counts > 1)[0]
|
151 |
|
152 |
-
# Filter out points belonging to clusters with only 1 element
|
153 |
-
valid_indices = np.isin(clusters, valid_clusters)
|
154 |
-
filtered_matrix = dist_matrix[np.ix_(valid_indices, valid_indices)]
|
155 |
-
filtered_languages = np.array(languages)[valid_indices]
|
156 |
-
filtered_clusters = np.array(clusters)[valid_indices]
|
157 |
|
158 |
-
return filtered_matrix, filtered_languages, filtered_clusters
|
|
|
|
|
159 |
|
160 |
|
161 |
def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
|
|
|
143 |
kmeans_model = KMeans(n_clusters=n_clusters, random_state=23)
|
144 |
clusters = kmeans_model.fit_predict(dist_matrix)
|
145 |
|
146 |
+
# # Count the number of elements in each cluster
|
147 |
+
# cluster_counts = np.bincount(clusters)
|
148 |
|
149 |
+
# # Identify clusters with more than 1 element
|
150 |
+
# valid_clusters = np.where(cluster_counts > 1)[0]
|
151 |
|
152 |
+
# # Filter out points belonging to clusters with only 1 element
|
153 |
+
# valid_indices = np.isin(clusters, valid_clusters)
|
154 |
+
# filtered_matrix = dist_matrix[np.ix_(valid_indices, valid_indices)]
|
155 |
+
# filtered_languages = np.array(languages)[valid_indices]
|
156 |
+
# filtered_clusters = np.array(clusters)[valid_indices]
|
157 |
|
158 |
+
# return filtered_matrix, filtered_languages, filtered_clusters
|
159 |
+
|
160 |
+
return dist_matrix, languages, clusters
|
161 |
|
162 |
|
163 |
def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
|