Spaces:

mshamrai
/

language-metric-analysis

Sleeping

mshamrai committed on Apr 18

Commit

e5394a1

1 Parent(s): e581b39

chore: rm filtering in kmeans

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,10 +1,7 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
-import pickle
 import os
-from sklearn.manifold import TSNE
-import matplotlib.pyplot as plt
 from utils import (plot_distances_tsne,
                    plot_distances_umap,
                    cluster_languages_hdbscan,

 import gradio as gr
 import pandas as pd
 import numpy as np
 import os
 from utils import (plot_distances_tsne,
                    plot_distances_umap,
                    cluster_languages_hdbscan,

utils.py CHANGED Viewed

@@ -143,19 +143,21 @@ def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
     kmeans_model = KMeans(n_clusters=n_clusters, random_state=23)
     clusters = kmeans_model.fit_predict(dist_matrix)
-    # Count the number of elements in each cluster
-    cluster_counts = np.bincount(clusters)
-    # Identify clusters with more than 1 element
-    valid_clusters = np.where(cluster_counts > 1)[0]
-    # Filter out points belonging to clusters with only 1 element
-    valid_indices = np.isin(clusters, valid_clusters)
-    filtered_matrix = dist_matrix[np.ix_(valid_indices, valid_indices)]
-    filtered_languages = np.array(languages)[valid_indices]
-    filtered_clusters = np.array(clusters)[valid_indices]
-    return filtered_matrix, filtered_languages, filtered_clusters
 def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):

     kmeans_model = KMeans(n_clusters=n_clusters, random_state=23)
     clusters = kmeans_model.fit_predict(dist_matrix)
+    # # Count the number of elements in each cluster
+    # cluster_counts = np.bincount(clusters)
+    # # Identify clusters with more than 1 element
+    # valid_clusters = np.where(cluster_counts > 1)[0]
+    # # Filter out points belonging to clusters with only 1 element
+    # valid_indices = np.isin(clusters, valid_clusters)
+    # filtered_matrix = dist_matrix[np.ix_(valid_indices, valid_indices)]
+    # filtered_languages = np.array(languages)[valid_indices]
+    # filtered_clusters = np.array(clusters)[valid_indices]
+    # return filtered_matrix, filtered_languages, filtered_clusters
+    return dist_matrix, languages, clusters
 def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):