mshamrai committed on
Commit
e5394a1
·
1 Parent(s): e581b39

chore: disable single-element-cluster filtering in kmeans; drop pickle, TSNE and matplotlib imports from app.py

Browse files
Files changed (2) hide show
  1. app.py +0 -3
  2. utils.py +12 -10
app.py CHANGED
@@ -1,10 +1,7 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
- import pickle
5
  import os
6
- from sklearn.manifold import TSNE
7
- import matplotlib.pyplot as plt
8
  from utils import (plot_distances_tsne,
9
  plot_distances_umap,
10
  cluster_languages_hdbscan,
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
 
4
  import os
 
 
5
  from utils import (plot_distances_tsne,
6
  plot_distances_umap,
7
  cluster_languages_hdbscan,
utils.py CHANGED
@@ -143,19 +143,21 @@ def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
143
  kmeans_model = KMeans(n_clusters=n_clusters, random_state=23)
144
  clusters = kmeans_model.fit_predict(dist_matrix)
145
 
146
- # Count the number of elements in each cluster
147
- cluster_counts = np.bincount(clusters)
148
 
149
- # Identify clusters with more than 1 element
150
- valid_clusters = np.where(cluster_counts > 1)[0]
151
 
152
- # Filter out points belonging to clusters with only 1 element
153
- valid_indices = np.isin(clusters, valid_clusters)
154
- filtered_matrix = dist_matrix[np.ix_(valid_indices, valid_indices)]
155
- filtered_languages = np.array(languages)[valid_indices]
156
- filtered_clusters = np.array(clusters)[valid_indices]
157
 
158
- return filtered_matrix, filtered_languages, filtered_clusters
 
 
159
 
160
 
161
  def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
 
143
  kmeans_model = KMeans(n_clusters=n_clusters, random_state=23)
144
  clusters = kmeans_model.fit_predict(dist_matrix)
145
 
146
+ # # Count the number of elements in each cluster
147
+ # cluster_counts = np.bincount(clusters)
148
 
149
+ # # Identify clusters with more than 1 element
150
+ # valid_clusters = np.where(cluster_counts > 1)[0]
151
 
152
+ # # Filter out points belonging to clusters with only 1 element
153
+ # valid_indices = np.isin(clusters, valid_clusters)
154
+ # filtered_matrix = dist_matrix[np.ix_(valid_indices, valid_indices)]
155
+ # filtered_languages = np.array(languages)[valid_indices]
156
+ # filtered_clusters = np.array(clusters)[valid_indices]
157
 
158
+ # return filtered_matrix, filtered_languages, filtered_clusters
159
+
160
+ return dist_matrix, languages, clusters
161
 
162
 
163
  def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):