ehengao commited on
Commit
2fd8e3d
·
1 Parent(s): 21e45f2

add initial version for the kmeans assumption dashboard

Browse files
Files changed (3) hide show
  1. README.md +1 -0
  2. app.py +165 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -9,5 +9,6 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
 
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  license: mit
11
  ---
12
+ This dashboard is a live demonstration of the scikit-learn example at https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This dashboard is a live demonstration of the sklearn document at
2
+ https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py
3
+ """
4
+ import numpy as np
5
+ import typing as tp
6
+ import gradio as gr
7
+ from sklearn.datasets import make_blobs
8
+ from sklearn.cluster import KMeans
9
+ import matplotlib.pyplot as plt
10
+
11
# Title shown in the browser tab and as the dashboard heading.
title = "Demonstration of k-means assumptions"
# Fixed seed so the generated blobs and k-means results are reproducible.
random_state = 170
# Linear map used to shear the blobs in the anisotropic scenario.
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
14
+
15
# Defines 4 Apps, one for each demo scenario.
class App:
    """Abstract base class for one dashboard scenario.

    Subclasses provide a display ``name``, a UI ``description``, and
    implementations of the two hooks below.
    """

    name: tp.ClassVar[str]
    description: tp.ClassVar[str]

    def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]:
        """Return ``(X, y)``: sample coordinates and ground-truth labels."""
        raise NotImplementedError()

    def kmeans_predict(self, n_cluster: int, X: np.ndarray) -> np.ndarray:
        """Return k-means cluster labels for ``X`` using ``n_cluster`` clusters."""
        raise NotImplementedError()
25
+
26
class MixGaussianBlobs(App):
    """Baseline scenario: three isotropic Gaussian blobs."""

    name = "Mixture of Gaussian Blobs"
    description = (
        "In a real setting there is no uniquely defined true number of clusters. "
        "An appropriate number of clusters has to be decided from data-based criteria"
        " and knowledge of the intended goal."
    )

    def make_data(self, n_samples):
        # Draw the standard three-blob sample used by the sklearn example.
        return make_blobs(n_samples=n_samples, random_state=random_state)

    def kmeans_predict(self, n_clusters, X):
        # Fit and label in one step; the fixed seed keeps runs reproducible.
        estimator = KMeans(
            n_clusters=n_clusters, n_init="auto", random_state=random_state
        )
        return estimator.fit_predict(X)
41
+
42
+
43
class AnisoDistBlobs(MixGaussianBlobs):
    """Blobs sheared by a linear map, breaking k-means' isotropy assumption."""

    name = "Anisotropically Distributed Blobs"
    description = (
        "k-means consists of minimizing sample’s euclidean distances to the centroid of the"
        " cluster they are assigned to. As a consequence, k-means is more appropriate for "
        "clusters that are isotropic and normally distributed (i.e. spherical gaussians)"
    )

    def make_data(self, n_samples):
        # Start from the baseline blobs, then shear them with the
        # module-level transformation matrix.
        X, y = super().make_data(n_samples=n_samples)
        return np.dot(X, transformation), y
55
+
56
+
57
class UnequalVariance(MixGaussianBlobs):
    """Blobs whose three clusters have different spreads."""

    name = "Unequal Variance"
    description = (
        "k-means is equivalent to taking the maximum likelihood estimator for a 'mixture' "
        "of k gaussian distributions with the same variances but with possibly different "
        " means."
    )

    def make_data(self, n_samples):
        # Per-cluster standard deviations violate the equal-variance assumption.
        spreads = [1.0, 2.5, 0.5]
        return make_blobs(
            n_samples=n_samples, cluster_std=spreads, random_state=random_state
        )
69
+
70
+
71
class UnevenlySizedBlobs(MixGaussianBlobs):
    """Blobs subsampled to very different sizes (500 / 100 / 10 points)."""

    name = "Unevenly Sized Blobs"
    description = (
        "There is no theoretical result about k-means that states that it requires similar"
        " cluster sizes to perform well, yet minimizing euclidean distances does mean that"
        " the more sparse and high-dimensional the problem is, the higher is the need to run "
        "the algorithm with different centroid seeds to ensure a global minimal inertia."
    )

    # Number of points kept from each of the three baseline clusters.
    _sizes = (500, 100, 10)

    def make_data(self, n_samples):
        X, y = super().make_data(n_samples=n_samples)
        # Keep a different number of points per cluster to unbalance the sizes.
        parts = [X[y == label][:size] for label, size in enumerate(self._sizes)]
        X_filter = np.vstack(parts)
        # Rebuild the labels from the actual slice lengths so X and y always
        # agree, even if a cluster yields fewer points than requested (the
        # original hard-coded [0]*500 + [1]*100 + [2]*10 could go out of sync).
        y_filter = np.concatenate(
            [np.full(len(part), label) for label, part in enumerate(parts)]
        )
        return X_filter, y_filter
93
+
94
+
95
# Registry of scenario instances, keyed by their display name.
_apps = [
    MixGaussianBlobs(),
    AnisoDistBlobs(),
    UnequalVariance(),
    UnevenlySizedBlobs(),
]
apps = {app.name: app for app in _apps}
# Radio-button choices, in registry order (dicts preserve insertion order).
data_choices = list(apps)
104
+
105
+
106
# Define the callback triggered when a button or a slider is used by the user.
def fn(data_choice, n_samples, n_clusters):
    """Build the ground-truth and k-means figures for the chosen scenario.

    Parameters
    ----------
    data_choice : str
        Display name of the scenario (a key of ``apps``).
    n_samples : int
        Number of points to sample.
    n_clusters : int
        Number of clusters k-means should look for.

    Returns
    -------
    tuple
        Markdown description string, ground-truth figure, k-means figure.
    """
    # Find the app and create sample data based on the user choice.
    app = apps[data_choice]
    X, y = app.make_data(n_samples)

    # Ground-truth scatter, colored by the generating cluster labels.
    fig_sample, ax_sample = plt.subplots()
    ax_sample.set_title(app.name)
    ax_sample.scatter(X[:, 0], X[:, 1], c=y)

    # K-means scatter, colored by the predicted labels.
    y_pred = app.kmeans_predict(n_clusters, X)
    fig_pred, ax_pred = plt.subplots()
    ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred)
    ax_pred.set_title(f"Unexpected KMeans Clusters (n_cluster={n_clusters})")

    # Deregister both figures from pyplot so repeated callbacks in the
    # long-running server do not accumulate open figures; the Figure
    # objects themselves stay alive for Gradio to render.
    plt.close(fig_sample)
    plt.close(fig_pred)

    return f"## {app.description}", fig_sample, fig_pred
122
+
123
+
124
# Define the dashboard layout and wire every control to the callback.
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(
        "This demo is based on "
        "[sklearn document](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py)."
        " It is meant to illustrate how K-Means can produce unexpected clusters in 4 different data sets"
    )
    # Scenario selector.
    with gr.Row():
        data_choice = gr.Radio(
            choices=data_choices,
            value=data_choices[0],
        )
    # Sampling / clustering parameters.
    with gr.Row():
        n_samples = gr.Slider(
            minimum=1500, maximum=3000, step=50, label="Number of Samples"
        )
        n_clusters = gr.Slider(minimum=2, maximum=8, step=1, label="Number of Clusters")
    with gr.Accordion("Description"):
        description = gr.Markdown(label="Description")
    # Side-by-side plots: ground truth vs. k-means prediction.
    with gr.Row():
        plot_sample = gr.Plot(label="Ground Truth Cluster")
        plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster")

    # Every control re-runs the same callback with the same inputs/outputs,
    # so register the handler in a loop instead of repeating it three times.
    for control in (data_choice, n_samples, n_clusters):
        control.change(
            fn=fn,
            inputs=[data_choice, n_samples, n_clusters],
            outputs=[description, plot_sample, plot_kmeans],
        )


demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ scikit-learn==1.2.2
2
+ matplotlib==3.7.1