improve clustering app
- apps/clustering.py +138 -15
- requirements.txt +0 -0
- samples/Site_Clustering.xlsx +0 -0
apps/clustering.py
CHANGED
@@ -4,10 +4,11 @@ import numpy as np
 import pandas as pd
 import plotly.express as px
 import streamlit as st
+from hilbertcurve.hilbertcurve import HilbertCurve
 from sklearn.cluster import KMeans
 
 
-def cluster_sites(
+def cluster_sites_hilbert_curve_same_size(
     df: pd.DataFrame,
     lat_col: str,
     lon_col: str,
@@ -23,20 +24,103 @@ def cluster_sites(
     else:
         grouped = [("All", df)]
 
+    # Create Hilbert Curve (higher p = more precision)
+    p = 16  # Adjust based on your coordinate precision needs
+    hilbert_curve = HilbertCurve(p, 2)  # 2D curve
+
     for region, group in grouped:
-
-
+        if len(group) == 0:
+            continue
 
-
-
-
-        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-        labels = kmeans.fit_predict(coords)
+        # Normalize coordinates to [0, 2^p-1] range
+        lat_min, lat_max = group[lat_col].min(), group[lat_col].max()
+        lon_min, lon_max = group[lon_col].min(), group[lon_col].max()
 
         group = group.copy()
-        group["
-
-
+        group["x"] = ((group[lat_col] - lat_min) / (lat_max - lat_min + 1e-10)) * (
+            2**p - 1
+        )
+        group["y"] = ((group[lon_col] - lon_min) / (lon_max - lon_min + 1e-10)) * (
+            2**p - 1
+        )
+
+        # Calculate Hilbert distance
+        group["hilbert"] = group.apply(
+            lambda row: hilbert_curve.distance_from_point(
+                [int(row["x"]), int(row["y"])]
+            ),
+            axis=1,
+        )
+
+        # Sort by Hilbert value
+        group = group.sort_values("hilbert")
+
+        # Create fixed-size clusters
+        for i in range(0, len(group), max_sites):
+            cluster = group.iloc[i : i + max_sites].copy()
+            cluster["Cluster"] = f"C{cluster_id}"
+            clusters.append(cluster)
+            cluster_id += 1
+
+    result = pd.concat(clusters)
+    return result.drop(columns=["x", "y", "hilbert"], errors="ignore")
+
+
+def cluster_sites_kmeans_lower_to_fixed_size(
+    df: pd.DataFrame,
+    lat_col: str,
+    lon_col: str,
+    region_col: str,
+    max_sites: int = 25,
+    mix_regions: bool = False,
+):
+    clusters = []
+    cluster_id = 0
+
+    if not mix_regions:
+        grouped = df.groupby(region_col)
+    else:
+        grouped = [("All", df)]
+
+    for region, group in grouped:
+        coords = group[[lat_col, lon_col]].to_numpy()
+        remaining_sites = group.copy()
+
+        while len(remaining_sites) > 0:
+            # Calculate number of clusters needed for remaining sites
+            n_remaining = len(remaining_sites)
+            n_clusters = max(1, int(np.ceil(n_remaining / max_sites)))
+
+            if n_remaining <= max_sites:
+                # If remaining sites can fit in one cluster
+                cluster_group = remaining_sites.copy()
+                cluster_group["Cluster"] = f"C{cluster_id}"
+                clusters.append(cluster_group)
+                cluster_id += 1
+                break
+            else:
+                # Apply KMeans to remaining sites
+                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
+                labels = kmeans.fit_predict(
+                    remaining_sites[[lat_col, lon_col]].to_numpy()
+                )
+
+                # Split into clusters and check sizes
+                temp_df = remaining_sites.copy()
+                temp_df["Cluster"] = labels
+                temp_df["Temp_Cluster"] = labels
+
+                for cluster_num in range(n_clusters):
+                    cluster_group = temp_df[temp_df["Temp_Cluster"] == cluster_num]
+                    if len(cluster_group) <= max_sites:
+                        # If cluster is small enough, keep it
+                        cluster_group = cluster_group.drop(columns=["Temp_Cluster"])
+                        cluster_group["Cluster"] = f"C{cluster_id}"
+                        clusters.append(cluster_group)
+                        cluster_id += 1
+                        # Remove these sites from remaining_sites
+                        remaining_sites = remaining_sites.drop(cluster_group.index)
+                    # Else these sites will remain for next iteration
 
     return pd.concat(clusters)
 
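The new cluster_sites_hilbert_curve_same_size helper relies on the hilbertcurve package to turn each normalized (lat, lon) pair into a single index along a space-filling curve; sorting by that index keeps nearby sites adjacent in the list before it is chopped into fixed-size chunks. A minimal sketch of that idea on its own, with made-up toy coordinates (not part of the app's code):

from hilbertcurve.hilbertcurve import HilbertCurve

# Toy points already normalized to the [0, 2**p - 1] integer grid.
p = 4                       # 4 bits per dimension -> a 16x16 grid
curve = HilbertCurve(p, 2)  # 2-D Hilbert curve

points = [(0, 0), (1, 14), (2, 1), (15, 15), (14, 13), (3, 2)]

# Map each 2-D point to its 1-D distance along the curve.
indexed = [(curve.distance_from_point(list(pt)), pt) for pt in points]
indexed.sort()  # sorting by Hilbert distance keeps spatial neighbours adjacent

# Chop the ordered list into equal-size "clusters" of at most 3 points,
# mirroring the max_sites chunking done in the new helper.
max_sites = 3
chunks = [indexed[i : i + max_sites] for i in range(0, len(indexed), max_sites)]
for cluster_id, chunk in enumerate(chunks):
    print(f"C{cluster_id}", [pt for _, pt in chunk])

The KMeans variant added in the same hunk takes the opposite approach: it repeatedly re-runs KMeans on whatever sites have not yet been placed, keeps only the clusters that already fit under max_sites, and loops until everything is assigned, so clusters stay at or below the cap rather than being exactly equal in size.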
@@ -57,7 +141,18 @@ st.write(
     """
 )
 
-
+# Download Sample file
+clustering_sample_file_path = "samples/Site_Clustering.xlsx"
+
+# Create a download button
+st.download_button(
+    label="Download Clustering Sample File",
+    data=open(clustering_sample_file_path, "rb").read(),
+    file_name="Site_Clustering.xlsx",
+    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+)
+
+uploaded_file = st.file_uploader("Upload your Excel file ", type=["xlsx"])
 
 if uploaded_file:
     df = pd.read_excel(uploaded_file)
@@ -73,30 +168,58 @@ if uploaded_file:
         max_sites = st.number_input(
             "Max sites per cluster", min_value=5, max_value=100, value=25
         )
+        cluster_method = st.selectbox(
+            "Select clustering method",
+            ["Hilbert Curve Same Size", "KMeans Lower To Fixed Size"],
+        )
         mix_regions = st.checkbox(
             "Allow mixing different regions in clusters", value=False
         )
         submitted = st.form_submit_button("Run Clustering")
 
     if submitted:
-
-
-
+        if cluster_method == "Hilbert Curve Same Size":
+            clustered_df = cluster_sites_hilbert_curve_same_size(
+                df, lat_col, lon_col, region_col, max_sites, mix_regions
+            )
+        elif cluster_method == "KMeans Lower To Fixed Size":
+            clustered_df = cluster_sites_kmeans_lower_to_fixed_size(
+                df, lat_col, lon_col, region_col, max_sites, mix_regions
+            )
         st.success("Clustering completed!")
         st.write(clustered_df.head())
 
         # Plot
+        clustered_df["size"] = 10
         fig = px.scatter_map(
             clustered_df,
             lat=lat_col,
             lon=lon_col,
             color="Cluster",
+            size="size",
             hover_name=code_col,
             hover_data=[region_col],
             zoom=5,
             height=600,
         )
         fig.update_layout(mapbox_style="open-street-map")
+        fig.update_traces(marker=dict(size=15))
+        st.plotly_chart(fig)
+
+        # Show cluster size per cluster plot
+        cluster_size = clustered_df["Cluster"].value_counts().sort_index()
+        fig = px.bar(cluster_size, x=cluster_size.index, y=cluster_size.values)
+        fig.update_layout(title="Cluster Size")
+        st.plotly_chart(fig)
+
+        # Show cluster size per region plot
+        cluster_size_per_region = (
+            clustered_df.groupby([region_col, "Cluster"])
+            .size()
+            .reset_index(name="count")
+        )
+        fig = px.bar(cluster_size_per_region, x="Cluster", y="count", color=region_col)
+        fig.update_layout(title="Cluster Size per Region")
         st.plotly_chart(fig)
 
         # Download button
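With this hunk the form lets the user pick between the two strategies before running the clustering, and the two new bar charts summarise how many sites land in each cluster and how clusters split across regions. A rough usage sketch outside Streamlit, assuming the two helpers can be imported from apps/clustering.py and using made-up column names and coordinates:

import numpy as np
import pandas as pd

# Assumes the two helpers defined in apps/clustering.py are importable here.
# Column names and coordinates below are hypothetical sample data.
sites = pd.DataFrame(
    {
        "Site_Code": [f"S{i:03d}" for i in range(60)],
        "Latitude": np.random.uniform(24.0, 26.0, 60),
        "Longitude": np.random.uniform(46.0, 48.0, 60),
        "Region": ["North"] * 30 + ["South"] * 30,
    }
)

clustered = cluster_sites_hilbert_curve_same_size(
    sites, "Latitude", "Longitude", "Region", max_sites=25, mix_regions=False
)
# Every cluster holds at most 25 sites; this is the count the new bar chart plots.
print(clustered["Cluster"].value_counts().sort_index())

clustered_km = cluster_sites_kmeans_lower_to_fixed_size(
    sites, "Latitude", "Longitude", "Region", max_sites=25, mix_regions=False
)
# Per-region breakdown, matching the "Cluster Size per Region" chart.
print(clustered_km.groupby(["Region", "Cluster"]).size())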
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
samples/Site_Clustering.xlsx
ADDED
Binary file (39.9 kB)