File size: 3,289 Bytes
f9c0284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5af9bb
 
 
 
 
 
 
f9c0284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from io import BytesIO

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.cluster import KMeans


def cluster_sites(
    df: pd.DataFrame,
    lat_col: str,
    lon_col: str,
    region_col: str,
    max_sites: int = 25,
    mix_regions: bool = False,
):
    clusters = []
    cluster_id = 0

    if not mix_regions:
        grouped = df.groupby(region_col)
    else:
        grouped = [("All", df)]

    for region, group in grouped:
        coords = group[[lat_col, lon_col]].to_numpy()
        n_clusters = max(1, int(np.ceil(len(group) / max_sites)))

        if len(group) < max_sites:
            labels = np.zeros(len(group), dtype=int)
        else:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(coords)

        group = group.copy()
        group["Cluster"] = [f"C{cluster_id + l}" for l in labels]
        clusters.append(group)
        cluster_id += len(set(labels))

    return pd.concat(clusters)


def to_excel(df: pd.DataFrame) -> bytes:
    output = BytesIO()
    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False, sheet_name="Clusters")
    return output.getvalue()


st.title("Automatic Site Clustering App")

# Add description
st.write(
    """This app allows you to cluster sites based on their latitude and longitude.
    **Please choose a file containing the latitude and longitude region and site code columns.**
                      """
)

uploaded_file = st.file_uploader("Upload your Excel file", type=["xlsx"])

if uploaded_file:
    df = pd.read_excel(uploaded_file)
    st.write("Sample of uploaded data:", df.head())

    columns = df.columns.tolist()

    with st.form("clustering_form"):
        lat_col = st.selectbox("Select Latitude column", columns)
        lon_col = st.selectbox("Select Longitude column", columns)
        region_col = st.selectbox("Select Region column", columns)
        code_col = st.selectbox("Select Site Code column", columns)
        max_sites = st.number_input(
            "Max sites per cluster", min_value=5, max_value=100, value=25
        )
        mix_regions = st.checkbox(
            "Allow mixing different regions in clusters", value=False
        )
        submitted = st.form_submit_button("Run Clustering")

    if submitted:
        clustered_df = cluster_sites(
            df, lat_col, lon_col, region_col, max_sites, mix_regions
        )
        st.success("Clustering completed!")
        st.write(clustered_df.head())

        # Plot
        fig = px.scatter_map(
            clustered_df,
            lat=lat_col,
            lon=lon_col,
            color="Cluster",
            hover_name=code_col,
            hover_data=[region_col],
            zoom=5,
            height=600,
        )
        fig.update_layout(mapbox_style="open-street-map")
        st.plotly_chart(fig)

        # Download button
        st.download_button(
            label="Download clustered Excel file",
            data=to_excel(clustered_df),
            file_name="clustered_sites.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            on_click="ignore",
            type="primary",
        )