Improve multi distance calculator
Browse files- apps/multi_points_distance_calculator.py +71 -24
- utils/utils_functions.py +111 -26
apps/multi_points_distance_calculator.py
CHANGED
@@ -15,16 +15,16 @@ st.write(
|
|
15 |
dataset1_sample_file_path = "samples/Dataset1.xlsx"
|
16 |
dataset2_sample_file_path = "samples/Dataset2.xlsx"
|
17 |
|
18 |
-
|
19 |
|
20 |
-
with
|
21 |
st.download_button(
|
22 |
label="Dataset1 Sample File",
|
23 |
data=open(dataset1_sample_file_path, "rb").read(),
|
24 |
file_name="Dataset1.xlsx",
|
25 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
26 |
)
|
27 |
-
with
|
28 |
st.download_button(
|
29 |
label="Dataset2 Sample File",
|
30 |
data=open(dataset2_sample_file_path, "rb").read(),
|
@@ -32,51 +32,98 @@ with col2:
|
|
32 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
33 |
)
|
34 |
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
# Upload Dataset 2
|
41 |
-
st.subheader("Upload Dataset 2 (Comparison Points)")
|
42 |
-
file2 = st.file_uploader("Upload second dataset (Excel)", type=["xlsx"], key="file2")
|
43 |
|
44 |
if file1 and file2:
|
|
|
45 |
try:
|
46 |
# Read the datasets
|
47 |
df1 = pd.read_excel(file1)
|
48 |
df2 = pd.read_excel(file2)
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
lat_col2 = st.selectbox("Select 'Latitude' Column", df2.columns, key="lat2")
|
58 |
-
long_col2 = st.selectbox("Select 'Longitude' Column", df2.columns, key="long2")
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
# Calculate distances when button is clicked
|
61 |
if st.button("Calculate Distances"):
|
62 |
-
df_distances, df_closest = calculate_distances(
|
63 |
-
df1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
)
|
65 |
|
66 |
-
# Display all distances
|
67 |
-
st.subheader("All Distances")
|
68 |
-
st.dataframe(df_distances)
|
69 |
|
70 |
# Display closest points
|
71 |
st.subheader("Closest Matches")
|
72 |
st.dataframe(df_closest)
|
|
|
|
|
73 |
|
74 |
-
# Downloadable CSV
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
st.download_button(
|
76 |
label="Download Closest Matches as CSV",
|
77 |
data=df_closest.to_csv(index=False),
|
78 |
file_name="closest_matches.csv",
|
79 |
mime="text/csv",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
)
|
81 |
|
82 |
except Exception as e:
|
|
|
15 |
dataset1_sample_file_path = "samples/Dataset1.xlsx"
|
16 |
dataset2_sample_file_path = "samples/Dataset2.xlsx"
|
17 |
|
18 |
+
download_col1, download_col2 = st.columns(2)
|
19 |
|
20 |
+
with download_col1:
|
21 |
st.download_button(
|
22 |
label="Dataset1 Sample File",
|
23 |
data=open(dataset1_sample_file_path, "rb").read(),
|
24 |
file_name="Dataset1.xlsx",
|
25 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
26 |
)
|
27 |
+
with download_col2:
|
28 |
st.download_button(
|
29 |
label="Dataset2 Sample File",
|
30 |
data=open(dataset2_sample_file_path, "rb").read(),
|
|
|
32 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
33 |
)
|
34 |
|
35 |
+
upload_data1_col, upload_data2_col = st.columns(2)
|
36 |
|
37 |
+
with upload_data1_col:
|
38 |
+
# Upload Dataset 1
|
39 |
+
st.subheader("Upload Dataset 1 (Reference Points)")
|
40 |
+
file1 = st.file_uploader("Upload first dataset (Excel)", type=["xlsx"], key="file1")
|
41 |
+
|
42 |
+
with upload_data2_col:
|
43 |
+
# Upload Dataset 2
|
44 |
+
st.subheader("Upload Dataset 2 (Comparison Points)")
|
45 |
+
file2 = st.file_uploader(
|
46 |
+
"Upload second dataset (Excel)", type=["xlsx"], key="file2"
|
47 |
+
)
|
48 |
|
|
|
|
|
|
|
49 |
|
50 |
if file1 and file2:
|
51 |
+
param_col1, param_col2 = st.columns(2)
|
52 |
try:
|
53 |
# Read the datasets
|
54 |
df1 = pd.read_excel(file1)
|
55 |
df2 = pd.read_excel(file2)
|
56 |
|
57 |
+
with param_col1:
|
58 |
+
st.subheader("Select Columns for Dataset 1")
|
59 |
+
code_col1 = st.selectbox("Select 'CODE' Column", df1.columns, key="code1")
|
60 |
+
lat_col1 = st.selectbox("Select 'Latitude' Column", df1.columns, key="lat1")
|
61 |
+
long_col1 = st.selectbox(
|
62 |
+
"Select 'Longitude' Column", df1.columns, key="long1"
|
63 |
+
)
|
|
|
|
|
64 |
|
65 |
+
with param_col2:
|
66 |
+
st.subheader("Select Columns for Dataset 2")
|
67 |
+
code_col2 = st.selectbox("Select 'CODE' Column", df2.columns, key="code2")
|
68 |
+
lat_col2 = st.selectbox("Select 'Latitude' Column", df2.columns, key="lat2")
|
69 |
+
long_col2 = st.selectbox(
|
70 |
+
"Select 'Longitude' Column", df2.columns, key="long2"
|
71 |
+
)
|
72 |
+
min_distance = st.number_input(
|
73 |
+
"Minimum Distance (km)", min_value=0.0, value=5.0
|
74 |
+
)
|
75 |
# Calculate distances when button is clicked
|
76 |
if st.button("Calculate Distances"):
|
77 |
+
df_distances, df_closest, df_closest_min_distance = calculate_distances(
|
78 |
+
df1,
|
79 |
+
df2,
|
80 |
+
code_col1,
|
81 |
+
lat_col1,
|
82 |
+
long_col1,
|
83 |
+
code_col2,
|
84 |
+
lat_col2,
|
85 |
+
long_col2,
|
86 |
+
min_distance,
|
87 |
)
|
88 |
|
89 |
+
# # Display all distances
|
90 |
+
# st.subheader("All Distances")
|
91 |
+
# st.dataframe(df_distances)
|
92 |
|
93 |
# Display closest points
|
94 |
st.subheader("Closest Matches")
|
95 |
st.dataframe(df_closest)
|
96 |
+
st.subheader("Closest Matches below Min Distance")
|
97 |
+
st.dataframe(df_closest_min_distance)
|
98 |
|
99 |
+
# Downloadable All distances CSV
|
100 |
+
st.download_button(
|
101 |
+
label="Download All Distances as CSV",
|
102 |
+
data=df_distances.to_csv(index=False),
|
103 |
+
file_name="all_distances.csv",
|
104 |
+
mime="text/csv",
|
105 |
+
on_click="ignore",
|
106 |
+
type="primary",
|
107 |
+
)
|
108 |
+
|
109 |
+
# Downloadable Closest matches CSV
|
110 |
st.download_button(
|
111 |
label="Download Closest Matches as CSV",
|
112 |
data=df_closest.to_csv(index=False),
|
113 |
file_name="closest_matches.csv",
|
114 |
mime="text/csv",
|
115 |
+
on_click="ignore",
|
116 |
+
type="primary",
|
117 |
+
)
|
118 |
+
|
119 |
+
# Downloadable Closest matches below Min Distance CSV
|
120 |
+
st.download_button(
|
121 |
+
label=f"Download Closest Matches below {min_distance}km as CSV",
|
122 |
+
data=df_closest_min_distance.to_csv(index=False),
|
123 |
+
file_name=f"closest_matches_{min_distance}km.csv",
|
124 |
+
mime="text/csv",
|
125 |
+
on_click="ignore",
|
126 |
+
type="primary",
|
127 |
)
|
128 |
|
129 |
except Exception as e:
|
utils/utils_functions.py
CHANGED
@@ -1,41 +1,126 @@
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
from geopy.distance import geodesic
|
3 |
|
4 |
-
|
5 |
# Function to calculate distances while preserving all original columns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
def calculate_distances(
|
7 |
df1: pd.DataFrame,
|
8 |
df2: pd.DataFrame,
|
9 |
-
code_col1,
|
10 |
-
lat_col1,
|
11 |
-
long_col1,
|
12 |
-
code_col2,
|
13 |
-
lat_col2,
|
14 |
-
long_col2,
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
distances = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
coord1 = (row1[lat_col1], row1[long_col1])
|
21 |
-
coord2 = (row2[lat_col2], row2[long_col2])
|
22 |
-
distance_km = geodesic(coord1, coord2).kilometers # Compute distance
|
23 |
-
|
24 |
-
# Combine all original columns + distance
|
25 |
-
combined_row = {
|
26 |
-
**row1.to_dict(), # Keep all columns from Dataset1
|
27 |
-
**{
|
28 |
-
f"{col}_Dataset2": row2[col] for col in df2.columns
|
29 |
-
}, # Keep all columns from Dataset2
|
30 |
-
"Distance_km": distance_km,
|
31 |
-
}
|
32 |
-
distances.append(combined_row)
|
33 |
|
34 |
df_distances = pd.DataFrame(distances)
|
35 |
|
36 |
-
# Find
|
37 |
-
df_closest
|
38 |
df_distances.groupby(code_col1)["Distance_km"].idxmin()
|
39 |
]
|
40 |
|
41 |
-
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
|
3 |
import pandas as pd
|
4 |
from geopy.distance import geodesic
|
5 |
|
|
|
6 |
# Function to calculate distances while preserving all original columns
|
7 |
+
# def calculate_distances(
|
8 |
+
# df1: pd.DataFrame,
|
9 |
+
# df2: pd.DataFrame,
|
10 |
+
# code_col1,
|
11 |
+
# lat_col1,
|
12 |
+
# long_col1,
|
13 |
+
# code_col2,
|
14 |
+
# lat_col2,
|
15 |
+
# long_col2,
|
16 |
+
# min_distance: int = 1,
|
17 |
+
# ):
|
18 |
+
# distances = []
|
19 |
+
|
20 |
+
# for _, row1 in df1.iterrows():
|
21 |
+
# for _, row2 in df2.iterrows():
|
22 |
+
# coord1 = (row1[lat_col1], row1[long_col1])
|
23 |
+
# coord2 = (row2[lat_col2], row2[long_col2])
|
24 |
+
# distance_km = geodesic(coord1, coord2).kilometers # Compute distance
|
25 |
+
|
26 |
+
# # Combine all original columns + distance
|
27 |
+
# combined_row = {
|
28 |
+
# **row1.to_dict(), # Keep all columns from Dataset1
|
29 |
+
# **{
|
30 |
+
# f"{col}_Dataset2": row2[col] for col in df2.columns
|
31 |
+
# }, # Keep all columns from Dataset2
|
32 |
+
# "Distance_km": distance_km,
|
33 |
+
# }
|
34 |
+
# distances.append(combined_row)
|
35 |
+
|
36 |
+
# df_distances = pd.DataFrame(distances)
|
37 |
+
|
38 |
+
# # Find the closest point for each Point1
|
39 |
+
# df_closest: pd.DataFrame = df_distances.loc[
|
40 |
+
# df_distances.groupby(code_col1)["Distance_km"].idxmin()
|
41 |
+
# ]
|
42 |
+
|
43 |
+
# # Find the distances below min_distance
|
44 |
+
# df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]
|
45 |
+
|
46 |
+
# return df_distances, df_closest, df_closest_min_distance
|
47 |
+
|
48 |
+
|
49 |
def calculate_distances(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    code_col1: str,
    lat_col1: str,
    long_col1: str,
    code_col2: str,
    lat_col2: str,
    long_col2: str,
    min_distance: float = 1.0,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Compute the geodesic distance between every df1 point and every df2 point.

    Each output row carries all df1 columns unchanged, all df2 columns with a
    ``_Dataset2`` suffix appended to their names, and a ``Distance_km`` column.

    Args:
        df1: First DataFrame containing reference points.
        df2: Second DataFrame containing points to compare.
        code_col1: Column name in df1 containing point identifiers.
        lat_col1: Column name in df1 containing latitude.
        long_col1: Column name in df1 containing longitude.
        code_col2: Column name in df2 containing point identifiers.
        lat_col2: Column name in df2 containing latitude.
        long_col2: Column name in df2 containing longitude.
        min_distance: Distance threshold in kilometers. Despite the name it
            acts as an upper bound: rows with ``Distance_km`` strictly below
            this value are kept in the third returned frame.

    Returns:
        tuple: ``(all_distances, closest_matches, matches_below_threshold)``
            - all_distances: one row per (df1 row, df2 row) pair whose
              distance could be computed.
            - closest_matches: for each distinct ``code_col1`` value, the row
              of all_distances with the smallest ``Distance_km``.
            - matches_below_threshold: rows of all_distances (every pair, not
              only the closest matches) with ``Distance_km < min_distance``.

    Raises:
        ValueError: If a required column is missing from df1 or df2, or if no
            coordinate pair produced a distance (including empty inputs).
    """
    # Validate input columns before doing any work.
    required_cols_1 = {code_col1, lat_col1, long_col1}
    required_cols_2 = {code_col2, lat_col2, long_col2}

    missing_1 = required_cols_1 - set(df1.columns)
    if missing_1:
        raise ValueError(f"df1 is missing required columns: {missing_1}")
    missing_2 = required_cols_2 - set(df2.columns)
    if missing_2:
        raise ValueError(f"df2 is missing required columns: {missing_2}")

    # (latitude, longitude) pairs in row order for each dataset.
    coords1 = list(zip(df1[lat_col1].tolist(), df1[long_col1].tolist()))
    coords2 = list(zip(df2[lat_col2].tolist(), df2[long_col2].tolist()))

    # Materialize each df2 row once instead of once per (df1 row, df2 row)
    # pair; the suffixed dict does not depend on the outer loop.
    rows2 = []
    for j in range(len(df2)):
        row2 = df2.iloc[j]
        rows2.append({f"{col}_Dataset2": row2[col] for col in df2.columns})

    # Calculate all pairwise distances.
    distances = []
    for i, coord1 in enumerate(coords1):
        # Same hoisting for the df1 side: one dict per df1 row.
        row1 = df1.iloc[i].to_dict()
        for coord2, row2_values in zip(coords2, rows2):
            # Keep the try body to the single call that is expected to raise.
            try:
                distance_km = geodesic(coord1, coord2).kilometers
            except ValueError as e:
                # Best effort: skip the pair and keep going. stacklevel=2
                # attributes the warning to the caller of this function.
                warnings.warn(
                    f"Skipping invalid coordinates: {coord1} or {coord2}: {e}",
                    stacklevel=2,
                )
                continue
            distances.append(
                {**row1, **row2_values, "Distance_km": distance_km}
            )

    if not distances:
        raise ValueError("No valid coordinate pairs were processed")

    df_distances = pd.DataFrame(distances)

    # Find closest matches: the minimum-distance row per df1 identifier.
    df_closest = df_distances.loc[
        df_distances.groupby(code_col1)["Distance_km"].idxmin()
    ]

    # Keep every pair that lies strictly below the threshold.
    df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]

    return df_distances, df_closest, df_closest_min_distance
|