DavMelchi committed on
Commit
588f1c0
·
1 Parent(s): c03a5ca

Improve multi distance calculator

Browse files
apps/multi_points_distance_calculator.py CHANGED
@@ -15,16 +15,16 @@ st.write(
15
  dataset1_sample_file_path = "samples/Dataset1.xlsx"
16
  dataset2_sample_file_path = "samples/Dataset2.xlsx"
17
 
18
- col1, col2, col3 = st.columns(3)
19
 
20
- with col1:
21
  st.download_button(
22
  label="Dataset1 Sample File",
23
  data=open(dataset1_sample_file_path, "rb").read(),
24
  file_name="Dataset1.xlsx",
25
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
26
  )
27
- with col2:
28
  st.download_button(
29
  label="Dataset2 Sample File",
30
  data=open(dataset2_sample_file_path, "rb").read(),
@@ -32,51 +32,98 @@ with col2:
32
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
33
  )
34
 
 
35
 
36
- # Upload Dataset 1
37
- st.subheader("Upload Dataset 1 (Reference Points)")
38
- file1 = st.file_uploader("Upload first dataset (Excel)", type=["xlsx"], key="file1")
 
 
 
 
 
 
 
 
39
 
40
- # Upload Dataset 2
41
- st.subheader("Upload Dataset 2 (Comparison Points)")
42
- file2 = st.file_uploader("Upload second dataset (Excel)", type=["xlsx"], key="file2")
43
 
44
  if file1 and file2:
 
45
  try:
46
  # Read the datasets
47
  df1 = pd.read_excel(file1)
48
  df2 = pd.read_excel(file2)
49
 
50
- st.subheader("Select Columns for Dataset 1")
51
- code_col1 = st.selectbox("Select 'CODE' Column", df1.columns, key="code1")
52
- lat_col1 = st.selectbox("Select 'Latitude' Column", df1.columns, key="lat1")
53
- long_col1 = st.selectbox("Select 'Longitude' Column", df1.columns, key="long1")
54
-
55
- st.subheader("Select Columns for Dataset 2")
56
- code_col2 = st.selectbox("Select 'CODE' Column", df2.columns, key="code2")
57
- lat_col2 = st.selectbox("Select 'Latitude' Column", df2.columns, key="lat2")
58
- long_col2 = st.selectbox("Select 'Longitude' Column", df2.columns, key="long2")
59
 
 
 
 
 
 
 
 
 
 
 
60
  # Calculate distances when button is clicked
61
  if st.button("Calculate Distances"):
62
- df_distances, df_closest = calculate_distances(
63
- df1, df2, code_col1, lat_col1, long_col1, code_col2, lat_col2, long_col2
 
 
 
 
 
 
 
 
64
  )
65
 
66
- # Display all distances
67
- st.subheader("All Distances")
68
- st.dataframe(df_distances)
69
 
70
  # Display closest points
71
  st.subheader("Closest Matches")
72
  st.dataframe(df_closest)
 
 
73
 
74
- # Downloadable CSV
 
 
 
 
 
 
 
 
 
 
75
  st.download_button(
76
  label="Download Closest Matches as CSV",
77
  data=df_closest.to_csv(index=False),
78
  file_name="closest_matches.csv",
79
  mime="text/csv",
 
 
 
 
 
 
 
 
 
 
 
 
80
  )
81
 
82
  except Exception as e:
 
15
  dataset1_sample_file_path = "samples/Dataset1.xlsx"
16
  dataset2_sample_file_path = "samples/Dataset2.xlsx"
17
 
18
+ download_col1, download_col2 = st.columns(2)
19
 
20
+ with download_col1:
21
  st.download_button(
22
  label="Dataset1 Sample File",
23
  data=open(dataset1_sample_file_path, "rb").read(),
24
  file_name="Dataset1.xlsx",
25
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
26
  )
27
+ with download_col2:
28
  st.download_button(
29
  label="Dataset2 Sample File",
30
  data=open(dataset2_sample_file_path, "rb").read(),
 
32
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
33
  )
34
 
35
+ upload_data1_col, upload_data2_col = st.columns(2)
36
 
37
+ with upload_data1_col:
38
+ # Upload Dataset 1
39
+ st.subheader("Upload Dataset 1 (Reference Points)")
40
+ file1 = st.file_uploader("Upload first dataset (Excel)", type=["xlsx"], key="file1")
41
+
42
+ with upload_data2_col:
43
+ # Upload Dataset 2
44
+ st.subheader("Upload Dataset 2 (Comparison Points)")
45
+ file2 = st.file_uploader(
46
+ "Upload second dataset (Excel)", type=["xlsx"], key="file2"
47
+ )
48
 
 
 
 
49
 
50
  if file1 and file2:
51
+ param_col1, param_col2 = st.columns(2)
52
  try:
53
  # Read the datasets
54
  df1 = pd.read_excel(file1)
55
  df2 = pd.read_excel(file2)
56
 
57
+ with param_col1:
58
+ st.subheader("Select Columns for Dataset 1")
59
+ code_col1 = st.selectbox("Select 'CODE' Column", df1.columns, key="code1")
60
+ lat_col1 = st.selectbox("Select 'Latitude' Column", df1.columns, key="lat1")
61
+ long_col1 = st.selectbox(
62
+ "Select 'Longitude' Column", df1.columns, key="long1"
63
+ )
 
 
64
 
65
+ with param_col2:
66
+ st.subheader("Select Columns for Dataset 2")
67
+ code_col2 = st.selectbox("Select 'CODE' Column", df2.columns, key="code2")
68
+ lat_col2 = st.selectbox("Select 'Latitude' Column", df2.columns, key="lat2")
69
+ long_col2 = st.selectbox(
70
+ "Select 'Longitude' Column", df2.columns, key="long2"
71
+ )
72
+ min_distance = st.number_input(
73
+ "Minimum Distance (km)", min_value=0.0, value=5.0
74
+ )
75
  # Calculate distances when button is clicked
76
  if st.button("Calculate Distances"):
77
+ df_distances, df_closest, df_closest_min_distance = calculate_distances(
78
+ df1,
79
+ df2,
80
+ code_col1,
81
+ lat_col1,
82
+ long_col1,
83
+ code_col2,
84
+ lat_col2,
85
+ long_col2,
86
+ min_distance,
87
  )
88
 
89
+ # # Display all distances
90
+ # st.subheader("All Distances")
91
+ # st.dataframe(df_distances)
92
 
93
  # Display closest points
94
  st.subheader("Closest Matches")
95
  st.dataframe(df_closest)
96
+ st.subheader("Closest Matches below Min Distance")
97
+ st.dataframe(df_closest_min_distance)
98
 
99
+ # Downloadable All distances CSV
100
+ st.download_button(
101
+ label="Download All Distances as CSV",
102
+ data=df_distances.to_csv(index=False),
103
+ file_name="all_distances.csv",
104
+ mime="text/csv",
105
+ on_click="ignore",
106
+ type="primary",
107
+ )
108
+
109
+ # Downloadable Closest matches CSV
110
  st.download_button(
111
  label="Download Closest Matches as CSV",
112
  data=df_closest.to_csv(index=False),
113
  file_name="closest_matches.csv",
114
  mime="text/csv",
115
+ on_click="ignore",
116
+ type="primary",
117
+ )
118
+
119
+ # Downloadable Closest matches below Min Distance CSV
120
+ st.download_button(
121
+ label=f"Download Closest Matches below {min_distance}km as CSV",
122
+ data=df_closest_min_distance.to_csv(index=False),
123
+ file_name=f"closest_matches_{min_distance}km.csv",
124
+ mime="text/csv",
125
+ on_click="ignore",
126
+ type="primary",
127
  )
128
 
129
  except Exception as e:
utils/utils_functions.py CHANGED
@@ -1,41 +1,126 @@
 
 
1
  import pandas as pd
2
  from geopy.distance import geodesic
3
 
4
-
5
  # Function to calculate distances while preserving all original columns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def calculate_distances(
7
  df1: pd.DataFrame,
8
  df2: pd.DataFrame,
9
- code_col1,
10
- lat_col1,
11
- long_col1,
12
- code_col2,
13
- lat_col2,
14
- long_col2,
15
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  distances = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- for _, row1 in df1.iterrows():
19
- for _, row2 in df2.iterrows():
20
- coord1 = (row1[lat_col1], row1[long_col1])
21
- coord2 = (row2[lat_col2], row2[long_col2])
22
- distance_km = geodesic(coord1, coord2).kilometers # Compute distance
23
-
24
- # Combine all original columns + distance
25
- combined_row = {
26
- **row1.to_dict(), # Keep all columns from Dataset1
27
- **{
28
- f"{col}_Dataset2": row2[col] for col in df2.columns
29
- }, # Keep all columns from Dataset2
30
- "Distance_km": distance_km,
31
- }
32
- distances.append(combined_row)
33
 
34
  df_distances = pd.DataFrame(distances)
35
 
36
- # Find the closest point for each Point1
37
- df_closest: pd.DataFrame = df_distances.loc[
38
  df_distances.groupby(code_col1)["Distance_km"].idxmin()
39
  ]
40
 
41
- return df_distances, df_closest
 
 
 
 
1
+ import warnings
2
+
3
  import pandas as pd
4
  from geopy.distance import geodesic
5
 
 
6
  # Function to calculate distances while preserving all original columns
7
+ # def calculate_distances(
8
+ # df1: pd.DataFrame,
9
+ # df2: pd.DataFrame,
10
+ # code_col1,
11
+ # lat_col1,
12
+ # long_col1,
13
+ # code_col2,
14
+ # lat_col2,
15
+ # long_col2,
16
+ # min_distance: int = 1,
17
+ # ):
18
+ # distances = []
19
+
20
+ # for _, row1 in df1.iterrows():
21
+ # for _, row2 in df2.iterrows():
22
+ # coord1 = (row1[lat_col1], row1[long_col1])
23
+ # coord2 = (row2[lat_col2], row2[long_col2])
24
+ # distance_km = geodesic(coord1, coord2).kilometers # Compute distance
25
+
26
+ # # Combine all original columns + distance
27
+ # combined_row = {
28
+ # **row1.to_dict(), # Keep all columns from Dataset1
29
+ # **{
30
+ # f"{col}_Dataset2": row2[col] for col in df2.columns
31
+ # }, # Keep all columns from Dataset2
32
+ # "Distance_km": distance_km,
33
+ # }
34
+ # distances.append(combined_row)
35
+
36
+ # df_distances = pd.DataFrame(distances)
37
+
38
+ # # Find the closest point for each Point1
39
+ # df_closest: pd.DataFrame = df_distances.loc[
40
+ # df_distances.groupby(code_col1)["Distance_km"].idxmin()
41
+ # ]
42
+
43
+ # # Find the distance below min_distance
44
+ # df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]
45
+
46
+ # return df_distances, df_closest, df_closest_min_distance
47
+
48
+
49
  def calculate_distances(
50
  df1: pd.DataFrame,
51
  df2: pd.DataFrame,
52
+ code_col1: str,
53
+ lat_col1: str,
54
+ long_col1: str,
55
+ code_col2: str,
56
+ lat_col2: str,
57
+ long_col2: str,
58
+ min_distance: float = 1.0,
59
+ ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
60
+ """
61
+ Calculate distances between points in two datasets and find closest matches.
62
+
63
+ Args:
64
+ df1: First DataFrame containing reference points
65
+ df2: Second DataFrame containing points to compare
66
+ code_col1: Column name in df1 containing point identifiers
67
+ lat_col1: Column name in df1 containing latitude
68
+ long_col1: Column name in df1 containing longitude
69
+ code_col2: Column name in df2 containing point identifiers
70
+ lat_col2: Column name in df2 containing latitude
71
+ long_col2: Column name in df2 containing longitude
72
+ min_distance: Minimum distance threshold in kilometers
73
+
74
+ Returns:
75
+ tuple: (all_distances, closest_matches, matches_below_threshold)
76
+ """
77
+ # Validate input columns
78
+ required_cols_1 = {code_col1, lat_col1, long_col1}
79
+ required_cols_2 = {code_col2, lat_col2, long_col2}
80
+
81
+ if not required_cols_1.issubset(df1.columns):
82
+ raise ValueError(
83
+ f"df1 is missing required columns: {required_cols_1 - set(df1.columns)}"
84
+ )
85
+ if not required_cols_2.issubset(df2.columns):
86
+ raise ValueError(
87
+ f"df2 is missing required columns: {required_cols_2 - set(df2.columns)}"
88
+ )
89
+
90
+ # Convert to list of tuples for vectorized operations
91
+ coords1 = df1[[lat_col1, long_col1]].apply(tuple, axis=1).tolist()
92
+ coords2 = df2[[lat_col2, long_col2]].apply(tuple, axis=1).tolist()
93
+
94
+ # Calculate all pairwise distances
95
  distances = []
96
+ for i, coord1 in enumerate(coords1):
97
+ for j, coord2 in enumerate(coords2):
98
+ try:
99
+ distance_km = geodesic(coord1, coord2).kilometers
100
+ distances.append(
101
+ {
102
+ **df1.iloc[i].to_dict(),
103
+ **{f"{col}_Dataset2": df2.iloc[j][col] for col in df2.columns},
104
+ "Distance_km": distance_km,
105
+ }
106
+ )
107
+ except ValueError as e:
108
+ warnings.warn(
109
+ f"Skipping invalid coordinates: {coord1} or {coord2}: {e}"
110
+ )
111
+ continue
112
 
113
+ if not distances:
114
+ raise ValueError("No valid coordinate pairs were processed")
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  df_distances = pd.DataFrame(distances)
117
 
118
+ # Find closest matches
119
+ df_closest = df_distances.loc[
120
  df_distances.groupby(code_col1)["Distance_km"].idxmin()
121
  ]
122
 
123
+ # Filter by minimum distance
124
+ df_closest_min_distance = df_distances[df_distances["Distance_km"] < min_distance]
125
+
126
+ return df_distances, df_closest, df_closest_min_distance