oberbics committed
Commit cf36ecc · verified · Parent: 600aab9

Update app.py

Files changed (1): app.py (+98 −10)
app.py CHANGED
@@ -10,12 +10,14 @@ import time
 import random
 from typing import List, Tuple, Optional
 import io
+import concurrent.futures
+from tqdm import tqdm
 
 # NuExtract API configuration
 API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
 headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
 
-# Geocoding Service
+# Geocoding Service with improved performance
 class GeocodingService:
     def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
         if user_agent is None:
@@ -27,6 +29,7 @@ class GeocodingService:
         )
         self.rate_limit = rate_limit
         self.last_request = 0
+        self.cache = {}  # Simple in-memory cache for geocoding results
 
     def _rate_limit_wait(self):
         current_time = time.time()
@@ -36,29 +39,72 @@ class GeocodingService:
         self.last_request = time.time()
 
     def geocode_location(self, location: str, max_retries: int = 3) -> Optional[Tuple[float, float]]:
+        # Check cache first
+        if location in self.cache:
+            return self.cache[location]
+
         for attempt in range(max_retries):
             try:
                 self._rate_limit_wait()
                 location_data = self.geolocator.geocode(location)
                 if location_data:
-                    return (location_data.latitude, location_data.longitude)
+                    # Store in cache and return
+                    self.cache[location] = (location_data.latitude, location_data.longitude)
+                    return self.cache[location]
+                # Cache None results too
+                self.cache[location] = None
                 return None
             except (GeocoderTimedOut, GeocoderServiceError) as e:
                 if attempt == max_retries - 1:
                     print(f"Failed to geocode '{location}' after {max_retries} attempts: {e}")
+                    self.cache[location] = None
                     return None
                 time.sleep(2 ** attempt)  # Exponential backoff
             except Exception as e:
                 print(f"Error geocoding '{location}': {e}")
+                self.cache[location] = None
                 return None
         return None
 
-    def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:
+    def process_locations(self, locations: str, progress_callback=None) -> List[Optional[Tuple[float, float]]]:
         if pd.isna(locations) or not locations:
             return []
 
         location_list = [loc.strip() for loc in locations.split(',')]
-        return [self.geocode_location(loc) for loc in location_list]
+
+        # Process locations in parallel with a limited number of workers
+        return self.process_locations_parallel(location_list, progress_callback)
+
+    def process_locations_parallel(self, location_list, progress_callback=None, max_workers=4) -> List[Optional[Tuple[float, float]]]:
+        """Process locations in parallel with progress tracking"""
+        results = [None] * len(location_list)
+
+        # Use a ThreadPoolExecutor for parallel processing
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks
+            future_to_index = {executor.submit(self.geocode_location, loc): i
+                               for i, loc in enumerate(location_list)}
+
+            # Process as they complete with progress updates
+            total = len(future_to_index)
+            completed = 0
+
+            for future in concurrent.futures.as_completed(future_to_index):
+                index = future_to_index[future]
+                try:
+                    results[index] = future.result()
+                except Exception as e:
+                    print(f"Error processing location: {e}")
+                    results[index] = None
+
+                # Update progress
+                completed += 1
+                if progress_callback:
+                    progress_callback(completed, total)
+                else:
+                    print(f"Geocoded {completed}/{total} locations")
+
+        return results
 
 # Mapping Functions
 def create_location_map(df: pd.DataFrame,
@@ -105,13 +151,17 @@ def create_location_map(df: pd.DataFrame,
 
     return m
 
-# Processing Functions
-def process_excel(file, places_column):
+# Processing Functions with progress updates
+def process_excel(file, places_column, progress=None):
     # Check if file is None
     if file is None:
         return None, "No file uploaded", None
 
     try:
+        # Update progress
+        if progress:
+            progress(0.1, "Reading Excel file...")
+
         # Handle various file object types that Gradio might provide
         if hasattr(file, 'name'):
             # Gradio file object
@@ -126,11 +176,36 @@ def process_excel(file, places_column):
         if places_column not in df.columns:
            return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
 
+        if progress:
+            progress(0.2, "Initializing geocoding...")
+
         # Initialize the geocoding service
         geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
 
-        # Process locations and add coordinates
-        df['coordinates'] = df[places_column].apply(geocoder.process_locations)
+        # Function to update progress during geocoding
+        def geocoding_progress(completed, total):
+            if progress:
+                # Scale progress between 20% and 80%
+                progress_value = 0.2 + (0.6 * (completed / total))
+                progress(progress_value, f"Geocoding {completed}/{total} locations...")
+
+        # Process locations and add coordinates with progress tracking
+        print("Starting geocoding process...")
+
+        # Process each row with progress updates
+        coordinates_list = []
+        total_rows = len(df)
+
+        for idx, row in df.iterrows():
+            locations = row[places_column]
+            coords = geocoder.process_locations(locations, geocoding_progress)
+            coordinates_list.append(coords)
+            print(f"Processed row {idx+1}/{total_rows}")
+
+        df['coordinates'] = coordinates_list
+
+        if progress:
+            progress(0.8, "Creating map...")
 
         # Create the map
         map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
@@ -140,6 +215,9 @@ def process_excel(file, places_column):
         map_obj.save(temp_map_path)
 
         # Save the processed DataFrame to Excel
+        if progress:
+            progress(0.9, "Saving results...")
+
         processed_file_path = "processed_data.xlsx"
         df.to_excel(processed_file_path, index=False)
 
@@ -152,8 +230,13 @@ def process_excel(file, places_column):
         stats += f"Successfully geocoded: {successful_geocodes}\n"
         stats += f"Failed to geocode: {failed_geocodes}"
 
+        if progress:
+            progress(1.0, "Processing complete!")
+
         return temp_map_path, stats, processed_file_path
     except Exception as e:
+        if progress:
+            progress(1.0, f"Error: {str(e)}")
         return None, f"Error processing file: {str(e)}", None
 
 # NuExtract Functions
@@ -251,16 +334,21 @@ with gr.Blocks() as demo:
             process_btn = gr.Button("Process and Map", variant="primary")
 
         with gr.Column():
+            progress_bar = gr.Progress()
            map_output = gr.HTML(label="Map Visualization")
            stats_output = gr.Textbox(label="Statistics", lines=3)
            processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
 
-    def process_and_map(file, column):
+    def process_and_map(file, column, progress=gr.Progress()):
        if file is None:
            return None, "Please upload an Excel file", None
 
        try:
-            map_path, stats, processed_path = process_excel(file, column)
+            # Initialize progress
+            progress(0, "Starting process...")
+
+            # Process the file with progress updates
+            map_path, stats, processed_path = process_excel(file, column, progress)
 
            if map_path and processed_path:
                with open(map_path, "r") as f:
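
For reference, a minimal usage sketch of the updated GeocodingService outside the Gradio UI, assuming geopy is installed and app.py can be imported (or its class definitions pasted into a session) without launching the demo; the module name app, the report callback, and the sample place names are illustrative and not part of the commit:

from app import GeocodingService  # assumption: importing app.py does not block on demo.launch()

def report(completed: int, total: int) -> None:
    # Stand-in for the (completed, total) callback that process_excel wires to gr.Progress
    print(f"geocoded {completed}/{total}")

geocoder = GeocodingService(user_agent="geocode_cache_demo")

# Unique place names are geocoded in parallel worker threads; a name already
# in self.cache is returned without another Nominatim request on later lookups.
coords = geocoder.process_locations("Vienna, Graz, Vienna", progress_callback=report)
print(coords)  # list of (lat, lon) tuples or None per place, in input order

One caveat worth noting: _rate_limit_wait reads and writes self.last_request without a lock, so with max_workers=4 the effective request rate to Nominatim can briefly exceed the configured rate_limit of 1.1 seconds between requests.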