oberbics committed on
Commit
5f830c6
Β·
verified Β·
1 Parent(s): df1519d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -202
app.py CHANGED
@@ -10,14 +10,12 @@ import time
10
  import random
11
  from typing import List, Tuple, Optional
12
  import io
13
- import concurrent.futures
14
- from tqdm import tqdm
15
 
16
  # NuExtract API configuration
17
  API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
18
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
19
 
20
- # Geocoding Service with improved performance
21
  class GeocodingService:
22
  def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
23
  if user_agent is None:
@@ -29,7 +27,7 @@ class GeocodingService:
29
  )
30
  self.rate_limit = rate_limit
31
  self.last_request = 0
32
- self.cache = {} # Simple in-memory cache for geocoding results
33
 
34
  def _rate_limit_wait(self):
35
  current_time = time.time()
@@ -66,74 +64,30 @@ class GeocodingService:
66
  return None
67
  return None
68
 
69
- def process_locations(self, locations: str, progress_callback=None) -> List[Optional[Tuple[float, float]]]:
70
  if pd.isna(locations) or not locations:
71
  return []
72
 
73
- # Handle special case with "dateline_locations" prefix
74
- if "dateline_locations" in locations:
75
- # Remove the prefix if present
76
- locations = locations.replace("dateline_locations", "").strip()
77
-
78
- # Improved location parsing to handle complex location names with commas
79
- # This regex-based approach attempts to identify well-formed location patterns
80
  try:
 
81
  import re
82
-
83
- # Try to find patterns like "City, Country" or standalone names
84
- # This handles cities like "Paris, France" as single entities
85
- location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+)?)')
86
- matches = location_pattern.findall(locations)
87
-
88
- # Filter out empty matches and strip whitespace
89
  location_list = [match.strip() for match in matches if match.strip()]
90
 
91
- # If regex didn't work properly, fall back to a simpler approach
92
  if not location_list:
93
- # Simple space-based splitting as a fallback
94
- location_list = [loc.strip() for loc in locations.split() if loc.strip()]
95
- print(f"Using fallback location parsing: {location_list}")
96
 
97
- except Exception as e:
98
- print(f"Error parsing locations: {e}, using simple splitting")
99
- # Simple fallback if regex fails
100
- location_list = [loc.strip() for loc in locations.split() if loc.strip()]
101
-
102
- print(f"Parsed locations: {location_list}")
103
-
104
- # Process locations in parallel with a limited number of workers
105
- return self.process_locations_parallel(location_list, progress_callback)
106
-
107
- def process_locations_parallel(self, location_list, progress_callback=None, max_workers=4) -> List[Optional[Tuple[float, float]]]:
108
- """Process locations in parallel with progress tracking"""
109
- results = [None] * len(location_list)
110
-
111
- # Use a ThreadPoolExecutor for parallel processing
112
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
113
- # Submit all tasks
114
- future_to_index = {executor.submit(self.geocode_location, loc): i
115
- for i, loc in enumerate(location_list)}
116
-
117
- # Process as they complete with progress updates
118
- total = len(future_to_index)
119
- completed = 0
120
-
121
- for future in concurrent.futures.as_completed(future_to_index):
122
- index = future_to_index[future]
123
- try:
124
- results[index] = future.result()
125
- except Exception as e:
126
- print(f"Error processing location: {e}")
127
- results[index] = None
128
 
129
- # Update progress
130
- completed += 1
131
- if progress_callback:
132
- progress_callback(completed, total)
133
- else:
134
- print(f"Geocoded {completed}/{total} locations")
135
-
136
- return results
137
 
138
  # Mapping Functions
139
  def create_location_map(df: pd.DataFrame,
@@ -147,24 +101,35 @@ def create_location_map(df: pd.DataFrame,
147
  # Process each row in the DataFrame
148
  for idx, row in df.iterrows():
149
  coordinates = row[coordinates_col]
150
- places = row[places_col].split(',') if pd.notna(row[places_col]) else []
151
  title = row[title_col] if title_col and pd.notna(row[title_col]) else None
152
 
153
  # Skip if no coordinates
154
  if not coordinates:
155
  continue
156
 
157
- # Make sure places and coordinates lists have the same length
158
- # If places list is shorter, pad it with unnamed locations
 
 
 
 
 
 
 
159
  while len(places) < len(coordinates):
160
- places.append(f"Unnamed Location {len(places)+1}")
161
 
162
- # Add individual markers for each location
163
  for i, coord in enumerate(coordinates):
164
  if coord is not None: # Skip None coordinates
165
  lat, lon = coord
166
- # Safely get place name, use a default if index is out of range
167
- place_name = places[i].strip() if i < len(places) else f"Location {i+1}"
 
 
 
 
168
 
169
  # Create popup content
170
  popup_content = f"<b>{place_name}</b>"
@@ -186,17 +151,13 @@ def create_location_map(df: pd.DataFrame,
186
 
187
  return m
188
 
189
- # Processing Functions with progress updates
190
- def process_excel(file, places_column, progress=None):
191
  # Check if file is None
192
  if file is None:
193
  return None, "No file uploaded", None
194
 
195
  try:
196
- # Update progress
197
- if progress:
198
- progress(0.1, "Reading Excel file...")
199
-
200
  # Handle various file object types that Gradio might provide
201
  if hasattr(file, 'name'):
202
  # Gradio file object
@@ -208,112 +169,22 @@ def process_excel(file, places_column, progress=None):
208
  # Assume it's a filepath string
209
  df = pd.read_excel(file)
210
 
 
 
 
 
211
  if places_column not in df.columns:
212
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
213
 
214
- # Print column names and first few rows for debugging
215
- print(f"Columns in Excel file: {df.columns.tolist()}")
216
- print(f"First 3 rows of data:\n{df.head(3)}")
217
-
218
- if progress:
219
- progress(0.2, "Initializing geocoding...")
220
-
221
  # Initialize the geocoding service
222
  geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
223
 
224
- # Function to update progress during geocoding
225
- def geocoding_progress(completed, total):
226
- if progress:
227
- # Scale progress between 20% and 80%
228
- progress_value = 0.2 + (0.6 * (completed / total))
229
- progress(progress_value, f"Geocoding {completed}/{total} locations...")
230
-
231
- # Process locations and add coordinates with progress tracking
232
- print("Starting geocoding process...")
233
-
234
- # Process each row with progress updates
235
- coordinates_list = []
236
- total_rows = len(df)
237
 
238
- # Create a helper function to safely parse location data from each row
239
- def parse_excel_locations(location_data):
240
- """Safely parse location data from Excel cell"""
241
- if pd.isna(location_data):
242
- return []
243
-
244
- # Convert to string to handle numeric or other data types
245
- location_data = str(location_data).strip()
246
-
247
- # Skip empty strings
248
- if not location_data:
249
- return []
250
-
251
- # Look for recognized patterns and split accordingly
252
- # First, check if it's a comma-separated list
253
- if "," in location_data:
254
- # This could be a list like "Berlin, Hamburg, Munich"
255
- # Or it could contain locations like "Paris, France"
256
-
257
- # Try to intelligently parse based on common patterns
258
- try:
259
- import re
260
-
261
- # Pattern to match city-country pairs or standalone names
262
- # Examples: "Paris, France" or "Berlin" or "New York, NY, USA"
263
- location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+){0,2})')
264
- matches = location_pattern.findall(location_data)
265
-
266
- locations = [match.strip() for match in matches if match.strip()]
267
-
268
- # If our pattern matching didn't work, fall back to simple comma splitting
269
- if not locations:
270
- locations = [loc.strip() for loc in location_data.split(',') if loc.strip()]
271
-
272
- return locations
273
-
274
- except Exception as e:
275
- print(f"Regex parsing failed: {e}")
276
- # Fallback to simple comma splitting
277
- return [loc.strip() for loc in location_data.split(',') if loc.strip()]
278
-
279
- # Otherwise, treat it as a single location or space-separated list
280
- else:
281
- # Check if it might be space-separated
282
- potential_locations = location_data.split()
283
-
284
- # If it just looks like one word with no spaces, return it as a single location
285
- if len(potential_locations) == 1:
286
- return [location_data]
287
-
288
- # If it has multiple words, it could be a single location name with spaces
289
- # or multiple space-separated locations
290
- # For safety, treat it as a single location
291
- return [location_data]
292
-
293
- for idx, row in df.iterrows():
294
- location_data = row[places_column]
295
- print(f"Processing row {idx+1}/{total_rows}, location data: {location_data}")
296
-
297
- # Parse the locations from the Excel cell
298
- location_list = parse_excel_locations(location_data)
299
- print(f"Parsed locations: {location_list}")
300
-
301
- # Now geocode each location
302
- coords = []
303
- for location in location_list:
304
- coord = geocoder.geocode_location(location)
305
- coords.append(coord)
306
- # Update progress
307
- if progress_callback:
308
- progress_callback(len(coords), len(location_list))
309
-
310
- coordinates_list.append(coords)
311
- print(f"Processed row {idx+1}/{total_rows}, found coordinates: {coords}")
312
-
313
- df['coordinates'] = coordinates_list
314
-
315
- if progress:
316
- progress(0.8, "Creating map...")
317
 
318
  # Create the map
319
  map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
@@ -323,33 +194,24 @@ def process_excel(file, places_column, progress=None):
323
  map_obj.save(temp_map_path)
324
 
325
  # Save the processed DataFrame to Excel
326
- if progress:
327
- progress(0.9, "Saving results...")
328
-
329
  processed_file_path = "processed_data.xlsx"
330
  df.to_excel(processed_file_path, index=False)
331
 
332
  # Statistics
333
  total_locations = len(df)
334
- successful_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is not None)
335
- failed_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is None)
336
 
337
  stats = f"Total data rows: {total_locations}\n"
338
  stats += f"Successfully geocoded locations: {successful_geocodes}\n"
339
  stats += f"Failed to geocode locations: {failed_geocodes}"
340
 
341
- if progress:
342
- progress(1.0, "Processing complete!")
343
-
344
  return temp_map_path, stats, processed_file_path
345
  except Exception as e:
346
  import traceback
347
- error_details = traceback.format_exc()
348
- print(f"Error processing Excel file: {error_details}")
349
-
350
- if progress:
351
- progress(1.0, f"Error: {str(e)}")
352
- return None, f"Error processing file: {str(e)}\n\nDetails: {error_details}", None
353
 
354
  # NuExtract Functions
355
  def extract_info(template, text):
@@ -381,23 +243,23 @@ def extract_info(template, text):
381
  # Process result
382
  result = response.json()
383
 
384
- # Handle different response formats with careful error handling
385
  try:
386
  if isinstance(result, list):
387
  if len(result) > 0:
388
  result_text = result[0].get("generated_text", "")
389
  else:
390
- return "❌ Empty result list from API", "{}"
391
  else:
392
  result_text = str(result)
393
 
394
  # Split at output marker if present
395
  if "<|output|>" in result_text:
396
- split_parts = result_text.split("<|output|>")
397
- if len(split_parts) > 1:
398
- json_text = split_parts[1].strip()
399
  else:
400
- json_text = result_text # Fallback if split didn't work as expected
401
  else:
402
  json_text = result_text
403
 
@@ -410,7 +272,7 @@ def extract_info(template, text):
410
 
411
  return "βœ… Success", formatted
412
  except Exception as inner_e:
413
- return f"❌ Error processing API result: {str(inner_e)}", "{}"
414
  except Exception as e:
415
  return f"❌ Error: {str(e)}", "{}"
416
 
@@ -456,21 +318,16 @@ with gr.Blocks() as demo:
456
  process_btn = gr.Button("Process and Map", variant="primary")
457
 
458
  with gr.Column():
459
- progress_bar = gr.Progress()
460
  map_output = gr.HTML(label="Map Visualization")
461
  stats_output = gr.Textbox(label="Statistics", lines=3)
462
  processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
463
 
464
- def process_and_map(file, column, progress=gr.Progress()):
465
  if file is None:
466
  return None, "Please upload an Excel file", None
467
 
468
  try:
469
- # Initialize progress
470
- progress(0, "Starting process...")
471
-
472
- # Process the file with progress updates
473
- map_path, stats, processed_path = process_excel(file, column, progress)
474
 
475
  if map_path and processed_path:
476
  with open(map_path, "r") as f:
@@ -480,6 +337,9 @@ with gr.Blocks() as demo:
480
  else:
481
  return None, stats, None
482
  except Exception as e:
 
 
 
483
  return None, f"Error: {str(e)}", None
484
 
485
  process_btn.click(
 
10
  import random
11
  from typing import List, Tuple, Optional
12
  import io
 
 
13
 
14
  # NuExtract API configuration
15
  API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
16
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
17
 
18
+ # Geocoding Service
19
  class GeocodingService:
20
  def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
21
  if user_agent is None:
 
27
  )
28
  self.rate_limit = rate_limit
29
  self.last_request = 0
30
+ self.cache = {} # Simple in-memory cache
31
 
32
  def _rate_limit_wait(self):
33
  current_time = time.time()
 
64
  return None
65
  return None
66
 
67
+ def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:
68
  if pd.isna(locations) or not locations:
69
  return []
70
 
 
 
 
 
 
 
 
71
  try:
72
+ # First try to intelligently parse
73
  import re
74
+ pattern = r"([^,]+(?:,\s*[A-Za-z]+)?)"
75
+ matches = re.findall(pattern, locations)
 
 
 
 
 
76
  location_list = [match.strip() for match in matches if match.strip()]
77
 
78
+ # If regex finds nothing, fall back to simple comma splitting
79
  if not location_list:
80
+ location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
 
 
81
 
82
+ # For debugging
83
+ print(f"Parsed '{locations}' into: {location_list}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ return [self.geocode_location(loc) for loc in location_list]
86
+ except Exception as e:
87
+ print(f"Error parsing locations '{locations}': {e}")
88
+ # Fall back to simple method
89
+ location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
90
+ return [self.geocode_location(loc) for loc in location_list]
 
 
91
 
92
  # Mapping Functions
93
  def create_location_map(df: pd.DataFrame,
 
101
  # Process each row in the DataFrame
102
  for idx, row in df.iterrows():
103
  coordinates = row[coordinates_col]
104
+ places_text = row[places_col] if pd.notna(row[places_col]) else ""
105
  title = row[title_col] if title_col and pd.notna(row[title_col]) else None
106
 
107
  # Skip if no coordinates
108
  if not coordinates:
109
  continue
110
 
111
+ # Parse places into a list
112
+ try:
113
+ places = [p.strip() for p in places_text.split(',') if p.strip()]
114
+ except:
115
+ # Fall back to treating it as a single place if splitting fails
116
+ places = [places_text] if places_text else []
117
+
118
+ # Ensure places and coordinates have compatible lengths
119
+ # If places is shorter, add placeholder names
120
  while len(places) < len(coordinates):
121
+ places.append(f"Location {len(places) + 1}")
122
 
123
+ # Add markers for each coordinate
124
  for i, coord in enumerate(coordinates):
125
  if coord is not None: # Skip None coordinates
126
  lat, lon = coord
127
+
128
+ # Get place name safely
129
+ if i < len(places):
130
+ place_name = places[i]
131
+ else:
132
+ place_name = f"Location {i + 1}"
133
 
134
  # Create popup content
135
  popup_content = f"<b>{place_name}</b>"
 
151
 
152
  return m
153
 
154
+ # Processing Functions
155
+ def process_excel(file, places_column):
156
  # Check if file is None
157
  if file is None:
158
  return None, "No file uploaded", None
159
 
160
  try:
 
 
 
 
161
  # Handle various file object types that Gradio might provide
162
  if hasattr(file, 'name'):
163
  # Gradio file object
 
169
  # Assume it's a filepath string
170
  df = pd.read_excel(file)
171
 
172
+ # Print column names for debugging
173
+ print(f"Columns in Excel file: {list(df.columns)}")
174
+ print(f"Preview of data:\n{df.head(2)}")
175
+
176
  if places_column not in df.columns:
177
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
178
 
 
 
 
 
 
 
 
179
  # Initialize the geocoding service
180
  geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
181
 
182
+ # Process locations and add coordinates
183
+ print(f"Processing locations from column: {places_column}")
184
+ print(f"First few values: {df[places_column].head().tolist()}")
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # Apply geocoding to each row
187
+ df['coordinates'] = df[places_column].apply(geocoder.process_locations)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  # Create the map
190
  map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
 
194
  map_obj.save(temp_map_path)
195
 
196
  # Save the processed DataFrame to Excel
 
 
 
197
  processed_file_path = "processed_data.xlsx"
198
  df.to_excel(processed_file_path, index=False)
199
 
200
  # Statistics
201
  total_locations = len(df)
202
+ successful_geocodes = sum(1 for row in df['coordinates'] for coord in row if coord is not None)
203
+ failed_geocodes = sum(1 for row in df['coordinates'] for coord in row if coord is None)
204
 
205
  stats = f"Total data rows: {total_locations}\n"
206
  stats += f"Successfully geocoded locations: {successful_geocodes}\n"
207
  stats += f"Failed to geocode locations: {failed_geocodes}"
208
 
 
 
 
209
  return temp_map_path, stats, processed_file_path
210
  except Exception as e:
211
  import traceback
212
+ trace = traceback.format_exc()
213
+ print(f"Error processing file: {e}\n{trace}")
214
+ return None, f"Error processing file: {str(e)}", None
 
 
 
215
 
216
  # NuExtract Functions
217
  def extract_info(template, text):
 
243
  # Process result
244
  result = response.json()
245
 
246
+ # Handle different response formats
247
  try:
248
  if isinstance(result, list):
249
  if len(result) > 0:
250
  result_text = result[0].get("generated_text", "")
251
  else:
252
+ return "❌ Empty result list", "{}"
253
  else:
254
  result_text = str(result)
255
 
256
  # Split at output marker if present
257
  if "<|output|>" in result_text:
258
+ parts = result_text.split("<|output|>")
259
+ if len(parts) > 1:
260
+ json_text = parts[1].strip()
261
  else:
262
+ json_text = result_text
263
  else:
264
  json_text = result_text
265
 
 
272
 
273
  return "βœ… Success", formatted
274
  except Exception as inner_e:
275
+ return f"❌ Error processing result: {str(inner_e)}", "{}"
276
  except Exception as e:
277
  return f"❌ Error: {str(e)}", "{}"
278
 
 
318
  process_btn = gr.Button("Process and Map", variant="primary")
319
 
320
  with gr.Column():
 
321
  map_output = gr.HTML(label="Map Visualization")
322
  stats_output = gr.Textbox(label="Statistics", lines=3)
323
  processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
324
 
325
+ def process_and_map(file, column):
326
  if file is None:
327
  return None, "Please upload an Excel file", None
328
 
329
  try:
330
+ map_path, stats, processed_path = process_excel(file, column)
 
 
 
 
331
 
332
  if map_path and processed_path:
333
  with open(map_path, "r") as f:
 
337
  else:
338
  return None, stats, None
339
  except Exception as e:
340
+ import traceback
341
+ trace = traceback.format_exc()
342
+ print(f"Error in process_and_map: {e}\n{trace}")
343
  return None, f"Error: {str(e)}", None
344
 
345
  process_btn.click(