oberbics committed on
Commit
df1519d
Β·
verified Β·
1 Parent(s): 65988e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -29
app.py CHANGED
@@ -70,8 +70,37 @@ class GeocodingService:
70
  if pd.isna(locations) or not locations:
71
  return []
72
 
73
- location_list = [loc.strip() for loc in locations.split(',')]
74
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  # Process locations in parallel with a limited number of workers
76
  return self.process_locations_parallel(location_list, progress_callback)
77
 
@@ -182,6 +211,10 @@ def process_excel(file, places_column, progress=None):
182
  if places_column not in df.columns:
183
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
184
 
 
 
 
 
185
  if progress:
186
  progress(0.2, "Initializing geocoding...")
187
 
@@ -202,11 +235,80 @@ def process_excel(file, places_column, progress=None):
202
  coordinates_list = []
203
  total_rows = len(df)
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  for idx, row in df.iterrows():
206
- locations = row[places_column]
207
- coords = geocoder.process_locations(locations, geocoding_progress)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  coordinates_list.append(coords)
209
- print(f"Processed row {idx+1}/{total_rows}")
210
 
211
  df['coordinates'] = coordinates_list
212
 
@@ -229,21 +331,25 @@ def process_excel(file, places_column, progress=None):
229
 
230
  # Statistics
231
  total_locations = len(df)
232
- successful_geocodes = df['coordinates'].apply(lambda x: len([c for c in x if c is not None])).sum()
233
- failed_geocodes = df['coordinates'].apply(lambda x: len([c for c in x if c is None])).sum()
234
 
235
- stats = f"Total locations: {total_locations}\n"
236
- stats += f"Successfully geocoded: {successful_geocodes}\n"
237
- stats += f"Failed to geocode: {failed_geocodes}"
238
 
239
  if progress:
240
  progress(1.0, "Processing complete!")
241
 
242
  return temp_map_path, stats, processed_file_path
243
  except Exception as e:
 
 
 
 
244
  if progress:
245
  progress(1.0, f"Error: {str(e)}")
246
- return None, f"Error processing file: {str(e)}", None
247
 
248
  # NuExtract Functions
249
  def extract_info(template, text):
@@ -275,26 +381,36 @@ def extract_info(template, text):
275
  # Process result
276
  result = response.json()
277
 
278
- # Handle different response formats
279
- if isinstance(result, list) and len(result) > 0:
280
- result_text = result[0].get("generated_text", "")
281
- else:
282
- result_text = str(result)
283
-
284
- # Split at output marker if present
285
- if "<|output|>" in result_text:
286
- json_text = result_text.split("<|output|>")[1].strip()
287
- else:
288
- json_text = result_text
289
-
290
- # Try to parse as JSON
291
  try:
292
- extracted = json.loads(json_text)
293
- formatted = json.dumps(extracted, indent=2)
294
- except json.JSONDecodeError:
295
- return "❌ JSON parsing error", json_text
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
- return "βœ… Success", formatted
 
 
 
 
 
 
 
 
 
298
  except Exception as e:
299
  return f"❌ Error: {str(e)}", "{}"
300
 
 
70
  if pd.isna(locations) or not locations:
71
  return []
72
 
73
+ # Handle special case with "dateline_locations" prefix
74
+ if "dateline_locations" in locations:
75
+ # Remove the prefix if present
76
+ locations = locations.replace("dateline_locations", "").strip()
77
+
78
+ # Improved location parsing to handle complex location names with commas
79
+ # This regex-based approach attempts to identify well-formed location patterns
80
+ try:
81
+ import re
82
+
83
+ # Try to find patterns like "City, Country" or standalone names
84
+ # This handles cities like "Paris, France" as single entities
85
+ location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+)?)')
86
+ matches = location_pattern.findall(locations)
87
+
88
+ # Filter out empty matches and strip whitespace
89
+ location_list = [match.strip() for match in matches if match.strip()]
90
+
91
+ # If regex didn't work properly, fall back to a simpler approach
92
+ if not location_list:
93
+ # Simple space-based splitting as a fallback
94
+ location_list = [loc.strip() for loc in locations.split() if loc.strip()]
95
+ print(f"Using fallback location parsing: {location_list}")
96
+
97
+ except Exception as e:
98
+ print(f"Error parsing locations: {e}, using simple splitting")
99
+ # Simple fallback if regex fails
100
+ location_list = [loc.strip() for loc in locations.split() if loc.strip()]
101
+
102
+ print(f"Parsed locations: {location_list}")
103
+
104
  # Process locations in parallel with a limited number of workers
105
  return self.process_locations_parallel(location_list, progress_callback)
106
 
 
211
  if places_column not in df.columns:
212
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
213
 
214
+ # Print column names and first few rows for debugging
215
+ print(f"Columns in Excel file: {df.columns.tolist()}")
216
+ print(f"First 3 rows of data:\n{df.head(3)}")
217
+
218
  if progress:
219
  progress(0.2, "Initializing geocoding...")
220
 
 
235
  coordinates_list = []
236
  total_rows = len(df)
237
 
238
+ # Create a helper function to safely parse location data from each row
239
+ def parse_excel_locations(location_data):
240
+ """Safely parse location data from Excel cell"""
241
+ if pd.isna(location_data):
242
+ return []
243
+
244
+ # Convert to string to handle numeric or other data types
245
+ location_data = str(location_data).strip()
246
+
247
+ # Skip empty strings
248
+ if not location_data:
249
+ return []
250
+
251
+ # Look for recognized patterns and split accordingly
252
+ # First, check if it's a comma-separated list
253
+ if "," in location_data:
254
+ # This could be a list like "Berlin, Hamburg, Munich"
255
+ # Or it could contain locations like "Paris, France"
256
+
257
+ # Try to intelligently parse based on common patterns
258
+ try:
259
+ import re
260
+
261
+ # Pattern to match city-country pairs or standalone names
262
+ # Examples: "Paris, France" or "Berlin" or "New York, NY, USA"
263
+ location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+){0,2})')
264
+ matches = location_pattern.findall(location_data)
265
+
266
+ locations = [match.strip() for match in matches if match.strip()]
267
+
268
+ # If our pattern matching didn't work, fall back to simple comma splitting
269
+ if not locations:
270
+ locations = [loc.strip() for loc in location_data.split(',') if loc.strip()]
271
+
272
+ return locations
273
+
274
+ except Exception as e:
275
+ print(f"Regex parsing failed: {e}")
276
+ # Fallback to simple comma splitting
277
+ return [loc.strip() for loc in location_data.split(',') if loc.strip()]
278
+
279
+ # Otherwise, treat it as a single location or space-separated list
280
+ else:
281
+ # Check if it might be space-separated
282
+ potential_locations = location_data.split()
283
+
284
+ # If it just looks like one word with no spaces, return it as a single location
285
+ if len(potential_locations) == 1:
286
+ return [location_data]
287
+
288
+ # If it has multiple words, it could be a single location name with spaces
289
+ # or multiple space-separated locations
290
+ # For safety, treat it as a single location
291
+ return [location_data]
292
+
293
  for idx, row in df.iterrows():
294
+ location_data = row[places_column]
295
+ print(f"Processing row {idx+1}/{total_rows}, location data: {location_data}")
296
+
297
+ # Parse the locations from the Excel cell
298
+ location_list = parse_excel_locations(location_data)
299
+ print(f"Parsed locations: {location_list}")
300
+
301
+ # Now geocode each location
302
+ coords = []
303
+ for location in location_list:
304
+ coord = geocoder.geocode_location(location)
305
+ coords.append(coord)
306
+ # Update progress
307
+ if progress_callback:
308
+ progress_callback(len(coords), len(location_list))
309
+
310
  coordinates_list.append(coords)
311
+ print(f"Processed row {idx+1}/{total_rows}, found coordinates: {coords}")
312
 
313
  df['coordinates'] = coordinates_list
314
 
 
331
 
332
  # Statistics
333
  total_locations = len(df)
334
+ successful_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is not None)
335
+ failed_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is None)
336
 
337
+ stats = f"Total data rows: {total_locations}\n"
338
+ stats += f"Successfully geocoded locations: {successful_geocodes}\n"
339
+ stats += f"Failed to geocode locations: {failed_geocodes}"
340
 
341
  if progress:
342
  progress(1.0, "Processing complete!")
343
 
344
  return temp_map_path, stats, processed_file_path
345
  except Exception as e:
346
+ import traceback
347
+ error_details = traceback.format_exc()
348
+ print(f"Error processing Excel file: {error_details}")
349
+
350
  if progress:
351
  progress(1.0, f"Error: {str(e)}")
352
+ return None, f"Error processing file: {str(e)}\n\nDetails: {error_details}", None
353
 
354
  # NuExtract Functions
355
  def extract_info(template, text):
 
381
  # Process result
382
  result = response.json()
383
 
384
+ # Handle different response formats with careful error handling
 
 
 
 
 
 
 
 
 
 
 
 
385
  try:
386
+ if isinstance(result, list):
387
+ if len(result) > 0:
388
+ result_text = result[0].get("generated_text", "")
389
+ else:
390
+ return "❌ Empty result list from API", "{}"
391
+ else:
392
+ result_text = str(result)
393
+
394
+ # Split at output marker if present
395
+ if "<|output|>" in result_text:
396
+ split_parts = result_text.split("<|output|>")
397
+ if len(split_parts) > 1:
398
+ json_text = split_parts[1].strip()
399
+ else:
400
+ json_text = result_text # Fallback if split didn't work as expected
401
+ else:
402
+ json_text = result_text
403
 
404
+ # Try to parse as JSON
405
+ try:
406
+ extracted = json.loads(json_text)
407
+ formatted = json.dumps(extracted, indent=2)
408
+ except json.JSONDecodeError:
409
+ return "❌ JSON parsing error", json_text
410
+
411
+ return "βœ… Success", formatted
412
+ except Exception as inner_e:
413
+ return f"❌ Error processing API result: {str(inner_e)}", "{}"
414
  except Exception as e:
415
  return f"❌ Error: {str(e)}", "{}"
416