Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -70,8 +70,37 @@ class GeocodingService:
|
|
70 |
if pd.isna(locations) or not locations:
|
71 |
return []
|
72 |
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
# Process locations in parallel with a limited number of workers
|
76 |
return self.process_locations_parallel(location_list, progress_callback)
|
77 |
|
@@ -182,6 +211,10 @@ def process_excel(file, places_column, progress=None):
|
|
182 |
if places_column not in df.columns:
|
183 |
return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
|
184 |
|
|
|
|
|
|
|
|
|
185 |
if progress:
|
186 |
progress(0.2, "Initializing geocoding...")
|
187 |
|
@@ -202,11 +235,80 @@ def process_excel(file, places_column, progress=None):
|
|
202 |
coordinates_list = []
|
203 |
total_rows = len(df)
|
204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
for idx, row in df.iterrows():
|
206 |
-
|
207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
coordinates_list.append(coords)
|
209 |
-
print(f"Processed row {idx+1}/{total_rows}")
|
210 |
|
211 |
df['coordinates'] = coordinates_list
|
212 |
|
@@ -229,21 +331,25 @@ def process_excel(file, places_column, progress=None):
|
|
229 |
|
230 |
# Statistics
|
231 |
total_locations = len(df)
|
232 |
-
successful_geocodes =
|
233 |
-
failed_geocodes =
|
234 |
|
235 |
-
stats = f"Total
|
236 |
-
stats += f"Successfully geocoded: {successful_geocodes}\n"
|
237 |
-
stats += f"Failed to geocode: {failed_geocodes}"
|
238 |
|
239 |
if progress:
|
240 |
progress(1.0, "Processing complete!")
|
241 |
|
242 |
return temp_map_path, stats, processed_file_path
|
243 |
except Exception as e:
|
|
|
|
|
|
|
|
|
244 |
if progress:
|
245 |
progress(1.0, f"Error: {str(e)}")
|
246 |
-
return None, f"Error processing file: {str(e)}", None
|
247 |
|
248 |
# NuExtract Functions
|
249 |
def extract_info(template, text):
|
@@ -275,26 +381,36 @@ def extract_info(template, text):
|
|
275 |
# Process result
|
276 |
result = response.json()
|
277 |
|
278 |
-
# Handle different response formats
|
279 |
-
if isinstance(result, list) and len(result) > 0:
|
280 |
-
result_text = result[0].get("generated_text", "")
|
281 |
-
else:
|
282 |
-
result_text = str(result)
|
283 |
-
|
284 |
-
# Split at output marker if present
|
285 |
-
if "<|output|>" in result_text:
|
286 |
-
json_text = result_text.split("<|output|>")[1].strip()
|
287 |
-
else:
|
288 |
-
json_text = result_text
|
289 |
-
|
290 |
-
# Try to parse as JSON
|
291 |
try:
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
|
297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
except Exception as e:
|
299 |
return f"β Error: {str(e)}", "{}"
|
300 |
|
|
|
70 |
if pd.isna(locations) or not locations:
|
71 |
return []
|
72 |
|
73 |
+
# Handle special case with "dateline_locations" prefix
|
74 |
+
if "dateline_locations" in locations:
|
75 |
+
# Remove the prefix if present
|
76 |
+
locations = locations.replace("dateline_locations", "").strip()
|
77 |
+
|
78 |
+
# Improved location parsing to handle complex location names with commas
|
79 |
+
# This regex-based approach attempts to identify well-formed location patterns
|
80 |
+
try:
|
81 |
+
import re
|
82 |
+
|
83 |
+
# Try to find patterns like "City, Country" or standalone names
|
84 |
+
# This handles cities like "Paris, France" as single entities
|
85 |
+
location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+)?)')
|
86 |
+
matches = location_pattern.findall(locations)
|
87 |
+
|
88 |
+
# Filter out empty matches and strip whitespace
|
89 |
+
location_list = [match.strip() for match in matches if match.strip()]
|
90 |
+
|
91 |
+
# If regex didn't work properly, fall back to a simpler approach
|
92 |
+
if not location_list:
|
93 |
+
# Simple space-based splitting as a fallback
|
94 |
+
location_list = [loc.strip() for loc in locations.split() if loc.strip()]
|
95 |
+
print(f"Using fallback location parsing: {location_list}")
|
96 |
+
|
97 |
+
except Exception as e:
|
98 |
+
print(f"Error parsing locations: {e}, using simple splitting")
|
99 |
+
# Simple fallback if regex fails
|
100 |
+
location_list = [loc.strip() for loc in locations.split() if loc.strip()]
|
101 |
+
|
102 |
+
print(f"Parsed locations: {location_list}")
|
103 |
+
|
104 |
# Process locations in parallel with a limited number of workers
|
105 |
return self.process_locations_parallel(location_list, progress_callback)
|
106 |
|
|
|
211 |
if places_column not in df.columns:
|
212 |
return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
|
213 |
|
214 |
+
# Print column names and first few rows for debugging
|
215 |
+
print(f"Columns in Excel file: {df.columns.tolist()}")
|
216 |
+
print(f"First 3 rows of data:\n{df.head(3)}")
|
217 |
+
|
218 |
if progress:
|
219 |
progress(0.2, "Initializing geocoding...")
|
220 |
|
|
|
235 |
coordinates_list = []
|
236 |
total_rows = len(df)
|
237 |
|
238 |
+
# Create a helper function to safely parse location data from each row
|
239 |
+
def parse_excel_locations(location_data):
    """Split the contents of one Excel cell into a list of location strings.

    Args:
        location_data: Raw cell value. Any type is accepted; non-missing
            values are converted with ``str()`` before parsing.

    Returns:
        A list of stripped, non-empty location strings:

        * ``[]`` when the cell is missing (``pd.isna``) or blank.
        * ``[text]`` when the cell contains no comma; a multi-word value
          is kept as one location rather than split on spaces.
        * Otherwise the comma-separated parts, grouped left to right so
          that each location keeps up to two trailing qualifiers
          (e.g. ``"New York, NY, USA"`` stays a single entry).
    """
    # Function-scope import: only part of the file is visible here, so a
    # module-level ``import re`` cannot be assumed.
    import re

    if pd.isna(location_data):
        return []

    # Convert to string to handle numeric or other data types.
    location_data = str(location_data).strip()
    if not location_data:
        return []

    # No comma: treat the whole cell as one location, even when it has
    # several words, since those are more likely a single place name.
    if "," not in location_data:
        return [location_data]

    # A name followed by up to two ", qualifier" parts. The name is matched
    # as "anything but a comma" rather than ASCII letters only, so accented
    # letters, hyphens, apostrophes, periods and digits no longer break a
    # name apart ("São Paulo", "Saint-Denis", "St. Louis").
    location_pattern = re.compile(r'([^,]+(?:,\s*[^,]+){0,2})')
    matches = location_pattern.findall(location_data)

    # Whitespace-only matches (e.g. from a trailing ", ") are dropped. If
    # nothing survives, a plain comma split would be empty too, so no
    # separate fallback is needed.
    return [match.strip() for match in matches if match.strip()]
|
292 |
+
|
293 |
for idx, row in df.iterrows():
|
294 |
+
location_data = row[places_column]
|
295 |
+
print(f"Processing row {idx+1}/{total_rows}, location data: {location_data}")
|
296 |
+
|
297 |
+
# Parse the locations from the Excel cell
|
298 |
+
location_list = parse_excel_locations(location_data)
|
299 |
+
print(f"Parsed locations: {location_list}")
|
300 |
+
|
301 |
+
# Now geocode each location
|
302 |
+
coords = []
|
303 |
+
for location in location_list:
|
304 |
+
coord = geocoder.geocode_location(location)
|
305 |
+
coords.append(coord)
|
306 |
+
# Update progress
|
307 |
+
if progress_callback:
|
308 |
+
progress_callback(len(coords), len(location_list))
|
309 |
+
|
310 |
coordinates_list.append(coords)
|
311 |
+
print(f"Processed row {idx+1}/{total_rows}, found coordinates: {coords}")
|
312 |
|
313 |
df['coordinates'] = coordinates_list
|
314 |
|
|
|
331 |
|
332 |
# Statistics
|
333 |
total_locations = len(df)
|
334 |
+
successful_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is not None)
|
335 |
+
failed_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is None)
|
336 |
|
337 |
+
stats = f"Total data rows: {total_locations}\n"
|
338 |
+
stats += f"Successfully geocoded locations: {successful_geocodes}\n"
|
339 |
+
stats += f"Failed to geocode locations: {failed_geocodes}"
|
340 |
|
341 |
if progress:
|
342 |
progress(1.0, "Processing complete!")
|
343 |
|
344 |
return temp_map_path, stats, processed_file_path
|
345 |
except Exception as e:
|
346 |
+
import traceback
|
347 |
+
error_details = traceback.format_exc()
|
348 |
+
print(f"Error processing Excel file: {error_details}")
|
349 |
+
|
350 |
if progress:
|
351 |
progress(1.0, f"Error: {str(e)}")
|
352 |
+
return None, f"Error processing file: {str(e)}\n\nDetails: {error_details}", None
|
353 |
|
354 |
# NuExtract Functions
|
355 |
def extract_info(template, text):
|
|
|
381 |
# Process result
|
382 |
result = response.json()
|
383 |
|
384 |
+
# Handle different response formats with careful error handling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
try:
|
386 |
+
if isinstance(result, list):
|
387 |
+
if len(result) > 0:
|
388 |
+
result_text = result[0].get("generated_text", "")
|
389 |
+
else:
|
390 |
+
return "β Empty result list from API", "{}"
|
391 |
+
else:
|
392 |
+
result_text = str(result)
|
393 |
+
|
394 |
+
# Split at output marker if present
|
395 |
+
if "<|output|>" in result_text:
|
396 |
+
split_parts = result_text.split("<|output|>")
|
397 |
+
if len(split_parts) > 1:
|
398 |
+
json_text = split_parts[1].strip()
|
399 |
+
else:
|
400 |
+
json_text = result_text # Fallback if split didn't work as expected
|
401 |
+
else:
|
402 |
+
json_text = result_text
|
403 |
|
404 |
+
# Try to parse as JSON
|
405 |
+
try:
|
406 |
+
extracted = json.loads(json_text)
|
407 |
+
formatted = json.dumps(extracted, indent=2)
|
408 |
+
except json.JSONDecodeError:
|
409 |
+
return "β JSON parsing error", json_text
|
410 |
+
|
411 |
+
return "β
Success", formatted
|
412 |
+
except Exception as inner_e:
|
413 |
+
return f"β Error processing API result: {str(inner_e)}", "{}"
|
414 |
except Exception as e:
|
415 |
return f"β Error: {str(e)}", "{}"
|
416 |
|