Spaces:

oberbics
/

HistorySpace

Sleeping

App Files Files Community

oberbics committed on Apr 14

Commit

df1519d

verified ·

1 Parent(s): 65988e9

Update app.py

Browse files

Files changed (1) hide show

app.py +145 -29

app.py CHANGED Viewed

@@ -70,8 +70,37 @@ class GeocodingService:
         if pd.isna(locations) or not locations:
             return []
-        location_list = [loc.strip() for loc in locations.split(',')]
         # Process locations in parallel with a limited number of workers
         return self.process_locations_parallel(location_list, progress_callback)
@@ -182,6 +211,10 @@ def process_excel(file, places_column, progress=None):
         if places_column not in df.columns:
             return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
         if progress:
             progress(0.2, "Initializing geocoding...")
@@ -202,11 +235,80 @@ def process_excel(file, places_column, progress=None):
         coordinates_list = []
         total_rows = len(df)
         for idx, row in df.iterrows():
-            locations = row[places_column]
-            coords = geocoder.process_locations(locations, geocoding_progress)
             coordinates_list.append(coords)
-            print(f"Processed row {idx+1}/{total_rows}")
         df['coordinates'] = coordinates_list
@@ -229,21 +331,25 @@ def process_excel(file, places_column, progress=None):
         # Statistics
         total_locations = len(df)
-        successful_geocodes = df['coordinates'].apply(lambda x: len([c for c in x if c is not None])).sum()
-        failed_geocodes = df['coordinates'].apply(lambda x: len([c for c in x if c is None])).sum()
-        stats = f"Total locations: {total_locations}\n"
-        stats += f"Successfully geocoded: {successful_geocodes}\n"
-        stats += f"Failed to geocode: {failed_geocodes}"
         if progress:
             progress(1.0, "Processing complete!")
         return temp_map_path, stats, processed_file_path
     except Exception as e:
         if progress:
             progress(1.0, f"Error: {str(e)}")
-        return None, f"Error processing file: {str(e)}", None
 # NuExtract Functions
 def extract_info(template, text):
@@ -275,26 +381,36 @@ def extract_info(template, text):
         # Process result
         result = response.json()
-        # Handle different response formats
-        if isinstance(result, list) and len(result) > 0:
-            result_text = result[0].get("generated_text", "")
-        else:
-            result_text = str(result)
-        # Split at output marker if present
-        if "<|output|>" in result_text:
-            json_text = result_text.split("<|output|>")[1].strip()
-        else:
-            json_text = result_text
-        # Try to parse as JSON
         try:
-            extracted = json.loads(json_text)
-            formatted = json.dumps(extracted, indent=2)
-        except json.JSONDecodeError:
-            return "❌ JSON parsing error", json_text
-        return "✅ Success", formatted
     except Exception as e:
         return f"❌ Error: {str(e)}", "{}"

         if pd.isna(locations) or not locations:
             return []
+        # Handle special case with "dateline_locations" prefix
+        if "dateline_locations" in locations:
+            # Remove the prefix if present
+            locations = locations.replace("dateline_locations", "").strip()
+        # Improved location parsing to handle complex location names with commas
+        # This regex-based approach attempts to identify well-formed location patterns
+        try:
+            import re
+            # Try to find patterns like "City, Country" or standalone names
+            # This handles cities like "Paris, France" as single entities
+            location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+)?)')
+            matches = location_pattern.findall(locations)
+            # Filter out empty matches and strip whitespace
+            location_list = [match.strip() for match in matches if match.strip()]
+            # If regex didn't work properly, fall back to a simpler approach
+            if not location_list:
+                # Simple space-based splitting as a fallback
+                location_list = [loc.strip() for loc in locations.split() if loc.strip()]
+                print(f"Using fallback location parsing: {location_list}")
+        except Exception as e:
+            print(f"Error parsing locations: {e}, using simple splitting")
+            # Simple fallback if regex fails
+            location_list = [loc.strip() for loc in locations.split() if loc.strip()]
+        print(f"Parsed locations: {location_list}")
         # Process locations in parallel with a limited number of workers
         return self.process_locations_parallel(location_list, progress_callback)
         if places_column not in df.columns:
             return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
+        # Print column names and first few rows for debugging
+        print(f"Columns in Excel file: {df.columns.tolist()}")
+        print(f"First 3 rows of data:\n{df.head(3)}")
         if progress:
             progress(0.2, "Initializing geocoding...")
         coordinates_list = []
         total_rows = len(df)
+        # Create a helper function to safely parse location data from each row
+        def parse_excel_locations(location_data):
+            """Safely parse location data from Excel cell"""
+            if pd.isna(location_data):
+                return []
+            # Convert to string to handle numeric or other data types
+            location_data = str(location_data).strip()
+            # Skip empty strings
+            if not location_data:
+                return []
+            # Look for recognized patterns and split accordingly
+            # First, check if it's a comma-separated list
+            if "," in location_data:
+                # This could be a list like "Berlin, Hamburg, Munich"
+                # Or it could contain locations like "Paris, France"
+                # Try to intelligently parse based on common patterns
+                try:
+                    import re
+                    # Pattern to match city-country pairs or standalone names
+                    # Examples: "Paris, France" or "Berlin" or "New York, NY, USA"
+                    location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+){0,2})')
+                    matches = location_pattern.findall(location_data)
+                    locations = [match.strip() for match in matches if match.strip()]
+                    # If our pattern matching didn't work, fall back to simple comma splitting
+                    if not locations:
+                        locations = [loc.strip() for loc in location_data.split(',') if loc.strip()]
+                    return locations
+                except Exception as e:
+                    print(f"Regex parsing failed: {e}")
+                    # Fallback to simple comma splitting
+                    return [loc.strip() for loc in location_data.split(',') if loc.strip()]
+            # Otherwise, treat it as a single location or space-separated list
+            else:
+                # Check if it might be space-separated
+                potential_locations = location_data.split()
+                # If it just looks like one word with no spaces, return it as a single location
+                if len(potential_locations) == 1:
+                    return [location_data]
+                # If it has multiple words, it could be a single location name with spaces
+                # or multiple space-separated locations
+                # For safety, treat it as a single location
+                return [location_data]
         for idx, row in df.iterrows():
+            location_data = row[places_column]
+            print(f"Processing row {idx+1}/{total_rows}, location data: {location_data}")
+            # Parse the locations from the Excel cell
+            location_list = parse_excel_locations(location_data)
+            print(f"Parsed locations: {location_list}")
+            # Now geocode each location
+            coords = []
+            for location in location_list:
+                coord = geocoder.geocode_location(location)
+                coords.append(coord)
+                # Update progress
+                if progress_callback:
+                    progress_callback(len(coords), len(location_list))
             coordinates_list.append(coords)
+            print(f"Processed row {idx+1}/{total_rows}, found coordinates: {coords}")
         df['coordinates'] = coordinates_list
         # Statistics
         total_locations = len(df)
+        successful_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is not None)
+        failed_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is None)
+        stats = f"Total data rows: {total_locations}\n"
+        stats += f"Successfully geocoded locations: {successful_geocodes}\n"
+        stats += f"Failed to geocode locations: {failed_geocodes}"
         if progress:
             progress(1.0, "Processing complete!")
         return temp_map_path, stats, processed_file_path
     except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        print(f"Error processing Excel file: {error_details}")
         if progress:
             progress(1.0, f"Error: {str(e)}")
+        return None, f"Error processing file: {str(e)}\n\nDetails: {error_details}", None
 # NuExtract Functions
 def extract_info(template, text):
         # Process result
         result = response.json()
+        # Handle different response formats with careful error handling
         try:
+            if isinstance(result, list):
+                if len(result) > 0:
+                    result_text = result[0].get("generated_text", "")
+                else:
+                    return "❌ Empty result list from API", "{}"
+            else:
+                result_text = str(result)
+            # Split at output marker if present
+            if "<|output|>" in result_text:
+                split_parts = result_text.split("<|output|>")
+                if len(split_parts) > 1:
+                    json_text = split_parts[1].strip()
+                else:
+                    json_text = result_text  # Fallback if split didn't work as expected
+            else:
+                json_text = result_text
+            # Try to parse as JSON
+            try:
+                extracted = json.loads(json_text)
+                formatted = json.dumps(extracted, indent=2)
+            except json.JSONDecodeError:
+                return "❌ JSON parsing error", json_text
+            return "✅ Success", formatted
+        except Exception as inner_e:
+            return f"❌ Error processing API result: {str(inner_e)}", "{}"
     except Exception as e:
         return f"❌ Error: {str(e)}", "{}"