oberbics committed on
Commit
5f830c6
Β·
verified Β·
1 Parent(s): df1519d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -202
app.py CHANGED
@@ -10,14 +10,12 @@ import time
10
  import random
11
  from typing import List, Tuple, Optional
12
  import io
13
- import concurrent.futures
14
- from tqdm import tqdm
15
 
16
  # NuExtract API configuration
17
  API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
18
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
19
 
20
- # Geocoding Service with improved performance
21
  class GeocodingService:
22
  def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
23
  if user_agent is None:
@@ -29,7 +27,7 @@ class GeocodingService:
29
  )
30
  self.rate_limit = rate_limit
31
  self.last_request = 0
32
- self.cache = {} # Simple in-memory cache for geocoding results
33
 
34
  def _rate_limit_wait(self):
35
  current_time = time.time()
@@ -66,74 +64,30 @@ class GeocodingService:
66
  return None
67
  return None
68
 
69
- def process_locations(self, locations: str, progress_callback=None) -> List[Optional[Tuple[float, float]]]:
70
  if pd.isna(locations) or not locations:
71
  return []
72
 
73
- # Handle special case with "dateline_locations" prefix
74
- if "dateline_locations" in locations:
75
- # Remove the prefix if present
76
- locations = locations.replace("dateline_locations", "").strip()
77
-
78
- # Improved location parsing to handle complex location names with commas
79
- # This regex-based approach attempts to identify well-formed location patterns
80
  try:
 
81
  import re
82
-
83
- # Try to find patterns like "City, Country" or standalone names
84
- # This handles cities like "Paris, France" as single entities
85
- location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+)?)')
86
- matches = location_pattern.findall(locations)
87
-
88
- # Filter out empty matches and strip whitespace
89
  location_list = [match.strip() for match in matches if match.strip()]
90
 
91
- # If regex didn't work properly, fall back to a simpler approach
92
  if not location_list:
93
- # Simple space-based splitting as a fallback
94
- location_list = [loc.strip() for loc in locations.split() if loc.strip()]
95
- print(f"Using fallback location parsing: {location_list}")
96
 
97
- except Exception as e:
98
- print(f"Error parsing locations: {e}, using simple splitting")
99
- # Simple fallback if regex fails
100
- location_list = [loc.strip() for loc in locations.split() if loc.strip()]
101
-
102
- print(f"Parsed locations: {location_list}")
103
-
104
- # Process locations in parallel with a limited number of workers
105
- return self.process_locations_parallel(location_list, progress_callback)
106
-
107
- def process_locations_parallel(self, location_list, progress_callback=None, max_workers=4) -> List[Optional[Tuple[float, float]]]:
108
- """Process locations in parallel with progress tracking"""
109
- results = [None] * len(location_list)
110
-
111
- # Use a ThreadPoolExecutor for parallel processing
112
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
113
- # Submit all tasks
114
- future_to_index = {executor.submit(self.geocode_location, loc): i
115
- for i, loc in enumerate(location_list)}
116
-
117
- # Process as they complete with progress updates
118
- total = len(future_to_index)
119
- completed = 0
120
-
121
- for future in concurrent.futures.as_completed(future_to_index):
122
- index = future_to_index[future]
123
- try:
124
- results[index] = future.result()
125
- except Exception as e:
126
- print(f"Error processing location: {e}")
127
- results[index] = None
128
 
129
- # Update progress
130
- completed += 1
131
- if progress_callback:
132
- progress_callback(completed, total)
133
- else:
134
- print(f"Geocoded {completed}/{total} locations")
135
-
136
- return results
137
 
138
  # Mapping Functions
139
  def create_location_map(df: pd.DataFrame,
@@ -147,24 +101,35 @@ def create_location_map(df: pd.DataFrame,
147
  # Process each row in the DataFrame
148
  for idx, row in df.iterrows():
149
  coordinates = row[coordinates_col]
150
- places = row[places_col].split(',') if pd.notna(row[places_col]) else []
151
  title = row[title_col] if title_col and pd.notna(row[title_col]) else None
152
 
153
  # Skip if no coordinates
154
  if not coordinates:
155
  continue
156
 
157
- # Make sure places and coordinates lists have the same length
158
- # If places list is shorter, pad it with unnamed locations
 
 
 
 
 
 
 
159
  while len(places) < len(coordinates):
160
- places.append(f"Unnamed Location {len(places)+1}")
161
 
162
- # Add individual markers for each location
163
  for i, coord in enumerate(coordinates):
164
  if coord is not None: # Skip None coordinates
165
  lat, lon = coord
166
- # Safely get place name, use a default if index is out of range
167
- place_name = places[i].strip() if i < len(places) else f"Location {i+1}"
 
 
 
 
168
 
169
  # Create popup content
170
  popup_content = f"<b>{place_name}</b>"
@@ -186,17 +151,13 @@ def create_location_map(df: pd.DataFrame,
186
 
187
  return m
188
 
189
- # Processing Functions with progress updates
190
- def process_excel(file, places_column, progress=None):
191
  # Check if file is None
192
  if file is None:
193
  return None, "No file uploaded", None
194
 
195
  try:
196
- # Update progress
197
- if progress:
198
- progress(0.1, "Reading Excel file...")
199
-
200
  # Handle various file object types that Gradio might provide
201
  if hasattr(file, 'name'):
202
  # Gradio file object
@@ -208,112 +169,22 @@ def process_excel(file, places_column, progress=None):
208
  # Assume it's a filepath string
209
  df = pd.read_excel(file)
210
 
 
 
 
 
211
  if places_column not in df.columns:
212
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
213
 
214
- # Print column names and first few rows for debugging
215
- print(f"Columns in Excel file: {df.columns.tolist()}")
216
- print(f"First 3 rows of data:\n{df.head(3)}")
217
-
218
- if progress:
219
- progress(0.2, "Initializing geocoding...")
220
-
221
  # Initialize the geocoding service
222
  geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
223
 
224
- # Function to update progress during geocoding
225
- def geocoding_progress(completed, total):
226
- if progress:
227
- # Scale progress between 20% and 80%
228
- progress_value = 0.2 + (0.6 * (completed / total))
229
- progress(progress_value, f"Geocoding {completed}/{total} locations...")
230
-
231
- # Process locations and add coordinates with progress tracking
232
- print("Starting geocoding process...")
233
-
234
- # Process each row with progress updates
235
- coordinates_list = []
236
- total_rows = len(df)
237
 
238
- # Create a helper function to safely parse location data from each row
239
- def parse_excel_locations(location_data):
240
- """Safely parse location data from Excel cell"""
241
- if pd.isna(location_data):
242
- return []
243
-
244
- # Convert to string to handle numeric or other data types
245
- location_data = str(location_data).strip()
246
-
247
- # Skip empty strings
248
- if not location_data:
249
- return []
250
-
251
- # Look for recognized patterns and split accordingly
252
- # First, check if it's a comma-separated list
253
- if "," in location_data:
254
- # This could be a list like "Berlin, Hamburg, Munich"
255
- # Or it could contain locations like "Paris, France"
256
-
257
- # Try to intelligently parse based on common patterns
258
- try:
259
- import re
260
-
261
- # Pattern to match city-country pairs or standalone names
262
- # Examples: "Paris, France" or "Berlin" or "New York, NY, USA"
263
- location_pattern = re.compile(r'([A-Za-z\s]+(?:,\s*[A-Za-z\s]+){0,2})')
264
- matches = location_pattern.findall(location_data)
265
-
266
- locations = [match.strip() for match in matches if match.strip()]
267
-
268
- # If our pattern matching didn't work, fall back to simple comma splitting
269
- if not locations:
270
- locations = [loc.strip() for loc in location_data.split(',') if loc.strip()]
271
-
272
- return locations
273
-
274
- except Exception as e:
275
- print(f"Regex parsing failed: {e}")
276
- # Fallback to simple comma splitting
277
- return [loc.strip() for loc in location_data.split(',') if loc.strip()]
278
-
279
- # Otherwise, treat it as a single location or space-separated list
280
- else:
281
- # Check if it might be space-separated
282
- potential_locations = location_data.split()
283
-
284
- # If it just looks like one word with no spaces, return it as a single location
285
- if len(potential_locations) == 1:
286
- return [location_data]
287
-
288
- # If it has multiple words, it could be a single location name with spaces
289
- # or multiple space-separated locations
290
- # For safety, treat it as a single location
291
- return [location_data]
292
-
293
- for idx, row in df.iterrows():
294
- location_data = row[places_column]
295
- print(f"Processing row {idx+1}/{total_rows}, location data: {location_data}")
296
-
297
- # Parse the locations from the Excel cell
298
- location_list = parse_excel_locations(location_data)
299
- print(f"Parsed locations: {location_list}")
300
-
301
- # Now geocode each location
302
- coords = []
303
- for location in location_list:
304
- coord = geocoder.geocode_location(location)
305
- coords.append(coord)
306
- # Update progress
307
- if progress_callback:
308
- progress_callback(len(coords), len(location_list))
309
-
310
- coordinates_list.append(coords)
311
- print(f"Processed row {idx+1}/{total_rows}, found coordinates: {coords}")
312
-
313
- df['coordinates'] = coordinates_list
314
-
315
- if progress:
316
- progress(0.8, "Creating map...")
317
 
318
  # Create the map
319
  map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
@@ -323,33 +194,24 @@ def process_excel(file, places_column, progress=None):
323
  map_obj.save(temp_map_path)
324
 
325
  # Save the processed DataFrame to Excel
326
- if progress:
327
- progress(0.9, "Saving results...")
328
-
329
  processed_file_path = "processed_data.xlsx"
330
  df.to_excel(processed_file_path, index=False)
331
 
332
  # Statistics
333
  total_locations = len(df)
334
- successful_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is not None)
335
- failed_geocodes = sum(1 for coords in coordinates_list for coord in coords if coord is None)
336
 
337
  stats = f"Total data rows: {total_locations}\n"
338
  stats += f"Successfully geocoded locations: {successful_geocodes}\n"
339
  stats += f"Failed to geocode locations: {failed_geocodes}"
340
 
341
- if progress:
342
- progress(1.0, "Processing complete!")
343
-
344
  return temp_map_path, stats, processed_file_path
345
  except Exception as e:
346
  import traceback
347
- error_details = traceback.format_exc()
348
- print(f"Error processing Excel file: {error_details}")
349
-
350
- if progress:
351
- progress(1.0, f"Error: {str(e)}")
352
- return None, f"Error processing file: {str(e)}\n\nDetails: {error_details}", None
353
 
354
  # NuExtract Functions
355
  def extract_info(template, text):
@@ -381,23 +243,23 @@ def extract_info(template, text):
381
  # Process result
382
  result = response.json()
383
 
384
- # Handle different response formats with careful error handling
385
  try:
386
  if isinstance(result, list):
387
  if len(result) > 0:
388
  result_text = result[0].get("generated_text", "")
389
  else:
390
- return "❌ Empty result list from API", "{}"
391
  else:
392
  result_text = str(result)
393
 
394
  # Split at output marker if present
395
  if "<|output|>" in result_text:
396
- split_parts = result_text.split("<|output|>")
397
- if len(split_parts) > 1:
398
- json_text = split_parts[1].strip()
399
  else:
400
- json_text = result_text # Fallback if split didn't work as expected
401
  else:
402
  json_text = result_text
403
 
@@ -410,7 +272,7 @@ def extract_info(template, text):
410
 
411
  return "βœ… Success", formatted
412
  except Exception as inner_e:
413
- return f"❌ Error processing API result: {str(inner_e)}", "{}"
414
  except Exception as e:
415
  return f"❌ Error: {str(e)}", "{}"
416
 
@@ -456,21 +318,16 @@ with gr.Blocks() as demo:
456
  process_btn = gr.Button("Process and Map", variant="primary")
457
 
458
  with gr.Column():
459
- progress_bar = gr.Progress()
460
  map_output = gr.HTML(label="Map Visualization")
461
  stats_output = gr.Textbox(label="Statistics", lines=3)
462
  processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
463
 
464
- def process_and_map(file, column, progress=gr.Progress()):
465
  if file is None:
466
  return None, "Please upload an Excel file", None
467
 
468
  try:
469
- # Initialize progress
470
- progress(0, "Starting process...")
471
-
472
- # Process the file with progress updates
473
- map_path, stats, processed_path = process_excel(file, column, progress)
474
 
475
  if map_path and processed_path:
476
  with open(map_path, "r") as f:
@@ -480,6 +337,9 @@ with gr.Blocks() as demo:
480
  else:
481
  return None, stats, None
482
  except Exception as e:
 
 
 
483
  return None, f"Error: {str(e)}", None
484
 
485
  process_btn.click(
 
10
  import random
11
  from typing import List, Tuple, Optional
12
  import io
 
 
13
 
14
  # NuExtract API configuration
15
  API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
16
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
17
 
18
+ # Geocoding Service
19
  class GeocodingService:
20
  def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
21
  if user_agent is None:
 
27
  )
28
  self.rate_limit = rate_limit
29
  self.last_request = 0
30
+ self.cache = {} # Simple in-memory cache
31
 
32
  def _rate_limit_wait(self):
33
  current_time = time.time()
 
64
  return None
65
  return None
66
 
67
+ def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:
68
  if pd.isna(locations) or not locations:
69
  return []
70
 
 
 
 
 
 
 
 
71
  try:
72
+ # First try to intelligently parse
73
  import re
74
+ pattern = r"([^,]+(?:,\s*[A-Za-z]+)?)"
75
+ matches = re.findall(pattern, locations)
 
 
 
 
 
76
  location_list = [match.strip() for match in matches if match.strip()]
77
 
78
+ # If regex finds nothing, fall back to simple comma splitting
79
  if not location_list:
80
+ location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
 
 
81
 
82
+ # For debugging
83
+ print(f"Parsed '{locations}' into: {location_list}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ return [self.geocode_location(loc) for loc in location_list]
86
+ except Exception as e:
87
+ print(f"Error parsing locations '{locations}': {e}")
88
+ # Fall back to simple method
89
+ location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
90
+ return [self.geocode_location(loc) for loc in location_list]
 
 
91
 
92
  # Mapping Functions
93
  def create_location_map(df: pd.DataFrame,
 
101
  # Process each row in the DataFrame
102
  for idx, row in df.iterrows():
103
  coordinates = row[coordinates_col]
104
+ places_text = row[places_col] if pd.notna(row[places_col]) else ""
105
  title = row[title_col] if title_col and pd.notna(row[title_col]) else None
106
 
107
  # Skip if no coordinates
108
  if not coordinates:
109
  continue
110
 
111
+ # Parse places into a list
112
+ try:
113
+ places = [p.strip() for p in places_text.split(',') if p.strip()]
114
+ except:
115
+ # Fall back to treating it as a single place if splitting fails
116
+ places = [places_text] if places_text else []
117
+
118
+ # Ensure places and coordinates have compatible lengths
119
+ # If places is shorter, add placeholder names
120
  while len(places) < len(coordinates):
121
+ places.append(f"Location {len(places) + 1}")
122
 
123
+ # Add markers for each coordinate
124
  for i, coord in enumerate(coordinates):
125
  if coord is not None: # Skip None coordinates
126
  lat, lon = coord
127
+
128
+ # Get place name safely
129
+ if i < len(places):
130
+ place_name = places[i]
131
+ else:
132
+ place_name = f"Location {i + 1}"
133
 
134
  # Create popup content
135
  popup_content = f"<b>{place_name}</b>"
 
151
 
152
  return m
153
 
154
+ # Processing Functions
155
+ def process_excel(file, places_column):
156
  # Check if file is None
157
  if file is None:
158
  return None, "No file uploaded", None
159
 
160
  try:
 
 
 
 
161
  # Handle various file object types that Gradio might provide
162
  if hasattr(file, 'name'):
163
  # Gradio file object
 
169
  # Assume it's a filepath string
170
  df = pd.read_excel(file)
171
 
172
+ # Print column names for debugging
173
+ print(f"Columns in Excel file: {list(df.columns)}")
174
+ print(f"Preview of data:\n{df.head(2)}")
175
+
176
  if places_column not in df.columns:
177
  return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
178
 
 
 
 
 
 
 
 
179
  # Initialize the geocoding service
180
  geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
181
 
182
+ # Process locations and add coordinates
183
+ print(f"Processing locations from column: {places_column}")
184
+ print(f"First few values: {df[places_column].head().tolist()}")
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # Apply geocoding to each row
187
+ df['coordinates'] = df[places_column].apply(geocoder.process_locations)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  # Create the map
190
  map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
 
194
  map_obj.save(temp_map_path)
195
 
196
  # Save the processed DataFrame to Excel
 
 
 
197
  processed_file_path = "processed_data.xlsx"
198
  df.to_excel(processed_file_path, index=False)
199
 
200
  # Statistics
201
  total_locations = len(df)
202
+ successful_geocodes = sum(1 for row in df['coordinates'] for coord in row if coord is not None)
203
+ failed_geocodes = sum(1 for row in df['coordinates'] for coord in row if coord is None)
204
 
205
  stats = f"Total data rows: {total_locations}\n"
206
  stats += f"Successfully geocoded locations: {successful_geocodes}\n"
207
  stats += f"Failed to geocode locations: {failed_geocodes}"
208
 
 
 
 
209
  return temp_map_path, stats, processed_file_path
210
  except Exception as e:
211
  import traceback
212
+ trace = traceback.format_exc()
213
+ print(f"Error processing file: {e}\n{trace}")
214
+ return None, f"Error processing file: {str(e)}", None
 
 
 
215
 
216
  # NuExtract Functions
217
  def extract_info(template, text):
 
243
  # Process result
244
  result = response.json()
245
 
246
+ # Handle different response formats
247
  try:
248
  if isinstance(result, list):
249
  if len(result) > 0:
250
  result_text = result[0].get("generated_text", "")
251
  else:
252
+ return "❌ Empty result list", "{}"
253
  else:
254
  result_text = str(result)
255
 
256
  # Split at output marker if present
257
  if "<|output|>" in result_text:
258
+ parts = result_text.split("<|output|>")
259
+ if len(parts) > 1:
260
+ json_text = parts[1].strip()
261
  else:
262
+ json_text = result_text
263
  else:
264
  json_text = result_text
265
 
 
272
 
273
  return "βœ… Success", formatted
274
  except Exception as inner_e:
275
+ return f"❌ Error processing result: {str(inner_e)}", "{}"
276
  except Exception as e:
277
  return f"❌ Error: {str(e)}", "{}"
278
 
 
318
  process_btn = gr.Button("Process and Map", variant="primary")
319
 
320
  with gr.Column():
 
321
  map_output = gr.HTML(label="Map Visualization")
322
  stats_output = gr.Textbox(label="Statistics", lines=3)
323
  processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
324
 
325
+ def process_and_map(file, column):
326
  if file is None:
327
  return None, "Please upload an Excel file", None
328
 
329
  try:
330
+ map_path, stats, processed_path = process_excel(file, column)
 
 
 
 
331
 
332
  if map_path and processed_path:
333
  with open(map_path, "r") as f:
 
337
  else:
338
  return None, stats, None
339
  except Exception as e:
340
+ import traceback
341
+ trace = traceback.format_exc()
342
+ print(f"Error in process_and_map: {e}\n{trace}")
343
  return None, f"Error: {str(e)}", None
344
 
345
  process_btn.click(