Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -10,12 +10,14 @@ import time
|
|
10 |
import random
|
11 |
from typing import List, Tuple, Optional
|
12 |
import io
|
|
|
|
|
13 |
|
14 |
# NuExtract API configuration
|
15 |
API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
|
16 |
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
|
17 |
|
18 |
-
# Geocoding Service
|
19 |
class GeocodingService:
|
20 |
def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
|
21 |
if user_agent is None:
|
@@ -27,6 +29,7 @@ class GeocodingService:
|
|
27 |
)
|
28 |
self.rate_limit = rate_limit
|
29 |
self.last_request = 0
|
|
|
30 |
|
31 |
def _rate_limit_wait(self):
|
32 |
current_time = time.time()
|
@@ -36,29 +39,72 @@ class GeocodingService:
|
|
36 |
self.last_request = time.time()
|
37 |
|
38 |
def geocode_location(self, location: str, max_retries: int = 3) -> Optional[Tuple[float, float]]:
|
|
|
|
|
|
|
|
|
39 |
for attempt in range(max_retries):
|
40 |
try:
|
41 |
self._rate_limit_wait()
|
42 |
location_data = self.geolocator.geocode(location)
|
43 |
if location_data:
|
44 |
-
|
|
|
|
|
|
|
|
|
45 |
return None
|
46 |
except (GeocoderTimedOut, GeocoderServiceError) as e:
|
47 |
if attempt == max_retries - 1:
|
48 |
print(f"Failed to geocode '{location}' after {max_retries} attempts: {e}")
|
|
|
49 |
return None
|
50 |
time.sleep(2 ** attempt) # Exponential backoff
|
51 |
except Exception as e:
|
52 |
print(f"Error geocoding '{location}': {e}")
|
|
|
53 |
return None
|
54 |
return None
|
55 |
|
56 |
-
def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:
|
57 |
if pd.isna(locations) or not locations:
|
58 |
return []
|
59 |
|
60 |
location_list = [loc.strip() for loc in locations.split(',')]
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
# Mapping Functions
|
64 |
def create_location_map(df: pd.DataFrame,
|
@@ -105,13 +151,17 @@ def create_location_map(df: pd.DataFrame,
|
|
105 |
|
106 |
return m
|
107 |
|
108 |
-
# Processing Functions
|
109 |
-
def process_excel(file, places_column):
|
110 |
# Check if file is None
|
111 |
if file is None:
|
112 |
return None, "No file uploaded", None
|
113 |
|
114 |
try:
|
|
|
|
|
|
|
|
|
115 |
# Handle various file object types that Gradio might provide
|
116 |
if hasattr(file, 'name'):
|
117 |
# Gradio file object
|
@@ -126,11 +176,36 @@ def process_excel(file, places_column):
|
|
126 |
if places_column not in df.columns:
|
127 |
return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
|
128 |
|
|
|
|
|
|
|
129 |
# Initialize the geocoding service
|
130 |
geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
|
131 |
|
132 |
-
#
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
# Create the map
|
136 |
map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
|
@@ -140,6 +215,9 @@ def process_excel(file, places_column):
|
|
140 |
map_obj.save(temp_map_path)
|
141 |
|
142 |
# Save the processed DataFrame to Excel
|
|
|
|
|
|
|
143 |
processed_file_path = "processed_data.xlsx"
|
144 |
df.to_excel(processed_file_path, index=False)
|
145 |
|
@@ -152,8 +230,13 @@ def process_excel(file, places_column):
|
|
152 |
stats += f"Successfully geocoded: {successful_geocodes}\n"
|
153 |
stats += f"Failed to geocode: {failed_geocodes}"
|
154 |
|
|
|
|
|
|
|
155 |
return temp_map_path, stats, processed_file_path
|
156 |
except Exception as e:
|
|
|
|
|
157 |
return None, f"Error processing file: {str(e)}", None
|
158 |
|
159 |
# NuExtract Functions
|
@@ -251,16 +334,21 @@ with gr.Blocks() as demo:
|
|
251 |
process_btn = gr.Button("Process and Map", variant="primary")
|
252 |
|
253 |
with gr.Column():
|
|
|
254 |
map_output = gr.HTML(label="Map Visualization")
|
255 |
stats_output = gr.Textbox(label="Statistics", lines=3)
|
256 |
processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
|
257 |
|
258 |
-
def process_and_map(file, column):
|
259 |
if file is None:
|
260 |
return None, "Please upload an Excel file", None
|
261 |
|
262 |
try:
|
263 |
-
|
|
|
|
|
|
|
|
|
264 |
|
265 |
if map_path and processed_path:
|
266 |
with open(map_path, "r") as f:
|
|
|
10 |
import random
|
11 |
from typing import List, Tuple, Optional
|
12 |
import io
|
13 |
+
import concurrent.futures
|
14 |
+
from tqdm import tqdm
|
15 |
|
16 |
# NuExtract API configuration
|
17 |
API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
|
18 |
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
|
19 |
|
20 |
+
# Geocoding Service with improved performance
|
21 |
class GeocodingService:
|
22 |
def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
|
23 |
if user_agent is None:
|
|
|
29 |
)
|
30 |
self.rate_limit = rate_limit
|
31 |
self.last_request = 0
|
32 |
+
self.cache = {} # Simple in-memory cache for geocoding results
|
33 |
|
34 |
def _rate_limit_wait(self):
|
35 |
current_time = time.time()
|
|
|
39 |
self.last_request = time.time()
|
40 |
|
41 |
def geocode_location(self, location: str, max_retries: int = 3) -> Optional[Tuple[float, float]]:
    """Resolve a place name to a ``(latitude, longitude)`` pair.

    Outcomes, including failures, are stored in ``self.cache`` so a
    repeated place name is answered without contacting the geocoder again.

    Args:
        location: Free-text place name to look up.
        max_retries: Maximum number of attempts when the geocoder times
            out or reports a service error.

    Returns:
        The coordinates of the match, or ``None`` when the name is blank,
        nothing is found, or every attempt fails.
    """
    # A blank name cannot match anything; answer immediately instead of
    # spending a rate-limit slot and a network round trip on it.
    if not location or (isinstance(location, str) and not location.strip()):
        return None

    # Check cache first. ``None`` is a legitimate cached value, so test
    # membership rather than truthiness.
    if location in self.cache:
        return self.cache[location]

    for attempt in range(max_retries):
        try:
            self._rate_limit_wait()
            location_data = self.geolocator.geocode(location)
            if location_data:
                # Store in cache and return
                coords = (location_data.latitude, location_data.longitude)
                self.cache[location] = coords
                return coords
            # Cache "not found" too, so the same miss is not re-queried.
            self.cache[location] = None
            return None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            if attempt == max_retries - 1:
                print(f"Failed to geocode '{location}' after {max_retries} attempts: {e}")
                # The failure is cached as well: later occurrences of this
                # name return None without another round of retries.
                self.cache[location] = None
                return None
            time.sleep(2 ** attempt)  # Exponential backoff
        except Exception as e:
            # Any other error is treated as final for this name.
            print(f"Error geocoding '{location}': {e}")
            self.cache[location] = None
            return None
    # Only reached when max_retries is zero or negative.
    return None
|
68 |
|
69 |
+
def process_locations(self, locations: str, progress_callback=None) -> List[Optional[Tuple[float, float]]]:
    """Geocode every place named in one comma-separated cell value.

    Args:
        locations: Cell content holding place names separated by commas.
            Missing values (``None``/NaN) and empty values yield ``[]``.
            Non-string values are geocoded by their textual form.
        progress_callback: Optional ``callback(completed, total)`` that is
            forwarded to ``process_locations_parallel``.

    Returns:
        One entry per comma-separated name, in the original order; each
        entry is whatever ``process_locations_parallel`` produced for it.
    """
    if pd.isna(locations) or not locations:
        return []

    # Cells that were parsed as numbers (for example postal codes) have no
    # ``split`` method; use their text instead of raising AttributeError.
    if not isinstance(locations, str):
        if isinstance(locations, float) and locations.is_integer():
            # Avoid a spurious ".0" suffix on integral floats.
            locations = int(locations)
        locations = str(locations)

    location_list = [loc.strip() for loc in locations.split(',')]

    # Process locations in parallel with a limited number of workers
    return self.process_locations_parallel(location_list, progress_callback)
|
77 |
+
|
78 |
+
def process_locations_parallel(self, location_list, progress_callback=None, max_workers=4) -> List[Optional[Tuple[float, float]]]:
    """Geocode a list of place names on a thread pool with progress tracking.

    Each distinct name is submitted to ``self.geocode_location`` once and
    its outcome is copied to every position where that name occurs.

    Args:
        location_list: Place names to geocode; duplicates are allowed.
        progress_callback: Optional ``callback(completed, total)`` invoked
            once per entry of ``location_list`` as results arrive. When
            omitted, progress is printed instead.
        max_workers: Upper bound on concurrently running lookups.

    Returns:
        A list aligned with ``location_list`` holding the value returned
        by ``geocode_location`` for each name, or ``None`` where the
        lookup raised.
    """
    results = [None] * len(location_list)
    if not location_list:
        # Nothing to do; skip creating a thread pool.
        return results

    # Group positions by name so a repeated name costs a single lookup.
    # Submitting duplicates separately lets them all pass the cache check
    # in geocode_location before any of them has stored a result.
    positions_by_location = {}
    for i, loc in enumerate(location_list):
        positions_by_location.setdefault(loc, []).append(i)

    total = len(location_list)
    completed = 0

    # NOTE(review): lookups run on several threads while the rate limiter
    # state appears to be unsynchronised - confirm request spacing holds.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit one task per distinct name
        future_to_positions = {
            executor.submit(self.geocode_location, loc): positions
            for loc, positions in positions_by_location.items()
        }

        # Process as they complete with progress updates
        for future in concurrent.futures.as_completed(future_to_positions):
            try:
                coords = future.result()
            except Exception as e:
                print(f"Error processing location: {e}")
                coords = None

            # Report every covered position individually so callers still
            # see ``completed`` advance 1, 2, ..., total.
            for index in future_to_positions[future]:
                results[index] = coords
                completed += 1
                if progress_callback:
                    progress_callback(completed, total)
                else:
                    print(f"Geocoded {completed}/{total} locations")

    return results
|
108 |
|
109 |
# Mapping Functions
|
110 |
def create_location_map(df: pd.DataFrame,
|
|
|
151 |
|
152 |
return m
|
153 |
|
154 |
+
# Processing Functions with progress updates
|
155 |
+
def process_excel(file, places_column, progress=None):
|
156 |
# Check if file is None
|
157 |
if file is None:
|
158 |
return None, "No file uploaded", None
|
159 |
|
160 |
try:
|
161 |
+
# Update progress
|
162 |
+
if progress:
|
163 |
+
progress(0.1, "Reading Excel file...")
|
164 |
+
|
165 |
# Handle various file object types that Gradio might provide
|
166 |
if hasattr(file, 'name'):
|
167 |
# Gradio file object
|
|
|
176 |
if places_column not in df.columns:
|
177 |
return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
|
178 |
|
179 |
+
if progress:
|
180 |
+
progress(0.2, "Initializing geocoding...")
|
181 |
+
|
182 |
# Initialize the geocoding service
|
183 |
geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
|
184 |
|
185 |
+
# Function to update progress during geocoding
|
186 |
+
def geocoding_progress(completed, total):
|
187 |
+
if progress:
|
188 |
+
# Scale progress between 20% and 80%
|
189 |
+
progress_value = 0.2 + (0.6 * (completed / total))
|
190 |
+
progress(progress_value, f"Geocoding {completed}/{total} locations...")
|
191 |
+
|
192 |
+
# Process locations and add coordinates with progress tracking
|
193 |
+
print("Starting geocoding process...")
|
194 |
+
|
195 |
+
# Process each row with progress updates
|
196 |
+
coordinates_list = []
|
197 |
+
total_rows = len(df)
|
198 |
+
|
199 |
+
for idx, row in df.iterrows():
|
200 |
+
locations = row[places_column]
|
201 |
+
coords = geocoder.process_locations(locations, geocoding_progress)
|
202 |
+
coordinates_list.append(coords)
|
203 |
+
print(f"Processed row {idx+1}/{total_rows}")
|
204 |
+
|
205 |
+
df['coordinates'] = coordinates_list
|
206 |
+
|
207 |
+
if progress:
|
208 |
+
progress(0.8, "Creating map...")
|
209 |
|
210 |
# Create the map
|
211 |
map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
|
|
|
215 |
map_obj.save(temp_map_path)
|
216 |
|
217 |
# Save the processed DataFrame to Excel
|
218 |
+
if progress:
|
219 |
+
progress(0.9, "Saving results...")
|
220 |
+
|
221 |
processed_file_path = "processed_data.xlsx"
|
222 |
df.to_excel(processed_file_path, index=False)
|
223 |
|
|
|
230 |
stats += f"Successfully geocoded: {successful_geocodes}\n"
|
231 |
stats += f"Failed to geocode: {failed_geocodes}"
|
232 |
|
233 |
+
if progress:
|
234 |
+
progress(1.0, "Processing complete!")
|
235 |
+
|
236 |
return temp_map_path, stats, processed_file_path
|
237 |
except Exception as e:
|
238 |
+
if progress:
|
239 |
+
progress(1.0, f"Error: {str(e)}")
|
240 |
return None, f"Error processing file: {str(e)}", None
|
241 |
|
242 |
# NuExtract Functions
|
|
|
334 |
process_btn = gr.Button("Process and Map", variant="primary")
|
335 |
|
336 |
with gr.Column():
|
337 |
+
progress_bar = gr.Progress()
|
338 |
map_output = gr.HTML(label="Map Visualization")
|
339 |
stats_output = gr.Textbox(label="Statistics", lines=3)
|
340 |
processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
|
341 |
|
342 |
+
def process_and_map(file, column, progress=gr.Progress()):
|
343 |
if file is None:
|
344 |
return None, "Please upload an Excel file", None
|
345 |
|
346 |
try:
|
347 |
+
# Initialize progress
|
348 |
+
progress(0, "Starting process...")
|
349 |
+
|
350 |
+
# Process the file with progress updates
|
351 |
+
map_path, stats, processed_path = process_excel(file, column, progress)
|
352 |
|
353 |
if map_path and processed_path:
|
354 |
with open(map_path, "r") as f:
|