oberbics committed on
Commit
ce5e315
·
verified ·
1 Parent(s): 2cf9cb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -325
app.py CHANGED
@@ -1,352 +1,188 @@
1
  import gradio as gr
2
- import json
3
- import requests
4
- import os
5
  import pandas as pd
6
  import folium
7
  from geopy.geocoders import Nominatim
8
- from geopy.exc import GeocoderTimedOut, GeocoderServiceError
 
9
  import time
10
- import random
11
- from typing import List, Tuple, Optional
12
- import io
13
 
14
- # NuExtract API configuration
15
- API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
16
- headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
17
-
18
- # Geocoding Service
19
- class GeocodingService:
20
- def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
21
- if user_agent is None:
22
- user_agent = f"python_geocoding_script_{random.randint(1000, 9999)}"
23
 
 
 
 
24
  self.geolocator = Nominatim(
25
- user_agent=user_agent,
26
- timeout=timeout
27
  )
28
- self.rate_limit = rate_limit
29
- self.last_request = 0
30
- self.cache = {} # Simple in-memory cache
31
-
32
- def _rate_limit_wait(self):
33
- current_time = time.time()
34
- time_since_last = current_time - self.last_request
35
- if time_since_last < self.rate_limit:
36
- time.sleep(self.rate_limit - time_since_last)
37
- self.last_request = time.time()
38
-
39
- def geocode_location(self, location: str, max_retries: int = 3) -> Optional[Tuple[float, float]]:
40
- # Check cache first
41
  if location in self.cache:
42
  return self.cache[location]
43
 
44
- for attempt in range(max_retries):
45
- try:
46
- self._rate_limit_wait()
47
- location_data = self.geolocator.geocode(location)
48
- if location_data:
49
- # Store in cache and return
50
- self.cache[location] = (location_data.latitude, location_data.longitude)
51
- return self.cache[location]
52
- # Cache None results too
53
- self.cache[location] = None
54
- return None
55
- except (GeocoderTimedOut, GeocoderServiceError) as e:
56
- if attempt == max_retries - 1:
57
- print(f"Failed to geocode '{location}' after {max_retries} attempts: {e}")
58
- self.cache[location] = None
59
- return None
60
- time.sleep(2 ** attempt) # Exponential backoff
61
- except Exception as e:
62
- print(f"Error geocoding '{location}': {e}")
63
- self.cache[location] = None
64
- return None
65
- return None
66
-
67
- def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:
68
- if pd.isna(locations) or not locations:
69
- return []
70
-
71
  try:
72
- # First try to intelligently parse
73
- import re
74
- pattern = r"([^,]+(?:,\s*[A-Za-z]+)?)"
75
- matches = re.findall(pattern, locations)
76
- location_list = [match.strip() for match in matches if match.strip()]
77
-
78
- # If regex finds nothing, fall back to simple comma splitting
79
- if not location_list:
80
- location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
81
-
82
- # For debugging
83
- print(f"Parsed '{locations}' into: {location_list}")
84
-
85
- return [self.geocode_location(loc) for loc in location_list]
86
  except Exception as e:
87
- print(f"Error parsing locations '{locations}': {e}")
88
- # Fall back to simple method
89
- location_list = [loc.strip() for loc in locations.split(',') if loc.strip()]
90
- return [self.geocode_location(loc) for loc in location_list]
91
-
92
- # Mapping Functions
93
- def create_location_map(df: pd.DataFrame,
94
- coordinates_col: str = 'coordinates',
95
- places_col: str = 'places',
96
- title_col: Optional[str] = None) -> folium.Map:
97
- # Initialize the map
98
- m = folium.Map(location=[0, 0], zoom_start=2)
99
- all_coords = []
100
-
101
- # Process each row in the DataFrame
102
- for idx, row in df.iterrows():
103
- coordinates = row[coordinates_col]
104
- places_text = row[places_col] if pd.notna(row[places_col]) else ""
105
- title = row[title_col] if title_col and pd.notna(row[title_col]) else None
106
-
107
- # Skip if no coordinates
108
- if not coordinates:
109
- continue
110
-
111
- # Parse places into a list
112
- try:
113
- places = [p.strip() for p in places_text.split(',') if p.strip()]
114
- except:
115
- # Fall back to treating it as a single place if splitting fails
116
- places = [places_text] if places_text else []
117
 
118
- # Ensure places and coordinates have compatible lengths
119
- # If places is shorter, add placeholder names
120
- while len(places) < len(coordinates):
121
- places.append(f"Location {len(places) + 1}")
122
-
123
- # Add markers for each coordinate
124
- for i, coord in enumerate(coordinates):
125
- if coord is not None: # Skip None coordinates
126
- lat, lon = coord
127
-
128
- # Get place name safely
129
- if i < len(places):
130
- place_name = places[i]
131
- else:
132
- place_name = f"Location {i + 1}"
133
-
134
- # Create popup content
135
- popup_content = f"<b>{place_name}</b>"
136
- if title:
137
- popup_content += f"<br>{title}"
138
-
139
- # Add marker to the map
140
- folium.Marker(
141
- location=[lat, lon],
142
- popup=folium.Popup(popup_content, max_width=300),
143
- tooltip=place_name,
144
- ).add_to(m)
145
-
146
- all_coords.append([lat, lon])
147
-
148
- # If we have coordinates, fit the map bounds to include all points
149
- if all_coords:
150
- m.fit_bounds(all_coords)
151
-
152
- return m
153
 
154
- # Processing Functions
155
- def process_excel(file, places_column):
156
- # Check if file is None
157
- if file is None:
158
- return None, "No file uploaded", None
159
 
160
- try:
161
- # Handle various file object types that Gradio might provide
162
- if hasattr(file, 'name'):
163
- # Gradio file object
164
- df = pd.read_excel(file.name)
165
- elif isinstance(file, bytes):
166
- # Raw bytes
167
- df = pd.read_excel(io.BytesIO(file))
168
- else:
169
- # Assume it's a filepath string
170
- df = pd.read_excel(file)
171
-
172
- # Print column names for debugging
173
- print(f"Columns in Excel file: {list(df.columns)}")
174
- print(f"Preview of data:\n{df.head(2)}")
175
-
176
- if places_column not in df.columns:
177
- return None, f"Column '{places_column}' not found in the Excel file. Available columns: {', '.join(df.columns)}", None
178
-
179
- # Initialize the geocoding service
180
- geocoder = GeocodingService(user_agent="gradio_map_visualization_app")
181
-
182
- # Process locations and add coordinates
183
- print(f"Processing locations from column: {places_column}")
184
- print(f"First few values: {df[places_column].head().tolist()}")
185
-
186
- # Apply geocoding to each row
187
- df['coordinates'] = df[places_column].apply(geocoder.process_locations)
188
-
189
- # Create the map
190
- map_obj = create_location_map(df, coordinates_col='coordinates', places_col=places_column)
191
-
192
- # Save the map to a temporary HTML file
193
- temp_map_path = "temp_map.html"
194
- map_obj.save(temp_map_path)
195
-
196
- # Save the processed DataFrame to Excel
197
- processed_file_path = "processed_data.xlsx"
198
- df.to_excel(processed_file_path, index=False)
199
-
200
- # Statistics
201
- total_locations = len(df)
202
- successful_geocodes = sum(1 for row in df['coordinates'] for coord in row if coord is not None)
203
- failed_geocodes = sum(1 for row in df['coordinates'] for coord in row if coord is None)
204
-
205
- stats = f"Total data rows: {total_locations}\n"
206
- stats += f"Successfully geocoded locations: {successful_geocodes}\n"
207
- stats += f"Failed to geocode locations: {failed_geocodes}"
208
-
209
- return temp_map_path, stats, processed_file_path
210
- except Exception as e:
211
- import traceback
212
- trace = traceback.format_exc()
213
- print(f"Error processing file: {e}\n{trace}")
214
- return None, f"Error processing file: {str(e)}", None
215
 
216
- # NuExtract Functions
217
- def extract_info(template, text):
 
 
 
 
 
218
  try:
219
- # Format prompt according to NuExtract-1.5 requirements
220
- prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"
221
-
222
- # Call API
223
- payload = {
224
- "inputs": prompt,
225
- "parameters": {
226
- "max_new_tokens": 1000,
227
- "do_sample": False
228
- }
229
- }
230
-
231
- response = requests.post(API_URL, headers=headers, json=payload)
232
-
233
- # If the model is loading, inform the user
234
- if response.status_code == 503:
235
- response_json = response.json()
236
- if "error" in response_json and "loading" in response_json["error"]:
237
- estimated_time = response_json.get("estimated_time", "unknown")
238
- return f"⏳ Model is loading (ETA: {int(float(estimated_time)) if isinstance(estimated_time, (int, float, str)) else 'unknown'} seconds)", "Please try again in a few minutes"
239
-
240
- if response.status_code != 200:
241
- return f" API Error: {response.status_code}", response.text
 
 
242
 
243
- # Process result
244
- result = response.json()
 
 
 
245
 
246
- # Handle different response formats
247
- try:
248
- if isinstance(result, list):
249
- if len(result) > 0:
250
- result_text = result[0].get("generated_text", "")
251
- else:
252
- return "❌ Empty result list", "{}"
253
- else:
254
- result_text = str(result)
255
-
256
- # Split at output marker if present
257
- if "<|output|>" in result_text:
258
- parts = result_text.split("<|output|>")
259
- if len(parts) > 1:
260
- json_text = parts[1].strip()
261
- else:
262
- json_text = result_text
263
- else:
264
- json_text = result_text
265
-
266
- # Try to parse as JSON
267
- try:
268
- extracted = json.loads(json_text)
269
- formatted = json.dumps(extracted, indent=2)
270
- except json.JSONDecodeError:
271
- return "❌ JSON parsing error", json_text
272
-
273
- return "✅ Success", formatted
274
- except Exception as inner_e:
275
- return f"❌ Error processing result: {str(inner_e)}", "{}"
276
  except Exception as e:
277
- return f" Error: {str(e)}", "{}"
278
-
279
- # Create the Gradio interface
280
- with gr.Blocks() as demo:
 
 
 
 
 
281
  gr.Markdown("# Historical Data Analysis Tools")
282
 
283
- with gr.Tabs():
284
- with gr.TabItem("Text Extraction"):
285
- gr.Markdown("## NuExtract-1.5 Structured Data Extraction")
286
-
287
- with gr.Row():
288
- with gr.Column():
289
- template = gr.Textbox(
290
- label="JSON Template",
291
- value='{"earthquake location": "", "dateline location": ""}',
292
- lines=5
293
- )
294
- text = gr.Textbox(
295
- label="Text to Extract From",
296
- value="Neues Erdbeben in Japan. Aus Tokio wird berichtet, daß in Yokohama bei einem Erdbeben sechs Personen getötet und 22 verwundet, in Tokio vier getötet und 22 verwundet wurden. In Yokohama seien 6VV Häuser zerstört worden. Die telephonische und telegraphische Verbindung zwischen Tokio und Osaka ist unterbrochen worden. Der Trambahnverkehr in Tokio liegt still. Auch der Eisenbahnverkehr zwischen Tokio und Yokohama ist unterbrochen. In Sngamo, einer Vorstadt von Tokio sind Brände ausgebrochen. Ein Eisenbahnzug stürzte in den Vajugawafluß zwischen Gotemba und Tokio. Sechs Züge wurden umgeworfen. Mit dem letzten japanischen Erdbeben sind seit eineinhalb Jahrtausenden bis heute in Japan 229 größere Erdbeben zu verzeichnen gewesen.",
297
- lines=8
298
- )
299
- extract_btn = gr.Button("Extract Information", variant="primary")
 
 
300
 
301
- with gr.Column():
302
- status = gr.Textbox(label="Status")
303
- output = gr.Textbox(label="Output", lines=10)
304
-
305
- extract_btn.click(
306
- fn=extract_info,
307
- inputs=[template, text],
308
- outputs=[status, output]
309
- )
310
-
311
- with gr.TabItem("Geocoding & Mapping"):
312
- gr.Markdown("## Location Mapping Tool")
313
-
314
- with gr.Row():
315
- with gr.Column():
316
- excel_file = gr.File(label="Upload Excel File")
317
- places_column = gr.Textbox(label="Places Column Name", value="places")
318
- process_btn = gr.Button("Process and Map", variant="primary")
319
-
320
- with gr.Column():
321
- map_output = gr.HTML(label="Map Visualization")
322
- stats_output = gr.Textbox(label="Statistics", lines=3)
323
- processed_file = gr.File(label="Processed Data", visible=True, interactive=False)
324
-
325
- def process_and_map(file, column):
326
- if file is None:
327
- return None, "Please upload an Excel file", None
328
-
329
- try:
330
- map_path, stats, processed_path = process_excel(file, column)
331
-
332
- if map_path and processed_path:
333
- with open(map_path, "r") as f:
334
- map_html = f.read()
335
-
336
- return map_html, stats, processed_path
337
- else:
338
- return None, stats, None
339
- except Exception as e:
340
- import traceback
341
- trace = traceback.format_exc()
342
- print(f"Error in process_and_map: {e}\n{trace}")
343
- return None, f"Error: {str(e)}", None
344
-
345
- process_btn.click(
346
- fn=process_and_map,
347
- inputs=[excel_file, places_column],
348
- outputs=[map_output, stats_output, processed_file]
349
- )
350
 
 
351
  if __name__ == "__main__":
352
- demo.launch()
 
 
 
 
 
1
  import gradio as gr
 
 
 
2
  import pandas as pd
3
  import folium
4
  from geopy.geocoders import Nominatim
5
+ from geopy.extra.rate_limiter import RateLimiter
6
+ import tempfile
7
  import time
8
+ from typing import Optional, Tuple
9
+ import warnings
 
10
 
11
+ # Suppress geopy warnings
12
+ warnings.filterwarnings("ignore", category=UserWarning, module="geopy")
 
 
 
 
 
 
 
13
 
14
+ # Configure geocoder
15
class Geocoder:
    """Nominatim-backed geocoder with rate limiting and an in-memory cache."""

    def __init__(self):
        self.geolocator = Nominatim(
            user_agent="historical_data_mapper",
            timeout=10
        )
        # Throttle lookups to respect Nominatim's usage policy (~1 req/s),
        # retrying a couple of times with a pause after errors.
        self.geocode = RateLimiter(
            self.geolocator.geocode,
            min_delay_seconds=1,
            max_retries=2,
            error_wait_seconds=5
        )
        self.cache = {}  # location string -> (lat, lon) tuple, or None for failures

    def get_coordinates(self, location: str) -> Optional[Tuple[float, float]]:
        """Return (latitude, longitude) for *location*, or None when it cannot be resolved."""
        # Reject empty / missing values up front.
        if not location or pd.isna(location):
            return None

        # Serve repeated queries — including previously failed ones — from the cache.
        if location in self.cache:
            return self.cache[location]

        try:
            result = self.geocode(location)
            if result:
                point = (result.latitude, result.longitude)
                self.cache[location] = point
                return point
        except Exception as e:
            # Best-effort: log the failure and fall through to the miss path.
            print(f"Geocoding error for '{location}': {str(e)}")

        # Cache the failure so we never re-query a known-bad location.
        self.cache[location] = None
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
def create_interactive_map(df: pd.DataFrame, location_column: str) -> str:
    """Create a folium map with all valid locations"""
    geocoder = Geocoder()

    # Geocode each distinct location once; keep only the resolvable ones.
    resolved = []
    for place in df[location_column].dropna().unique():
        point = geocoder.get_coordinates(str(place))
        if point:
            resolved.append((place, point))

    if not resolved:
        return "<div style='color:red;text-align:center'>No valid locations found</div>"

    # Centre the initial view on the first resolved place.
    fmap = folium.Map(
        location=resolved[0][1],
        zoom_start=5,
        tiles="CartoDB positron",
        control_scale=True
    )

    # Drop one marker per place, using the place name as popup and label.
    for place, point in resolved:
        folium.Marker(
            location=point,
            popup=folium.Popup(place, max_width=300),
            icon=folium.Icon(color="blue", icon="info-sign")
        ).add_to(fmap)

    # With several markers, zoom so every one is in view.
    if len(resolved) > 1:
        fmap.fit_bounds([point for _, point in resolved])

    # Inline HTML representation, suitable for a Gradio HTML component.
    return fmap._repr_html_()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
def process_data(file_obj, column_name: str):
    """Process an uploaded Excel file: geocode its locations and build a map.

    Parameters
    ----------
    file_obj : Gradio file wrapper (has ``.name``) or plain filepath string
        The uploaded Excel workbook.
    column_name : str
        Name of the column that holds location strings.

    Returns
    -------
    tuple
        (map HTML wrapped in a sizing div or None,
         statistics / status message,
         path to the processed .xlsx copy or None)
    """
    start_time = time.time()

    if not file_obj:
        return None, "Please upload a file", None

    try:
        # Gradio may hand us a wrapper object with a .name path (type="file")
        # or a plain filepath string (type="filepath") — accept both.
        source = getattr(file_obj, "name", file_obj)
        df = pd.read_excel(source)

        # Validate column exists
        if column_name not in df.columns:
            return None, f"Column '{column_name}' not found in data", None

        # Create map
        map_html = create_interactive_map(df, column_name)

        # Write the processed copy to a temp path. Close the handle before
        # pandas writes: keeping NamedTemporaryFile open while writing to its
        # path fails on Windows and leaks the open handle.
        tmp_file = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
        tmp_file.close()
        processed_path = tmp_file.name
        df.to_excel(processed_path, index=False)

        # Generate statistics
        total_rows = len(df)
        unique_locations = df[column_name].nunique()
        processing_time = round(time.time() - start_time, 2)

        stats = (
            f"Total rows processed: {total_rows}\n"
            f"Unique locations found: {unique_locations}\n"
            f"Processing time: {processing_time}s"
        )

        return (
            f"<div style='width:100%; height:65vh'>{map_html}</div>",
            stats,
            processed_path
        )

    except Exception as e:
        # Surface any failure (bad file, geocoding, write error) as a status
        # message rather than crashing the UI.
        error_msg = f"Error processing file: {str(e)}"
        print(error_msg)
        return None, error_msg, None
131
+
132
+ # Gradio Interface
133
with gr.Blocks(
    title="Historical Data Mapper",
    theme=gr.themes.Soft()
) as app:
    gr.Markdown("# Historical Data Analysis Tools")

    with gr.Tab("Location Mapping"):
        gr.Markdown("### Geocode and visualize location data from Excel files")

        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=1):
                file_input = gr.File(
                    label="Upload Excel File",
                    type="file",
                    file_types=[".xlsx", ".xls"]
                )
                column_input = gr.Textbox(
                    label="Location Column Name",
                    value="dateline_locations",
                    placeholder="Enter the column containing location names"
                )
                process_btn = gr.Button(
                    "Process and Map",
                    variant="primary"
                )

            # Right column: outputs.
            with gr.Column(scale=2):
                map_display = gr.HTML(
                    label="Interactive Map",
                    value="<div style='text-align:center;padding:20px;'>"
                          "Map will appear here after processing"
                          "</div>"
                )
                stats_output = gr.Textbox(
                    label="Processing Statistics",
                    interactive=False
                )
                # Fix: must be visible, otherwise the processed workbook
                # returned by process_data can never be downloaded (it was
                # created visible=False and never toggled).
                download_output = gr.File(
                    label="Download Processed Data",
                    visible=True
                )

        # Configure button action
        process_btn.click(
            fn=process_data,
            inputs=[file_input, column_input],
            outputs=[map_display, stats_output, download_output]
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
+ # Launch settings
183
if __name__ == "__main__":
    # Serve on all interfaces at the standard Gradio port, without a
    # public share link.
    app.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )