milwright committed
Commit 6c31eb1 · Parent(s): 525ef5c

Ensure deployment package mirrors preview functionality

- Enhanced URL fetching with domain validation and improved headers
- Added comprehensive error handling matching preview behavior
- Updated message format handling for modern and legacy compatibility (sketched below)
- Improved web search fallback logic and error messages
- Enhanced conversation export supporting both message formats
- Added missing regex import and enhanced URL extraction
- Smart content truncation with sentence boundary detection
- Better API response validation and empty content detection

Deployment templates now provide identical functionality to the preview sandbox
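
The message-format compatibility above boils down to one normalization loop. A minimal standalone sketch, assuming plain Python with single braces (inside SPACE_TEMPLATE the deployed code doubles them, and normalize_history is a hypothetical name for logic the template inlines in generate_response):

# Sketch: normalize both history formats into API messages
def normalize_history(history):
    messages = []
    for chat in history:
        if isinstance(chat, dict):
            # Modern format: {"role": "user", "content": "..."}
            messages.append(chat)
        elif isinstance(chat, (list, tuple)) and len(chat) >= 2:
            # Legacy format: ("user msg", "assistant msg")
            user_msg, assistant_msg = chat[0], chat[1]
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
    return messages

legacy = [("Hi", "Hello!")]
modern = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
assert normalize_history(legacy) == normalize_history(modern)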

Files changed (1)
  app.py +124 -35
app.py CHANGED
@@ -6,6 +6,11 @@ import json
 import zipfile
 import io
 import os
+# Set environment variables early to prevent multiprocessing issues with RAG
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
 from datetime import datetime
 from dotenv import load_dotenv
 import requests
@@ -131,6 +136,7 @@ SPACE_TEMPLATE = '''import gradio as gr
 import os
 import requests
 import json
+import re
 from bs4 import BeautifulSoup
 from datetime import datetime
 import tempfile
@@ -181,38 +187,84 @@ def validate_api_key():
 # Validate on startup
 API_KEY_VALID = validate_api_key()
 
+def validate_url_domain(url):
+    """Basic URL domain validation"""
+    try:
+        from urllib.parse import urlparse
+        parsed = urlparse(url)
+        # Check for valid domain structure
+        if parsed.netloc and '.' in parsed.netloc:
+            return True
+    except:
+        pass
+    return False
+
 def fetch_url_content(url):
-    """Fetch and extract text content from a URL using requests and BeautifulSoup"""
+    """Enhanced URL content fetching with improved compatibility and error handling"""
+    if not validate_url_domain(url):
+        return f"Invalid URL format: {{url}}"
+
     try:
-        response = requests.get(url, timeout=10, headers={{'User-Agent': 'Mozilla/5.0'}})
+        # Enhanced headers for better compatibility
+        headers = {{
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive'
+        }}
+
+        response = requests.get(url, timeout=15, headers=headers)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
 
-        # Remove script and style elements
-        for script in soup(["script", "style", "nav", "header", "footer"]):
-            script.decompose()
+        # Enhanced content cleaning
+        for element in soup(["script", "style", "nav", "header", "footer", "aside", "form", "button"]):
+            element.decompose()
 
-        # Get text content
-        text = soup.get_text()
+        # Extract main content preferentially
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=lambda x: bool(x and 'content' in x.lower())) or soup
+        text = main_content.get_text()
 
-        # Clean up whitespace
+        # Enhanced text cleaning
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
+        text = ' '.join(chunk for chunk in chunks if chunk and len(chunk) > 2)
 
-        # Truncate to ~4000 characters
+        # Smart truncation - try to end at sentence boundaries
         if len(text) > 4000:
-            text = text[:4000] + "..."
+            truncated = text[:4000]
+            last_period = truncated.rfind('.')
+            if last_period > 3000:  # If we can find a reasonable sentence break
+                text = truncated[:last_period + 1]
+            else:
+                text = truncated + "..."
 
-        return text
-    except Exception as e:
+        return text if text.strip() else "No readable content found at this URL"
+
+    except requests.exceptions.Timeout:
+        return f"Timeout error fetching {{url}} (15s limit exceeded)"
+    except requests.exceptions.RequestException as e:
         return f"Error fetching {{url}}: {{str(e)}}"
+    except Exception as e:
+        return f"Error processing content from {{url}}: {{str(e)}}"
 
 def extract_urls_from_text(text):
-    """Extract URLs from text using regex"""
+    """Extract URLs from text using regex with enhanced validation"""
     import re
-    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
-    return re.findall(url_pattern, text)
+    url_pattern = r'https?://[^\\s<>"{{}}|\\\\^`\\[\\]"]+'
+    urls = re.findall(url_pattern, text)
+
+    # Basic URL validation and cleanup
+    validated_urls = []
+    for url in urls:
+        # Remove trailing punctuation that might be captured
+        url = url.rstrip('.,!?;:')
+        # Basic domain validation
+        if '.' in url and len(url) > 10:
+            validated_urls.append(url)
+
+    return validated_urls
 
 # Global cache for URL content to avoid re-crawling in generated spaces
 _url_content_cache = {{}}
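
Two behaviors in this hunk are easy to sanity-check in isolation. A standalone sketch with single braces (the template doubles them); truncate_at_sentence is a hypothetical name for the truncation logic the hunk inlines in fetch_url_content:

import re

def extract_urls_from_text(text):
    # Same pattern as above, unescaped for plain Python
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]"]+'
    urls = re.findall(url_pattern, text)
    # Strip trailing punctuation, then drop implausibly short matches
    cleaned = (u.rstrip('.,!?;:') for u in urls)
    return [u for u in cleaned if '.' in u and len(u) > 10]

def truncate_at_sentence(text, limit=4000, floor=3000):
    # Prefer ending at a sentence boundary if one falls late in the window
    if len(text) <= limit:
        return text
    truncated = text[:limit]
    last_period = truncated.rfind('.')
    return truncated[:last_period + 1] if last_period > floor else truncated + "..."

print(extract_urls_from_text("See https://example.com/docs. Then retry."))
# ['https://example.com/docs']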
@@ -256,15 +308,25 @@ Generated on: {{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}}
 
 """
 
+    message_pair_count = 0
     for i, message in enumerate(conversation_history):
        if isinstance(message, dict):
            role = message.get('role', 'unknown')
            content = message.get('content', '')
 
            if role == 'user':
-                markdown_content += f"## User Message {{(i//2) + 1}}\\n\\n{{content}}\\n\\n"
+                message_pair_count += 1
+                markdown_content += f"## User Message {{message_pair_count}}\\n\\n{{content}}\\n\\n"
            elif role == 'assistant':
-                markdown_content += f"## Assistant Response {{(i//2) + 1}}\\n\\n{{content}}\\n\\n---\\n\\n"
+                markdown_content += f"## Assistant Response {{message_pair_count}}\\n\\n{{content}}\\n\\n---\\n\\n"
+        elif isinstance(message, (list, tuple)) and len(message) >= 2:
+            # Handle legacy tuple format: ["user msg", "assistant msg"]
+            message_pair_count += 1
+            user_msg, assistant_msg = message[0], message[1]
+            if user_msg:
+                markdown_content += f"## User Message {{message_pair_count}}\\n\\n{{user_msg}}\\n\\n"
+            if assistant_msg:
+                markdown_content += f"## Assistant Response {{message_pair_count}}\\n\\n{{assistant_msg}}\\n\\n---\\n\\n"
 
     return markdown_content
@@ -447,14 +509,16 @@ def generate_response(message, history):
 
                grounding_context += f"\\n\\nWeb search results for '{{search_query}}':\\n{{search_result}}"
            except Exception as e:
-                # Fallback to URL extraction if web search fails
+                # Enhanced fallback with better error handling
                urls = extract_urls_from_text(search_query)
                if urls:
+                    fallback_results = []
                    for url in urls[:2]:  # Limit to 2 URLs for fallback
                        content = fetch_url_content(url)
-                        grounding_context += f"\\n\\nFallback content from {{url}}:\\n{{content[:500]}}..."
+                        fallback_results.append(f"Content from {{url}}:\\n{{content[:500]}}...")
+                    grounding_context += f"\\n\\nWeb search fallback for '{{search_query}}':\\n" + "\\n\\n".join(fallback_results)
                else:
-                    grounding_context += f"\\n\\nWeb search requested: {{search_query}} (external search not available)"
+                    grounding_context += f"\\n\\nWeb search requested for '{{search_query}}' but search functionality is unavailable"
 
    # Build enhanced system prompt with grounding context
    enhanced_system_prompt = SYSTEM_PROMPT + grounding_context
@@ -462,17 +526,18 @@ def generate_response(message, history):
    # Build messages array for the API
    messages = [{{"role": "system", "content": enhanced_system_prompt}}]
 
-    # Add conversation history - compatible with Gradio 5.x format
+    # Add conversation history - handle both modern messages format and legacy tuples
    for chat in history:
        if isinstance(chat, dict):
-            # New format: {{"role": "user", "content": "..."}} or {{"role": "assistant", "content": "..."}}
+            # Modern format: {{"role": "user", "content": "..."}} or {{"role": "assistant", "content": "..."}}
            messages.append(chat)
-        else:
-            # Legacy format: ("user msg", "bot msg")
-            user_msg, bot_msg = chat
-            messages.append({{"role": "user", "content": user_msg}})
-            if bot_msg:
-                messages.append({{"role": "assistant", "content": bot_msg}})
+        elif isinstance(chat, (list, tuple)) and len(chat) >= 2:
+            # Legacy format: ["user msg", "assistant msg"] or ("user msg", "assistant msg")
+            user_msg, assistant_msg = chat[0], chat[1]
+            if user_msg:
+                messages.append({{"role": "user", "content": user_msg}})
+            if assistant_msg:
+                messages.append({{"role": "assistant", "content": assistant_msg}})
 
    # Add current message
    messages.append({{"role": "user", "content": message}})
@@ -503,10 +568,33 @@ def generate_response(message, history):
        print(f"📡 API Response: {{response.status_code}}")
 
        if response.status_code == 200:
-            result = response.json()
-            content = result['choices'][0]['message']['content']
-            print(f"✅ API request successful")
-            return content
+            try:
+                result = response.json()
+
+                # Enhanced validation of API response structure
+                if 'choices' not in result or not result['choices']:
+                    print(f"⚠️ API response missing choices: {{result}}")
+                    return "API Error: No response choices available"
+                elif 'message' not in result['choices'][0]:
+                    print(f"⚠️ API response missing message: {{result}}")
+                    return "API Error: No message in response"
+                elif 'content' not in result['choices'][0]['message']:
+                    print(f"⚠️ API response missing content: {{result}}")
+                    return "API Error: No content in message"
+                else:
+                    content = result['choices'][0]['message']['content']
+
+                    # Check for empty content
+                    if not content or content.strip() == "":
+                        print(f"⚠️ API returned empty content")
+                        return "API Error: Empty response content"
+
+                    print(f"✅ API request successful")
+                    return content
+
+            except (KeyError, IndexError, json.JSONDecodeError) as e:
+                print(f"❌ Failed to parse API response: {{str(e)}}")
+                return f"API Error: Failed to parse response - {{str(e)}}"
        elif response.status_code == 401:
            error_msg = f"🔐 **Authentication Error**\\n\\n"
            error_msg += f"Your API key appears to be invalid or expired.\\n\\n"
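
The shape checks in this hunk can be exercised against canned payloads. A quick standalone sketch (single braces; extract_content is a hypothetical name for the checks the hunk inlines):

def extract_content(result):
    # Mirror the validation order above: choices -> message -> content -> non-empty
    if 'choices' not in result or not result['choices']:
        return None, "API Error: No response choices available"
    message = result['choices'][0].get('message')
    if not message or 'content' not in message:
        return None, "API Error: No message content in response"
    content = message['content']
    if not content or not content.strip():
        return None, "API Error: Empty response content"
    return content, None

print(extract_content({"choices": []}))
# (None, 'API Error: No response choices available')
print(extract_content({"choices": [{"message": {"content": " "}}]}))
# (None, 'API Error: Empty response content')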
@@ -677,7 +765,8 @@ with gr.Blocks(title=SPACE_NAME) as demo:
        fn=protected_generate_response,
        title="",  # Title already shown above
        description="",  # Description already shown above
-        examples=None
+        examples=None,
+        type="messages"  # Use modern message format for better compatibility
    )
 
    # Export functionality
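
A note on the type="messages" switch: in recent Gradio releases, gr.ChatInterface with type="messages" delivers history as role/content dicts rather than tuples, so the legacy-tuple branches above become a safety net rather than the common path, e.g.:

# history received with type="messages" (illustrative values):
# [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]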
@@ -1945,7 +2034,7 @@ with gr.Blocks(
        # State to store RAG tool
        rag_tool_state = gr.State(None)
 
-        with gr.Accordion("URL Grounding (Optional)", open=False):
+        with gr.Accordion("URL Grounding (Optional)", open=True):
            gr.Markdown("Add URLs to provide context. Content will be fetched and added to the system prompt.")
 
            # Initial URL fields