Ensure deployment package mirrors preview functionality
- Enhanced URL fetching with domain validation and improved headers
- Added comprehensive error handling matching preview behavior
- Updated message format handling for modern and legacy compatibility
- Improved web search fallback logic and error messages
- Enhanced conversation export supporting both message formats
- Added missing regex import and enhanced URL extraction
- Smart content truncation with sentence boundary detection
- Better API response validation and empty content detection
Deployment templates now provide identical functionality to the preview sandbox
app.py
CHANGED
@@ -6,6 +6,11 @@ import json
 import zipfile
 import io
 import os
+# Set environment variables early to prevent multiprocessing issues with RAG
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
 from datetime import datetime
 from dotenv import load_dotenv
 import requests
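Placing these assignments before the remaining imports is deliberate: the OpenMP/MKL runtimes read their thread limits once, when the numeric stack first initializes, so the variables must be exported before any import that loads those libraries. A minimal sketch of the pattern, with numpy standing in for the app's RAG dependencies (the specific import is illustrative, not from the commit):

import os

# Export limits before heavyweight imports; the OpenMP/MKL runtimes read
# these once at initialization and ignore later changes.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

import numpy as np  # stand-in: first import that initializes the BLAS runtime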
@@ -131,6 +136,7 @@ SPACE_TEMPLATE = '''import gradio as gr
 import os
 import requests
 import json
+import re
 from bs4 import BeautifulSoup
 from datetime import datetime
 import tempfile
@@ -181,38 +187,84 @@ def validate_api_key():
 # Validate on startup
 API_KEY_VALID = validate_api_key()
 
+def validate_url_domain(url):
+    """Basic URL domain validation"""
+    try:
+        from urllib.parse import urlparse
+        parsed = urlparse(url)
+        # Check for valid domain structure
+        if parsed.netloc and '.' in parsed.netloc:
+            return True
+    except:
+        pass
+    return False
+
 def fetch_url_content(url):
+    """Enhanced URL content fetching with improved compatibility and error handling"""
+    if not validate_url_domain(url):
+        return f"Invalid URL format: {{url}}"
+
     try:
+        # Enhanced headers for better compatibility
+        headers = {{
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive'
+        }}
+
+        response = requests.get(url, timeout=15, headers=headers)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
 
+        # Enhanced content cleaning
+        for element in soup(["script", "style", "nav", "header", "footer", "aside", "form", "button"]):
+            element.decompose()
 
+        # Extract main content preferentially
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=lambda x: bool(x and 'content' in x.lower())) or soup
+        text = main_content.get_text()
 
+        # Enhanced text cleaning
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
+        text = ' '.join(chunk for chunk in chunks if chunk and len(chunk) > 2)
 
+        # Smart truncation - try to end at sentence boundaries
         if len(text) > 4000:
+            truncated = text[:4000]
+            last_period = truncated.rfind('.')
+            if last_period > 3000:  # If we can find a reasonable sentence break
+                text = truncated[:last_period + 1]
+            else:
+                text = truncated + "..."
 
-        return text
+        return text if text.strip() else "No readable content found at this URL"
+
+    except requests.exceptions.Timeout:
+        return f"Timeout error fetching {{url}} (15s limit exceeded)"
+    except requests.exceptions.RequestException as e:
         return f"Error fetching {{url}}: {{str(e)}}"
+    except Exception as e:
+        return f"Error processing content from {{url}}: {{str(e)}}"
 
 def extract_urls_from_text(text):
-    """Extract URLs from text using regex"""
+    """Extract URLs from text using regex with enhanced validation"""
    import re
+    url_pattern = r'https?://[^\\s<>"{{}}|\\\\^`\\[\\]"]+'
+    urls = re.findall(url_pattern, text)
+
+    # Basic URL validation and cleanup
+    validated_urls = []
+    for url in urls:
+        # Remove trailing punctuation that might be captured
+        url = url.rstrip('.,!?;:')
+        # Basic domain validation
+        if '.' in url and len(url) > 10:
+            validated_urls.append(url)
+
+    return validated_urls
 
 # Global cache for URL content to avoid re-crawling in generated spaces
 _url_content_cache = {{}}
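Because these functions live inside SPACE_TEMPLATE, braces are doubled ({{...}}) and backslashes escaped; in the generated app.py they become ordinary f-strings and regex literals. Two quick sanity checks of the resolved logic (sample inputs are made up):

import re

# 1. The URL pattern plus trailing-punctuation cleanup, as rendered in the
#    generated file (braces un-doubled, escapes resolved).
url_pattern = r'https?://[^\s<>"{}|\\^`\[\]"]+'
sample = "See https://example.com/docs, then http://foo.bar/x."
urls = [u.rstrip('.,!?;:') for u in re.findall(url_pattern, sample)]
assert urls == ['https://example.com/docs', 'http://foo.bar/x']

# 2. Sentence-boundary truncation: prefer the last period after char 3000.
text = "A sentence that ends here. " * 200   # long enough to force truncation
if len(text) > 4000:
    truncated = text[:4000]
    last_period = truncated.rfind('.')
    text = truncated[:last_period + 1] if last_period > 3000 else truncated + "..."
assert len(text) <= 4000 and text.endswith('.')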
@@ -256,15 +308,25 @@ Generated on: {{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}}
 
 """
 
+    message_pair_count = 0
     for i, message in enumerate(conversation_history):
         if isinstance(message, dict):
             role = message.get('role', 'unknown')
             content = message.get('content', '')
 
             if role == 'user':
+                message_pair_count += 1
+                markdown_content += f"## User Message {{message_pair_count}}\\n\\n{{content}}\\n\\n"
             elif role == 'assistant':
+                markdown_content += f"## Assistant Response {{message_pair_count}}\\n\\n{{content}}\\n\\n---\\n\\n"
+        elif isinstance(message, (list, tuple)) and len(message) >= 2:
+            # Handle legacy tuple format: ["user msg", "assistant msg"]
+            message_pair_count += 1
+            user_msg, assistant_msg = message[0], message[1]
+            if user_msg:
+                markdown_content += f"## User Message {{message_pair_count}}\\n\\n{{user_msg}}\\n\\n"
+            if assistant_msg:
+                markdown_content += f"## Assistant Response {{message_pair_count}}\\n\\n{{assistant_msg}}\\n\\n---\\n\\n"
 
     return markdown_content
 
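For a short history in either shape, the exporter now emits numbered sections roughly like the following (illustrative content; the escaped \\n sequences in the template render as real newlines in the generated file):

## User Message 1

Hi

## Assistant Response 1

Hello!

---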
@@ -447,14 +509,16 @@ def generate_response(message, history):
 
             grounding_context += f"\\n\\nWeb search results for '{{search_query}}':\\n{{search_result}}"
         except Exception as e:
+            # Enhanced fallback with better error handling
             urls = extract_urls_from_text(search_query)
             if urls:
+                fallback_results = []
                 for url in urls[:2]:  # Limit to 2 URLs for fallback
                     content = fetch_url_content(url)
+                    fallback_results.append(f"Content from {{url}}:\\n{{content[:500]}}...")
+                grounding_context += f"\\n\\nWeb search fallback for '{{search_query}}':\\n" + "\\n\\n".join(fallback_results)
             else:
+                grounding_context += f"\\n\\nWeb search requested for '{{search_query}}' but search functionality is unavailable"
 
     # Build enhanced system prompt with grounding context
     enhanced_system_prompt = SYSTEM_PROMPT + grounding_context
@@ -462,17 +526,18 @@ def generate_response(message, history):
     # Build messages array for the API
     messages = [{{"role": "system", "content": enhanced_system_prompt}}]
 
+    # Add conversation history - handle both modern messages format and legacy tuples
     for chat in history:
         if isinstance(chat, dict):
+            # Modern format: {{"role": "user", "content": "..."}} or {{"role": "assistant", "content": "..."}}
             messages.append(chat)
+        elif isinstance(chat, (list, tuple)) and len(chat) >= 2:
+            # Legacy format: ["user msg", "assistant msg"] or ("user msg", "assistant msg")
+            user_msg, assistant_msg = chat[0], chat[1]
+            if user_msg:
+                messages.append({{"role": "user", "content": user_msg}})
+            if assistant_msg:
+                messages.append({{"role": "assistant", "content": assistant_msg}})
 
     # Add current message
     messages.append({{"role": "user", "content": message}})
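To see why both branches are needed, here is the same mapping pulled out into a hypothetical standalone helper (names and sample values are illustrative, not from the commit); legacy tuple history and modern messages history yield the same API payload:

# Hypothetical helper mirroring the history-normalization branch above.
def to_messages(history, system_prompt):
    messages = [{"role": "system", "content": system_prompt}]
    for chat in history:
        if isinstance(chat, dict):
            # Modern format already matches the API schema.
            messages.append(chat)
        elif isinstance(chat, (list, tuple)) and len(chat) >= 2:
            # Legacy pair format: split into two API messages.
            user_msg, assistant_msg = chat[0], chat[1]
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
    return messages

legacy = [("What is HF?", "Hugging Face is a model hub.")]
modern = [{"role": "user", "content": "What is HF?"},
          {"role": "assistant", "content": "Hugging Face is a model hub."}]
assert to_messages(legacy, "sys") == to_messages(modern, "sys")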
@@ -503,10 +568,33 @@ def generate_response(message, history):
     print(f"📡 API Response: {{response.status_code}}")
 
     if response.status_code == 200:
+        try:
+            result = response.json()
+
+            # Enhanced validation of API response structure
+            if 'choices' not in result or not result['choices']:
+                print(f"⚠️ API response missing choices: {{result}}")
+                return "API Error: No response choices available"
+            elif 'message' not in result['choices'][0]:
+                print(f"⚠️ API response missing message: {{result}}")
+                return "API Error: No message in response"
+            elif 'content' not in result['choices'][0]['message']:
+                print(f"⚠️ API response missing content: {{result}}")
+                return "API Error: No content in message"
+            else:
+                content = result['choices'][0]['message']['content']
+
+                # Check for empty content
+                if not content or content.strip() == "":
+                    print(f"⚠️ API returned empty content")
+                    return "API Error: Empty response content"
+
+                print(f"✅ API request successful")
+                return content
+
+        except (KeyError, IndexError, json.JSONDecodeError) as e:
+            print(f"❌ Failed to parse API response: {{str(e)}}")
+            return f"API Error: Failed to parse response - {{str(e)}}"
     elif response.status_code == 401:
         error_msg = f"🔐 **Authentication Error**\\n\\n"
         error_msg += f"Your API key appears to be invalid or expired.\\n\\n"
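Condensed into a hypothetical helper, the validation chain is easy to exercise against fabricated payloads (real provider responses vary; the missing-content and empty-content branches are merged here for brevity):

# Hypothetical condensation of the validation above; payloads are fabricated.
def extract_content(result):
    if 'choices' not in result or not result['choices']:
        return "API Error: No response choices available"
    if 'message' not in result['choices'][0]:
        return "API Error: No message in response"
    content = result['choices'][0]['message'].get('content')
    if not content or not content.strip():
        return "API Error: Empty response content"
    return content

assert extract_content({}) == "API Error: No response choices available"
assert extract_content({'choices': [{'message': {'content': ' '}}]}) == "API Error: Empty response content"
assert extract_content({'choices': [{'message': {'content': 'Hello'}}]}) == 'Hello'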
@@ -677,7 +765,8 @@ with gr.Blocks(title=SPACE_NAME) as demo:
         fn=protected_generate_response,
         title="",  # Title already shown above
         description="",  # Description already shown above
-        examples=None
+        examples=None,
+        type="messages"  # Use modern message format for better compatibility
     )
 
     # Export functionality
@@ -1945,7 +2034,7 @@ with gr.Blocks(
     # State to store RAG tool
     rag_tool_state = gr.State(None)
 
+    with gr.Accordion("URL Grounding (Optional)", open=True):
         gr.Markdown("Add URLs to provide context. Content will be fetched and added to the system prompt.")
 
         # Initial URL fields