milwright committed on
Commit
6dfc79e
·
1 Parent(s): 8d22c88

Optimize v2-search with enhanced URL processing and search capabilities


- Enhanced URL validation with domain checking and smart truncation
- Improved content extraction targeting main/article elements
- Increased dynamic URL limit to 3 with better error handling
- Fixed research template integration after field simplification
- Added backward compatibility wrapper for fetch_url_content
- Extended timeout to 15s for better reliability

Files changed (2)
  1. app.py +101 -82
  2. claude.local.md +314 -0
app.py CHANGED
@@ -13,15 +13,19 @@ from support_docs import create_support_docs, export_conversation_to_markdown
 
 # Simple URL content fetching using requests and BeautifulSoup
 def get_grounding_context_simple(urls):
-    """Fetch grounding context using simple HTTP requests"""
+    """Fetch grounding context using enhanced HTTP requests"""
     if not urls:
         return ""
 
     context_parts = []
     for i, url in enumerate(urls, 1):
         if url and url.strip():
-            content = fetch_url_content(url.strip())
-            context_parts.append(f"Context from URL {i} ({url}):\n{content}")
+            # Use enhanced URL extraction for any URLs within the URL text
+            extracted_urls = extract_urls_from_text(url.strip())
+            target_url = extracted_urls[0] if extracted_urls else url.strip()
+
+            content = enhanced_fetch_url_content(target_url)
+            context_parts.append(f"Context from URL {i} ({target_url}):\n{content}")
 
     if context_parts:
         return "\n\n" + "\n\n".join(context_parts) + "\n\n"
@@ -42,9 +46,82 @@ load_dotenv()
 import re
 
 def extract_urls_from_text(text):
-    """Extract URLs from text using regex"""
+    """Extract URLs from text using regex with enhanced validation"""
     url_pattern = r'https?://[^\s<>"{}|\\^`\[\]"]+'
-    return re.findall(url_pattern, text)
+    urls = re.findall(url_pattern, text)
+
+    # Basic URL validation and cleanup
+    validated_urls = []
+    for url in urls:
+        # Remove trailing punctuation that might be captured
+        url = url.rstrip('.,!?;:')
+        # Basic domain validation
+        if '.' in url and len(url) > 10:
+            validated_urls.append(url)
+
+    return validated_urls
+
+def validate_url_domain(url):
+    """Basic URL domain validation"""
+    try:
+        from urllib.parse import urlparse
+        parsed = urlparse(url)
+        # Check for valid domain structure
+        if parsed.netloc and '.' in parsed.netloc:
+            return True
+    except:
+        pass
+    return False
+
+def enhanced_fetch_url_content(url, enable_search_validation=False):
+    """Enhanced URL content fetching with optional search validation"""
+    if not validate_url_domain(url):
+        return f"Invalid URL format: {url}"
+
+    try:
+        # Enhanced headers for better compatibility
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive'
+        }
+
+        response = requests.get(url, timeout=15, headers=headers)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Enhanced content cleaning
+        for element in soup(["script", "style", "nav", "header", "footer", "aside", "form", "button"]):
+            element.decompose()
+
+        # Extract main content preferentially
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=lambda x: x and 'content' in x.lower()) or soup
+        text = main_content.get_text()
+
+        # Enhanced text cleaning
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = ' '.join(chunk for chunk in chunks if chunk and len(chunk) > 2)
+
+        # Smart truncation - try to end at sentence boundaries
+        if len(text) > 4000:
+            truncated = text[:4000]
+            last_period = truncated.rfind('.')
+            if last_period > 3000:  # If we can find a reasonable sentence break
+                text = truncated[:last_period + 1]
+            else:
+                text = truncated + "..."
+
+        return text if text.strip() else "No readable content found at this URL"
+
+    except requests.exceptions.Timeout:
+        return f"Timeout error fetching {url} (15s limit exceeded)"
+    except requests.exceptions.RequestException as e:
+        return f"Error fetching {url}: {str(e)}"
+    except Exception as e:
+        return f"Error processing content from {url}: {str(e)}"
 
 # Template for generated space app (based on mvp_simple.py)
 SPACE_TEMPLATE = '''import gradio as gr
@@ -358,31 +435,8 @@ MODELS = [
 ]
 
 def fetch_url_content(url):
-    """Fetch and extract text content from a URL"""
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
-
-        # Get text content
-        text = soup.get_text()
-
-        # Clean up whitespace
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-
-        # Truncate to ~4000 characters
-        if len(text) > 4000:
-            text = text[:4000] + "..."
-
-        return text
-    except Exception as e:
-        return f"Error fetching {url}: {str(e)}"
+    """Fetch and extract text content from a URL - maintained for backward compatibility"""
+    return enhanced_fetch_url_content(url)
 
 def get_grounding_context(urls):
     """Fetch context from grounding URLs"""
@@ -697,35 +751,16 @@ def on_preview_combined(name, description, system_prompt, enable_research_assist
         )
 
     try:
-        # Combine system prompt components if research assistant is enabled
-        if enable_research_assistant:
-            if not role_purpose or not role_purpose.strip():
-                return (
-                    {},
-                    gr.update(value="**Error:** Please provide a Role and Purpose for the research assistant", visible=True),
-                    gr.update(visible=False),
-                    gr.update(value="Configuration will appear here after preview generation.")
-                )
-            system_prompt_parts = []
-            if role_purpose and role_purpose.strip():
-                system_prompt_parts.append(role_purpose.strip())
-            if intended_audience and intended_audience.strip():
-                system_prompt_parts.append(intended_audience.strip())
-            if key_tasks and key_tasks.strip():
-                system_prompt_parts.append(key_tasks.strip())
-            if additional_context and additional_context.strip():
-                system_prompt_parts.append(additional_context.strip())
-
-            final_system_prompt = " ".join(system_prompt_parts)
-        else:
-            if not system_prompt or not system_prompt.strip():
-                return (
-                    {},
-                    gr.update(value="**Error:** Please provide a System Prompt for the assistant", visible=True),
-                    gr.update(visible=False),
-                    gr.update(value="Configuration will appear here after preview generation.")
-                )
-            final_system_prompt = system_prompt.strip()
+        # Use the system prompt directly (research assistant toggle already updates it)
+        if not system_prompt or not system_prompt.strip():
+            return (
+                {},
+                gr.update(value="**Error:** Please provide a System Prompt for the assistant", visible=True),
+                gr.update(visible=False),
+                gr.update(value="Configuration will appear here after preview generation.")
+            )
+
+        final_system_prompt = system_prompt.strip()
 
         # Create configuration for preview
         config_data = {
@@ -881,8 +916,8 @@ def preview_chat_response(message, history, config_data, url1="", url2="", url3=
         urls_in_message = extract_urls_from_text(message)
         if urls_in_message:
             dynamic_context_parts = []
-            for url in urls_in_message[:2]:  # Limit to 2 URLs in preview
-                content = fetch_url_content(url)
+            for url in urls_in_message[:3]:  # Increased limit to 3 URLs with enhanced processing
+                content = enhanced_fetch_url_content(url)
                 dynamic_context_parts.append(f"\n\nDynamic context from {url}:\n{content}")
             if dynamic_context_parts:
                 dynamic_context = "\n".join(dynamic_context_parts)
@@ -988,27 +1023,11 @@ def on_generate(name, description, system_prompt, enable_research_assistant, mod
     if enable_vector_rag and rag_tool_state:
        rag_data = rag_tool_state.get_serialized_data()
 
-    # Combine system prompt components if research assistant is enabled
-    if enable_research_assistant:
-        # Use the research assistant fields if enabled
-        if not role_purpose or not role_purpose.strip():
-            return gr.update(value="Error: Please provide a Role and Purpose for the research assistant", visible=True), gr.update(visible=False)
-        system_prompt_parts = []
-        if role_purpose and role_purpose.strip():
-            system_prompt_parts.append(role_purpose.strip())
-        if intended_audience and intended_audience.strip():
-            system_prompt_parts.append(intended_audience.strip())
-        if key_tasks and key_tasks.strip():
-            system_prompt_parts.append(key_tasks.strip())
-        if additional_context and additional_context.strip():
-            system_prompt_parts.append(additional_context.strip())
-
-        final_system_prompt = " ".join(system_prompt_parts)
-    else:
-        # Use the direct system prompt field
-        if not system_prompt or not system_prompt.strip():
-            return gr.update(value="Error: Please provide a System Prompt for the assistant", visible=True), gr.update(visible=False)
-        final_system_prompt = system_prompt.strip()
+    # Use the system prompt directly (research assistant toggle already updates it)
+    if not system_prompt or not system_prompt.strip():
+        return gr.update(value="Error: Please provide a System Prompt for the assistant", visible=True), gr.update(visible=False)
+
+    final_system_prompt = system_prompt.strip()
 
     filename = generate_zip(name, description, final_system_prompt, model, api_key_var, temperature, max_tokens, examples_text, access_code, enable_dynamic_urls, url1, url2, url3, url4, enable_vector_rag, rag_data)
 
claude.local.md ADDED
@@ -0,0 +1,314 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

Chat UI Helper is a Gradio-based tool for generating and configuring chat interfaces for HuggingFace Spaces. It creates deployable packages with custom assistants, web scraping capabilities, and optional vector RAG functionality.

## Core Architecture

### Main Application Flow (`app.py`)
The application follows a three-tab Gradio interface pattern (sketched below):
1. **Configuration Tab**: Space setup, assistant configuration, tool settings (lines 1267-1589)
2. **Sandbox Preview Tab**: Interactive testing with real OpenRouter API integration (lines 1591-1699)
3. **Support Docs Tab**: Comprehensive guidance and templates via `support_docs.py`
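
A minimal sketch of this tab layout, assuming the standard `gr.Blocks`/`gr.Tab` composition (component names here are illustrative, not the actual variables in `app.py`):

```python
import gradio as gr

with gr.Blocks(title="Chat UI Helper") as demo:
    with gr.Tab("Configuration"):
        space_name = gr.Textbox(label="Space name")
        system_prompt = gr.Textbox(label="System prompt", lines=6)
    with gr.Tab("Sandbox Preview"):
        preview_chat = gr.Chatbot(label="Preview")
    with gr.Tab("Support Docs"):
        gr.Markdown("Guidance and templates rendered from support_docs.py")

demo.launch()
```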

### Template Generation System
- `SPACE_TEMPLATE` (lines 50-347): Complete HuggingFace Space template with export functionality and legacy tuple format compatibility
- `generate_zip()` function (lines 562-652): Orchestrates package creation with all dependencies
- Key template variables: `{system_prompt}`, `{model}`, `{enable_vector_rag}`, `{api_key_var}`, `{grounding_urls}`, `{enable_dynamic_urls}`

### Preview Sandbox Architecture (Enhanced)
- Real OpenRouter API integration in preview mode (`preview_chat_response()` line 855)
- URL context testing with dynamic add/remove functionality
- Configuration-aware responses using exact model and parameters from user configuration
- Fallback messaging when `OPENROUTER_API_KEY` environment variable not set
- Legacy tuple format compatibility for Gradio 4.44.1 ChatInterface
- **Comprehensive Debugging**: Enhanced error handling with detailed API response validation (lines 928-955, sketched below)
  - Empty response detection and logging
  - API structure validation (choices, message, content)
  - Request payload debugging for troubleshooting
  - Timeout handling (30 seconds) for API requests
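
A hedged sketch of that response validation, assuming the standard OpenRouter/OpenAI-style JSON shape (`choices[0].message.content`); the real checks live around lines 928-955 and may differ in detail:

```python
import requests

def validate_openrouter_response(response: requests.Response) -> str:
    """Return the assistant text, or a descriptive error string for debugging."""
    if response.status_code != 200:
        return f"API error {response.status_code}: {response.text[:200]}"
    data = response.json()
    # Validate the expected structure: choices -> message -> content
    choices = data.get("choices")
    if not choices:
        return f"Empty or missing 'choices' in API response: {data}"
    content = choices[0].get("message", {}).get("content", "")
    if not content.strip():
        return "Empty response content returned by the model"
    return content
```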

### Document Processing Pipeline (RAG)
- **RAGTool** (`rag_tool.py`): Main orchestrator with 10MB file size validation (lines 19-79)
- **DocumentProcessor** (`document_processor.py`): PDF/DOCX/TXT/MD parsing with semantic chunking (800 chars, 100 overlap; see the sketch below)
- **VectorStore** (`vector_store.py`): FAISS-based similarity search and base64 serialization
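
A simplified sketch of those chunking parameters (800-character windows with 100-character overlap); the actual `DocumentProcessor` presumably also respects sentence and paragraph boundaries:

```python
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 100) -> list[str]:
    """Split text into overlapping character windows for embedding."""
    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
    return chunks
```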

### Web Scraping Architecture
Simple HTTP + BeautifulSoup approach (replacing previous Crawl4AI):
- `fetch_url_content()` (lines 390-415): Basic requests with timeout and user-agent headers
- Content cleaning: Removes scripts, styles, navigation elements
- Content limits: ~4000 character truncation for context management
- URL content caching: `get_cached_grounding_context()` (line 1019) prevents redundant fetches (see the sketch below)
- `extract_urls_from_text()` (line 44): Regex-based URL extraction for dynamic fetching
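
A minimal sketch of that caching behaviour, assuming a module-level dict keyed by URL (the real `get_cached_grounding_context()` may differ in detail):

```python
url_content_cache = {}

def get_cached_grounding_context(urls):
    """Fetch each grounding URL once per session and reuse the cached result."""
    parts = []
    for i, url in enumerate(urls, 1):
        if not url or not url.strip():
            continue
        key = url.strip()
        if key not in url_content_cache:
            url_content_cache[key] = enhanced_fetch_url_content(key)
        parts.append(f"Context from URL {i} ({key}):\n{url_content_cache[key]}")
    return "\n\n" + "\n\n".join(parts) + "\n\n" if parts else ""
```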

## Development-Only Utilities

### MCP Servers
- **Gradio Docs**: Available at https://gradio-docs-mcp.hf.space/gradio_api/mcp/sse
- Use `gradio_docs.py` utility for development assistance
- **CRITICAL**: Do NOT import in main application - this is for development tooling only

Usage for development:
```bash
python -c "from gradio_docs import gradio_docs; print(gradio_docs.search_docs('ChatInterface'))"
```

## Development Commands

### Environment Setup
**Important**: This application requires Python ≥3.10 for Gradio 5.x compatibility.

```bash
# Recommended: Use Python 3.11+ environment
python3.11 -m venv venv311
source venv311/bin/activate  # or venv311\Scripts\activate on Windows
pip install -r requirements.txt
```

### Running the Application
```bash
# With virtual environment activated
python app.py
```

### Testing Commands
```bash
# Test vector database functionality (requires all RAG dependencies)
python test_vector_db.py

# Test OpenRouter API key validation
python test_api_key.py

# Test minimal Gradio functionality (for debugging)
python test_minimal.py

# Test preview functionality components (new)
python test_preview.py

# Test individual RAG components
python -c "from test_vector_db import test_document_processing; test_document_processing()"
python -c "from test_vector_db import test_vector_store; test_vector_store()"
python -c "from test_vector_db import test_rag_tool; test_rag_tool()"
```

### Pre-Test Setup for RAG Components
```bash
# Create test document for vector database testing
echo "This is a test document for RAG functionality testing." > test_document.txt

# Verify all dependencies are installed
python -c "import sentence_transformers, faiss, fitz; print('RAG dependencies available')"
```

### Key Dependencies and Versions

#### Required Dependencies
- **Gradio ≥4.44.1**: Main UI framework (5.37.0 recommended for Python ≥3.10)
- **requests ≥2.32.3**: HTTP requests for web content fetching
- **beautifulsoup4 ≥4.12.3**: HTML parsing for web scraping
- **python-dotenv ≥1.0.0**: Environment variable management

#### Optional RAG Dependencies
- **sentence-transformers ≥2.2.2**: Text embeddings
- **faiss-cpu ==1.7.4**: Vector similarity search
- **PyMuPDF ≥1.23.0**: PDF text extraction
- **python-docx ≥0.8.11**: DOCX document processing
- **numpy ==1.26.4**: Numerical operations

## Configuration Patterns

### Conditional Dependency Loading
```python
try:
    from rag_tool import RAGTool
    HAS_RAG = True
except ImportError:
    HAS_RAG = False
    RAGTool = None
```
This pattern allows graceful degradation when optional vector dependencies are unavailable.
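
For example, downstream UI wiring can key component visibility off the flag (an illustrative sketch, not the exact `app.py` code):

```python
import gradio as gr

doc_upload = gr.File(
    label="Upload documents for RAG",
    file_count="multiple",
    visible=HAS_RAG,  # hidden when the optional vector dependencies are missing
)
```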

### Template Variable Substitution
Generated spaces use these key substitutions (see the sketch below):
- `{system_prompt}`: Combined assistant configuration
- `{grounding_urls}`: Static URL list for context
- `{enable_dynamic_urls}`: Runtime URL fetching capability
- `{enable_vector_rag}`: Document search integration
- `{rag_data_json}`: Serialized embeddings and chunks
- `{api_key_var}`: Customizable API key environment variable name
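
A small sketch of how the substitution behaves under plain `str.format()` semantics: single-brace placeholders are filled in, while doubled braces survive as literal braces in the generated app. The template excerpt and values below are illustrative, not the real `SPACE_TEMPLATE`:

```python
TEMPLATE = '''import os

SYSTEM_PROMPT = """{system_prompt}"""
GROUNDING_URLS = {grounding_urls}
API_KEY = os.environ.get("{api_key_var}")
CONFIG = {{"dynamic_urls": {enable_dynamic_urls}}}  # double braces -> literal dict braces
'''

generated_app = TEMPLATE.format(
    system_prompt="You are a helpful course assistant.",
    grounding_urls=["https://example.com/syllabus"],
    api_key_var="OPENROUTER_API_KEY",
    enable_dynamic_urls=True,
)
print(generated_app)
```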

### Access Control Pattern
- Environment variable `SPACE_ACCESS_CODE` for student access control (see the sketch below)
- Global state management for session-based access in generated spaces
- Security-first approach storing credentials as HuggingFace Spaces secrets
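
A hedged sketch of that gate as it might appear in a generated space (names are illustrative):

```python
import os

ACCESS_CODE = os.environ.get("SPACE_ACCESS_CODE")  # set as a HuggingFace Spaces secret
_access_granted = False  # simple global, session-level state

def verify_access_code(code: str) -> str:
    """Unlock the chat only when the configured access code matches."""
    global _access_granted
    if not ACCESS_CODE or code == ACCESS_CODE:
        _access_granted = True
        return "Access granted."
    return "Invalid access code. Please try again."
```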

### RAG Integration Workflow
1. Documents uploaded through Gradio File component with conditional visibility (`HAS_RAG` flag)
2. Processed via DocumentProcessor (PDF/DOCX/TXT/MD support) in `process_documents()` function
3. Chunked and embedded using sentence-transformers (800 chars, 100 overlap)
4. FAISS index created and serialized to base64 for deployment portability (see the sketch after this list)
5. Embedded in generated template via `{rag_data_json}` template variable
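
A condensed sketch of step 4, assuming `faiss.serialize_index` plus base64/JSON for portability; the real `VectorStore` also carries chunk texts and metadata:

```python
import base64
import json

import faiss
import numpy as np

def serialize_index(index: faiss.Index, chunks: list[str]) -> str:
    """Pack a FAISS index and its chunks into a JSON-safe base64 payload."""
    raw = faiss.serialize_index(index)  # numpy uint8 array
    return json.dumps({
        "index": base64.b64encode(raw.tobytes()).decode("ascii"),
        "chunks": chunks,
    })

def deserialize_index(payload: str) -> tuple[faiss.Index, list[str]]:
    """Rebuild the FAISS index and chunk list inside the generated space."""
    data = json.loads(payload)
    raw = np.frombuffer(base64.b64decode(data["index"]), dtype=np.uint8)
    return faiss.deserialize_index(raw), data["chunks"]
```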

## Implementation Notes

### Research Template System (Simplified)
- **Simple Toggle**: `toggle_research_assistant()` function (line 1225) now provides simple on/off functionality
- **Direct System Prompt**: Enables a predefined academic research prompt with DOI verification and LibKey integration
- **Auto-Enable Dynamic URLs**: Research template automatically enables dynamic URL fetching for academic sources
- **Template Content**: Academic inquiry focus with DOI-verified sources, fact-checking, and proper citation requirements
- **Note**: The previous complex field system (Role and Purpose, Intended Audience, Key Tasks, Additional Context) has been removed in favor of a simplified architecture

### State Management Across Tabs
- Extensive use of `gr.State()` for maintaining session data (see the sketch below)
- Cross-tab functionality through shared state variables (`sandbox_state`, `preview_config_state`)
- URL content caching to prevent redundant web requests (`url_content_cache` global variable)
- Preview debugging with comprehensive error handling and API response validation
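
A minimal sketch of the shared-state pattern, assuming the usual `gr.State` plumbing between the Configuration and Sandbox Preview tabs (names are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    preview_config_state = gr.State({})  # shared across tabs for the session

    with gr.Tab("Configuration"):
        system_prompt = gr.Textbox(label="System prompt")
        save_btn = gr.Button("Save for preview")

    with gr.Tab("Sandbox Preview"):
        status = gr.Markdown("No configuration saved yet.")

    def save_config(prompt, state):
        state = dict(state or {})
        state["system_prompt"] = prompt
        return state, f"Previewing with a {len(prompt)}-character system prompt."

    save_btn.click(
        save_config,
        inputs=[system_prompt, preview_config_state],
        outputs=[preview_config_state, status],
    )

demo.launch()
```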

### Gradio Compatibility and Message Format Handling
- **Target Version**: Gradio 5.37.0 (requires Python ≥3.10)
- **Legacy Support**: Gradio 4.44.1 compatibility with JSON schema workarounds
- **Message Format**: Preview uses legacy tuple format `[user_msg, bot_msg]` for ChatInterface compatibility
- **Generated Spaces**: Use modern dictionary format `{"role": "user", "content": "..."}` for OpenRouter API

### Security Considerations
- Never embed API keys or access codes in generated templates
- Environment variable pattern for all sensitive configuration (`{api_key_var}` template variable)
- Input validation for uploaded files and URL processing
- Content length limits for web scraping operations

## Testing Infrastructure

### Current Test Structure
- `test_vector_db.py`: Comprehensive RAG component testing (196 lines)
- `test_api_key.py`: OpenRouter API validation (85 lines)
- `test_minimal.py`: Basic Gradio functionality debugging (20 lines)
- `test_preview.py`: Preview functionality component testing (URL extraction, fetching, chat response)

### Test Dependencies
RAG testing requires: `sentence-transformers`, `faiss-cpu`, `PyMuPDF`, `python-docx`

Core testing requires: `gradio`, `requests`, `beautifulsoup4`, `python-dotenv`

### Testing Status
- **Functional**: Three main test files covering core functionality
- **Missing**: Automated test scripts referenced in TEST_PROCEDURE.md (`quick_test.sh`, `full_test.sh`) are documented but not implemented
- **Usage**: Run individual Python test modules directly

## File Structure Notes

### Generated Space Structure
All generated HuggingFace Spaces follow a consistent structure:
1. Configuration section with environment variable loading
2. Web scraping functions (simple HTTP requests with BeautifulSoup)
3. RAG context retrieval (if enabled)
4. OpenRouter API integration with conversation history
5. Gradio ChatInterface with access control

### Development Files Not For Production
- `gradio_docs.py`: MCP server integration (development only)
- `test_*.py`: Testing utilities
- `TEST_PROCEDURE.md`: Comprehensive testing methodology
- `file_upload_proposal.md`: Technical architecture proposals

## Known Issues and Compatibility

### Gradio 4.44.1 JSON Schema Bug
- **Issue**: TypeError in `json_schema_to_python_type` prevents app startup in some environments
- **Symptom**: "argument of type 'bool' is not iterable" error during API schema generation
- **Workaround**: Individual component functions work correctly (as verified by `test_preview.py`)
- **Solution**: Upgrade to Gradio 5.x for full compatibility, or wait for a Gradio 4.x patch

### Message Format Compatibility
- **Preview Mode**: Uses legacy tuple format `[user_msg, bot_msg]` for Gradio 4.44.1 ChatInterface
- **Generated Spaces**: Use modern dictionary format for OpenRouter API compatibility
- **Cross-Version Support**: Template generation handles both formats appropriately

### Python Version Requirements
- **Minimum**: Python 3.9 (for Gradio 4.44.1)
- **Recommended**: Python 3.11+ (for Gradio 5.x and optimal performance)

## Common Claude Code Anti-Patterns to Avoid

### Message Format Reversion
**❌ Don't revert to:** the new dictionary format in preview functions
```python
# WRONG - breaks Gradio 4.44.1 ChatInterface
history.append({"role": "user", "content": message})
history.append({"role": "assistant", "content": response})
```
**✅ Keep:** legacy tuple format for preview compatibility
```python
# CORRECT - works with current Gradio ChatInterface
history.append([message, response])
```

### Template Variable Substitution
**❌ Don't change:** template string escaping patterns in `SPACE_TEMPLATE`
- Keep double backslashes: `\\n\\n` (becomes `\n\n` after Python string processing)
- Keep double braces: `{{variable}}` (becomes `{variable}` after `format()`)
- **Reason**: The template undergoes two levels of processing (Python `format()` + HuggingFace deployment)

### Research Template Function Signature
**✅ Current Implementation:** simplified function signature for the research template
```python
# CURRENT - simplified toggle with direct system prompt management
def toggle_research_assistant(enable_research):
    if enable_research:
        return (gr.update(value=combined_prompt), gr.update(value=True))
    else:
        return (gr.update(value=""), gr.update(value=False))
```
**❌ Don't revert to:** complex field management patterns that are no longer needed
- The research template no longer uses separate fields for role, audience, tasks, context
- Current implementation directly manages the system prompt and dynamic URL setting only

### Import Organization Anti-Patterns
**❌ Don't move:** `extract_urls_from_text()` back into the template string
- The function must remain in main app code (line 44) for preview functionality
- The template version is for generated spaces only

### URL Management Simplification
**❌ Don't remove:** dynamic URL add/remove functionality
- Keep `add_urls()`, `remove_urls()`, `add_chat_urls()`, `remove_chat_urls()` functions
- Maintain URL count state management with `gr.State()`
- **Reason**: Users expect a scalable URL input interface

### Preview Functionality Degradation
**❌ Don't revert to:** simple mock responses in preview
```python
# WRONG - provides no real testing value
def preview_chat_response(message, history, config_data):
    return "", history + [[message, "Mock response"]]
```
**✅ Keep:** real API integration with comprehensive debugging
- Actual OpenRouter API calls when `OPENROUTER_API_KEY` is set
- URL context fetching and processing
- Configuration-aware responses using exact user settings
- Comprehensive debugging for empty responses and API errors (lines 928-955)

### Research Template Simplification
**✅ Current Implementation:** simplified research template system
- Simple toggle functionality without complex field management
- Direct system prompt injection for academic research use cases
- Auto-enables dynamic URL fetching for academic sources
- **Reason**: The simplified architecture reduces maintenance complexity while preserving core functionality

### Conditional Dependency Loading
**❌ Don't remove:** the `HAS_RAG` flag and conditional imports
```python
# WRONG - breaks installations without vector dependencies
from rag_tool import RAGTool
```
**✅ Keep:** the graceful degradation pattern
```python
# CORRECT - allows app to work without optional dependencies
try:
    from rag_tool import RAGTool
    HAS_RAG = True
except ImportError:
    HAS_RAG = False
    RAGTool = None
```