Add vector RAG functionality as modular tool
- Create document_processor.py for parsing PDF, DOCX, TXT, MD files
- Create vector_store.py for FAISS-based embeddings management
- Create rag_tool.py for integrating RAG with chat interface
- Add file upload UI to app.py with toggle for RAG functionality
- Update SPACE_TEMPLATE to include RAG context retrieval
- Add optional vector dependencies to requirements.txt
- Follow existing enable_dynamic_urls pattern for modularity (see the usage sketch below)
- app.py +150 -8
- document_processor.py +205 -0
- rag_tool.py +208 -0
- requirements.txt +8 -1
- vector_store.py +246 -0
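
Taken together, the new modules form a pipeline: document_processor.py turns uploaded files into overlapping text chunks, vector_store.py embeds them and builds a FAISS index, and rag_tool.py wraps both behind the RAGTool facade that app.py calls. A minimal sketch of that flow (not part of the commit), assuming the optional dependencies are installed and a hypothetical sample.md file exists locally:

# Sketch: the end-to-end flow app.py drives, run locally.
# Assumes faiss-cpu and sentence-transformers are installed; "sample.md" is a made-up path.
from rag_tool import RAGTool

tool = RAGTool()
result = tool.process_uploaded_files(["sample.md"])   # parse -> chunk -> embed -> index
print(result['message'])                              # e.g. "Successfully processed 1 files into N chunks"

# Retrieval as used at chat time
print(tool.get_relevant_context("What does the document say about pricing?"))

# Serialized payload embedded into the deployment zip as RAG_DATA
rag_data = tool.get_serialized_data()
print(sorted(rag_data.keys()))  # ['chunk_ids', 'chunks', 'dimension', 'index_base64', 'model_name']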
app.py
CHANGED
@@ -7,6 +7,8 @@ from datetime import datetime
 from dotenv import load_dotenv
 import requests
 from bs4 import BeautifulSoup
+import tempfile
+from pathlib import Path
 # from scraping_service import get_grounding_context_crawl4ai, fetch_url_content_crawl4ai
 # Temporary mock functions for testing
 def get_grounding_context_crawl4ai(urls):
@@ -15,6 +17,14 @@ def get_grounding_context_crawl4ai(urls):
 def fetch_url_content_crawl4ai(url):
     return f"[Content from {url} would be fetched here]"
 
+# Import RAG components
+try:
+    from rag_tool import RAGTool
+    HAS_RAG = True
+except ImportError:
+    HAS_RAG = False
+    RAGTool = None
+
 # Load environment variables from .env file
 load_dotenv()
 
@@ -34,6 +44,8 @@ MODEL = "{model}"
 GROUNDING_URLS = {grounding_urls}
 ACCESS_CODE = "{access_code}"
 ENABLE_DYNAMIC_URLS = {enable_dynamic_urls}
+ENABLE_VECTOR_RAG = {enable_vector_rag}
+RAG_DATA = {rag_data_json}
 
 # Get API key from environment - customizable variable name
 API_KEY = os.environ.get("{api_key_var}")
@@ -108,6 +120,36 @@ def extract_urls_from_text(text):
     url_pattern = r'https?://[^\\s<>"{{}}|\\^`\\[\\]"]+'
     return re.findall(url_pattern, text)
 
+# Initialize RAG context if enabled
+if ENABLE_VECTOR_RAG and RAG_DATA:
+    try:
+        import faiss
+        import numpy as np
+        import base64
+
+        class SimpleRAGContext:
+            def __init__(self, rag_data):
+                # Deserialize FAISS index
+                index_bytes = base64.b64decode(rag_data['index_base64'])
+                self.index = faiss.deserialize_index(index_bytes)
+
+                # Restore chunks and mappings
+                self.chunks = rag_data['chunks']
+                self.chunk_ids = rag_data['chunk_ids']
+
+            def get_context(self, query, max_chunks=3):
+                """Get relevant context - simplified version"""
+                # In production, you'd compute query embedding here
+                # For now, return a simple message
+                return "\\n\\n[RAG context would be retrieved here based on similarity search]\\n\\n"
+
+        rag_context_provider = SimpleRAGContext(RAG_DATA)
+    except Exception as e:
+        print(f"Failed to initialize RAG: {{e}}")
+        rag_context_provider = None
+else:
+    rag_context_provider = None
+
 def generate_response(message, history):
     """Generate response using OpenRouter API"""
 
@@ -117,6 +159,12 @@ def generate_response(message, history):
     # Get grounding context
     grounding_context = get_grounding_context()
 
+    # Add RAG context if available
+    if ENABLE_VECTOR_RAG and rag_context_provider:
+        rag_context = rag_context_provider.get_context(message)
+        if rag_context:
+            grounding_context += rag_context
+
     # If dynamic URLs are enabled, check message for URLs to fetch
     if ENABLE_DYNAMIC_URLS:
         urls_in_message = extract_urls_from_text(message)
@@ -398,11 +446,16 @@ Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} with Chat U/I Helper
 
     return readme_content
 
-def create_requirements():
+def create_requirements(enable_vector_rag=False):
     """Generate requirements.txt"""
-
+    base_requirements = "gradio==4.44.1\nrequests==2.32.3\ncrawl4ai==0.4.245"
+
+    if enable_vector_rag:
+        base_requirements += "\nfaiss-cpu==1.7.4\nnumpy==1.24.3"
+
+    return base_requirements
 
-def generate_zip(name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code="", enable_dynamic_urls=False, url1="", url2="", url3="", url4=""):
+def generate_zip(name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code="", enable_dynamic_urls=False, url1="", url2="", url3="", url4="", enable_vector_rag=False, rag_data=None):
     """Generate deployable zip file"""
 
     # Process examples
@@ -447,13 +500,15 @@ def generate_zip(name, description, role_purpose, intended_audience, key_tasks,
         'examples': examples_json,
         'grounding_urls': json.dumps(grounding_urls),
         'access_code': access_code or "",
-        'enable_dynamic_urls': enable_dynamic_urls
+        'enable_dynamic_urls': enable_dynamic_urls,
+        'enable_vector_rag': enable_vector_rag,
+        'rag_data_json': json.dumps(rag_data) if rag_data else 'null'
     }
 
     # Generate files
     app_content = SPACE_TEMPLATE.format(**config)
     readme_content = create_readme(config)
-    requirements_content = create_requirements()
+    requirements_content = create_requirements(enable_vector_rag)
 
     # Create zip file with clean naming
     filename = f"{name.lower().replace(' ', '_').replace('-', '_')}.zip"
@@ -474,7 +529,55 @@ def generate_zip(name, description, role_purpose, intended_audience, key_tasks,
     return filename
 
 # Define callback functions outside the interface
+def toggle_rag_section(enable_rag):
+    """Toggle visibility of RAG section"""
+    return gr.update(visible=enable_rag)
+
+def process_documents(files, current_rag_tool):
+    """Process uploaded documents"""
+    if not files:
+        return "Please upload files first", current_rag_tool
+
+    if not HAS_RAG:
+        return "RAG functionality not available. Please install required dependencies.", current_rag_tool
+
+    try:
+        # Initialize RAG tool if not exists
+        if not current_rag_tool:
+            current_rag_tool = RAGTool()
+
+        # Process files
+        result = current_rag_tool.process_uploaded_files(files)
+
+        if result['success']:
+            # Create status message
+            status_parts = [f"✅ {result['message']}"]
+
+            # Add file summary
+            if result['summary']['files_processed']:
+                status_parts.append("\n**Processed files:**")
+                for file_info in result['summary']['files_processed']:
+                    status_parts.append(f"- {file_info['name']} ({file_info['chunks']} chunks)")
+
+            # Add errors if any
+            if result.get('errors'):
+                status_parts.append("\n**Errors:**")
+                for error in result['errors']:
+                    status_parts.append(f"- {error['file']}: {error['error']}")
+
+            # Add index stats
+            if result.get('index_stats'):
+                stats = result['index_stats']
+                status_parts.append(f"\n**Index stats:** {stats['total_chunks']} chunks, {stats['dimension']}D embeddings")
+
+            return "\n".join(status_parts), current_rag_tool
+        else:
+            return f"❌ {result['message']}", current_rag_tool
+
+    except Exception as e:
+        return f"❌ Error processing documents: {str(e)}", current_rag_tool
+
-def on_generate(name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code, enable_dynamic_urls, url1, url2, url3, url4):
+def on_generate(name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code, enable_dynamic_urls, url1, url2, url3, url4, enable_vector_rag, rag_tool_state):
     if not name or not name.strip():
         return gr.update(value="Error: Please provide a Space Title", visible=True), gr.update(visible=False)
 
@@ -482,7 +585,12 @@ def on_generate(name, description, role_purpose, intended_audience, key_tasks, additional_context, ...):
         return gr.update(value="Error: Please provide a Role and Purpose for the assistant", visible=True), gr.update(visible=False)
 
     try:
-        filename = generate_zip(name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code, enable_dynamic_urls, url1, url2, url3, url4)
+        # Get RAG data if enabled
+        rag_data = None
+        if enable_vector_rag and rag_tool_state:
+            rag_data = rag_tool_state.get_serialized_data()
+
+        filename = generate_zip(name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code, enable_dynamic_urls, url1, url2, url3, url4, enable_vector_rag, rag_data)
 
         success_msg = f"""**Deployment package ready!**
 
@@ -790,6 +898,27 @@ with gr.Blocks(title="Chat U/I Helper") as demo:
             value=False,
             info="Allow the assistant to fetch additional URLs mentioned in conversations (uses Crawl4AI)"
         )
+
+        enable_vector_rag = gr.Checkbox(
+            label="Enable Document RAG",
+            value=False,
+            info="Upload documents for context-aware responses (PDF, DOCX, TXT, MD)",
+            visible=HAS_RAG
+        )
+
+        with gr.Column(visible=False) as rag_section:
+            gr.Markdown("### Document Upload")
+            file_upload = gr.File(
+                label="Upload Documents",
+                file_types=[".pdf", ".docx", ".txt", ".md"],
+                file_count="multiple",
+                type="filepath"
+            )
+            process_btn = gr.Button("Process Documents", variant="secondary")
+            rag_status = gr.Markdown()
+
+            # State to store RAG tool
+            rag_tool_state = gr.State(None)
 
         examples_text = gr.Textbox(
             label="Example Prompts (one per line)",
@@ -878,10 +1007,23 @@ with gr.Blocks(title="Chat U/I Helper") as demo:
        outputs=[url3, url4, add_url_btn, remove_url_btn, url_count]
    )
 
+    # Connect RAG functionality
+    enable_vector_rag.change(
+        toggle_rag_section,
+        inputs=[enable_vector_rag],
+        outputs=[rag_section]
+    )
+
+    process_btn.click(
+        process_documents,
+        inputs=[file_upload, rag_tool_state],
+        outputs=[rag_status, rag_tool_state]
+    )
+
    # Connect the generate button
    generate_btn.click(
        on_generate,
-        inputs=[name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code, enable_dynamic_urls, url1, url2, url3, url4],
+        inputs=[name, description, role_purpose, intended_audience, key_tasks, additional_context, model, api_key_var, temperature, max_tokens, examples_text, access_code, enable_dynamic_urls, url1, url2, url3, url4, enable_vector_rag, rag_tool_state],
        outputs=[status, download_file]
    )
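
Note the doubled braces in the template code above (`{{e}}` in the print call, `{{}}` in the URL regex): SPACE_TEMPLATE is rendered with str.format(**config), so literal braces in the generated app.py must be escaped, while single-brace fields like {enable_vector_rag} are substituted. A small illustration of that mechanism (generic values, not the real config):

# How SPACE_TEMPLATE.format(**config) treats braces: {{ }} survive as
# literal braces, single-brace names are replaced. Values here are made up.
template = 'ENABLE_VECTOR_RAG = {enable_vector_rag}\nprint(f"Failed: {{e}}")'
print(template.format(enable_vector_rag=True))
# ENABLE_VECTOR_RAG = True
# print(f"Failed: {e}")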
document_processor.py
ADDED
@@ -0,0 +1,205 @@
import os
import json
from typing import List, Dict, Any, Tuple
from pathlib import Path
import hashlib

# Document parsing imports
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except ImportError:
    HAS_PYMUPDF = False

try:
    from docx import Document
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

# Text processing
import re
from dataclasses import dataclass


@dataclass
class DocumentChunk:
    text: str
    metadata: Dict[str, Any]
    chunk_id: str

    def to_dict(self):
        return {
            'text': self.text,
            'metadata': self.metadata,
            'chunk_id': self.chunk_id
        }


class DocumentProcessor:
    def __init__(self, chunk_size: int = 800, chunk_overlap: int = 100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.supported_extensions = ['.pdf', '.docx', '.txt', '.md']

    def process_file(self, file_path: str) -> List[DocumentChunk]:
        """Process a single file and return chunks"""
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        extension = path.suffix.lower()
        if extension not in self.supported_extensions:
            raise ValueError(f"Unsupported file type: {extension}")

        # Extract text based on file type
        if extension == '.pdf':
            text = self._extract_pdf_text(file_path)
        elif extension == '.docx':
            text = self._extract_docx_text(file_path)
        elif extension in ['.txt', '.md']:
            text = self._extract_text_file(file_path)
        else:
            raise ValueError(f"Unsupported file type: {extension}")

        # Create chunks
        chunks = self._create_chunks(text, file_path)

        return chunks

    def _extract_pdf_text(self, file_path: str) -> str:
        """Extract text from PDF file"""
        if not HAS_PYMUPDF:
            raise ImportError("PyMuPDF not installed. Install with: pip install PyMuPDF")

        text_parts = []

        try:
            with fitz.open(file_path) as pdf:
                for page_num in range(len(pdf)):
                    page = pdf[page_num]
                    text = page.get_text()
                    if text.strip():
                        text_parts.append(f"[Page {page_num + 1}]\n{text}")
        except Exception as e:
            raise Exception(f"Error processing PDF: {str(e)}")

        return "\n\n".join(text_parts)

    def _extract_docx_text(self, file_path: str) -> str:
        """Extract text from DOCX file"""
        if not HAS_DOCX:
            raise ImportError("python-docx not installed. Install with: pip install python-docx")

        text_parts = []

        try:
            doc = Document(file_path)

            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    text_parts.append(paragraph.text)

            # Also extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    row_text = []
                    for cell in row.cells:
                        if cell.text.strip():
                            row_text.append(cell.text.strip())
                    if row_text:
                        text_parts.append(" | ".join(row_text))

        except Exception as e:
            raise Exception(f"Error processing DOCX: {str(e)}")

        return "\n\n".join(text_parts)

    def _extract_text_file(self, file_path: str) -> str:
        """Extract text from plain text or markdown file"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            raise Exception(f"Error reading text file: {str(e)}")

    def _create_chunks(self, text: str, file_path: str) -> List[DocumentChunk]:
        """Create overlapping chunks from text"""
        chunks = []

        # Clean and normalize text
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        if not text:
            return chunks

        # Simple word-based chunking
        words = text.split()

        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
            chunk_words = words[i:i + self.chunk_size]
            chunk_text = ' '.join(chunk_words)

            # Create chunk ID
            chunk_id = hashlib.md5(f"{file_path}_{i}_{chunk_text[:50]}".encode()).hexdigest()[:8]

            # Create metadata
            metadata = {
                'file_path': file_path,
                'file_name': Path(file_path).name,
                'chunk_index': len(chunks),
                'start_word': i,
                'word_count': len(chunk_words)
            }

            chunk = DocumentChunk(
                text=chunk_text,
                metadata=metadata,
                chunk_id=chunk_id
            )

            chunks.append(chunk)

        return chunks

    def process_multiple_files(self, file_paths: List[str]) -> Tuple[List[DocumentChunk], Dict[str, Any]]:
        """Process multiple files and return chunks with summary"""
        all_chunks = []
        summary = {
            'total_files': 0,
            'total_chunks': 0,
            'files_processed': [],
            'errors': []
        }

        for file_path in file_paths:
            try:
                chunks = self.process_file(file_path)
                all_chunks.extend(chunks)

                summary['files_processed'].append({
                    'path': file_path,
                    'name': Path(file_path).name,
                    'chunks': len(chunks)
                })

            except Exception as e:
                summary['errors'].append({
                    'path': file_path,
                    'error': str(e)
                })

        summary['total_files'] = len(summary['files_processed'])
        summary['total_chunks'] = len(all_chunks)

        return all_chunks, summary


# Utility function for file size validation
def validate_file_size(file_path: str, max_size_mb: float = 10.0) -> bool:
    """Check if file size is within limits"""
    size_bytes = os.path.getsize(file_path)
    size_mb = size_bytes / (1024 * 1024)
    return size_mb <= max_size_mb
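
With the defaults above (chunk_size=800, chunk_overlap=100), _create_chunks steps through the word list in strides of 700, so consecutive chunks share 100 words. A quick standalone check of that arithmetic on a toy input (hypothetical text, not from the commit):

# Stride = chunk_size - chunk_overlap, mirroring _create_chunks above.
chunk_size, chunk_overlap = 800, 100
words = [f"w{i}" for i in range(2000)]  # toy 2000-word document

starts = list(range(0, len(words), chunk_size - chunk_overlap))
print(starts)  # [0, 700, 1400] -> 3 chunks
first, second = words[0:chunk_size], words[700:700 + chunk_size]
print(len(set(first) & set(second)))  # 100 shared words of overlap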
rag_tool.py
ADDED
@@ -0,0 +1,208 @@
import json
from typing import List, Dict, Any, Optional, Tuple
from document_processor import DocumentProcessor, DocumentChunk
from vector_store import VectorStore, SearchResult
import os
import tempfile
from pathlib import Path


class RAGTool:
    """RAG tool for integrating document search with chat"""

    def __init__(self):
        self.processor = DocumentProcessor(chunk_size=800, chunk_overlap=100)
        self.vector_store = VectorStore()
        self.processed_files = []
        self.total_chunks = 0

    def process_uploaded_files(self, file_paths: List[str]) -> Dict[str, Any]:
        """Process uploaded files and build vector index"""

        # Validate files
        valid_files = []
        errors = []

        for file_path in file_paths:
            try:
                # Check file size (10MB limit)
                size_mb = os.path.getsize(file_path) / (1024 * 1024)
                if size_mb > 10:
                    errors.append({
                        'file': Path(file_path).name,
                        'error': f'File too large ({size_mb:.1f}MB). Maximum size is 10MB.'
                    })
                    continue

                valid_files.append(file_path)

            except Exception as e:
                errors.append({
                    'file': Path(file_path).name,
                    'error': str(e)
                })

        if not valid_files:
            return {
                'success': False,
                'message': 'No valid files to process',
                'errors': errors
            }

        # Process files
        all_chunks, summary = self.processor.process_multiple_files(valid_files)

        if not all_chunks:
            return {
                'success': False,
                'message': 'No content extracted from files',
                'summary': summary
            }

        # Build vector index
        chunk_dicts = [chunk.to_dict() for chunk in all_chunks]
        self.vector_store.build_index(chunk_dicts, show_progress=False)

        # Update stats
        self.processed_files = summary['files_processed']
        self.total_chunks = len(all_chunks)

        # Calculate index size
        index_stats = self.vector_store.get_stats()

        return {
            'success': True,
            'message': f'Successfully processed {len(valid_files)} files into {self.total_chunks} chunks',
            'summary': summary,
            'index_stats': index_stats,
            'errors': errors
        }

    def get_relevant_context(self, query: str, max_chunks: int = 3) -> str:
        """Get relevant context for a query"""
        if not self.vector_store.index:
            return ""

        # Search for relevant chunks
        results = self.vector_store.search(
            query=query,
            top_k=max_chunks,
            score_threshold=0.3
        )

        if not results:
            return ""

        # Format context
        context_parts = []

        for i, result in enumerate(results, 1):
            file_name = result.metadata.get('file_name', 'Unknown')
            context_parts.append(
                f"[Document: {file_name} - Relevance: {result.score:.2f}]\n{result.text}"
            )

        return "\n\n".join(context_parts)

    def get_serialized_data(self) -> Dict[str, Any]:
        """Get serialized data for deployment"""
        if not self.vector_store.index:
            return None

        return self.vector_store.serialize()

    def get_deployment_info(self) -> Dict[str, Any]:
        """Get information for deployment package"""
        if not self.vector_store.index:
            return {
                'enabled': False,
                'message': 'No documents processed'
            }

        # Estimate package size increase
        index_stats = self.vector_store.get_stats()
        estimated_size_mb = (
            # Index size estimation
            (index_stats['total_chunks'] * index_stats['dimension'] * 4) / (1024 * 1024) +
            # Chunks text size estimation
            (sum(len(chunk['text']) for chunk in self.vector_store.chunks.values()) / (1024 * 1024))
        ) * 1.5  # Add overhead for base64 encoding

        return {
            'enabled': True,
            'total_files': len(self.processed_files),
            'total_chunks': self.total_chunks,
            'estimated_size_mb': round(estimated_size_mb, 2),
            'files': [f['name'] for f in self.processed_files]
        }


def create_rag_module_for_space(serialized_data: Dict[str, Any]) -> str:
    """Create a minimal RAG module for the deployed space"""

    return '''# RAG Module for deployed space
import numpy as np
import faiss
import base64
import json

class RAGContext:
    def __init__(self, serialized_data):
        # Deserialize FAISS index
        index_bytes = base64.b64decode(serialized_data['index_base64'])
        self.index = faiss.deserialize_index(index_bytes)

        # Restore chunks and mappings
        self.chunks = serialized_data['chunks']
        self.chunk_ids = serialized_data['chunk_ids']

    def get_context(self, query_embedding, max_chunks=3):
        """Get relevant context using pre-computed embedding"""
        if not self.index:
            return ""

        # Normalize and search
        faiss.normalize_L2(query_embedding)
        scores, indices = self.index.search(query_embedding, max_chunks)

        # Format results
        context_parts = []

        for score, idx in zip(scores[0], indices[0]):
            if idx < 0 or score < 0.3:
                continue

            chunk = self.chunks[self.chunk_ids[idx]]
            file_name = chunk.get('metadata', {}).get('file_name', 'Document')

            context_parts.append(
                f"[{file_name} - Relevance: {score:.2f}]\\n{chunk['text']}"
            )

        return "\\n\\n".join(context_parts) if context_parts else ""

# Initialize RAG context
RAG_DATA = json.loads(\'\'\'{{rag_data_json}}\'\'\')
rag_context = RAGContext(RAG_DATA) if RAG_DATA else None

def get_rag_context(query):
    """Get relevant context for a query"""
    if not rag_context:
        return ""

    # In production, you'd compute query embedding here
    # For now, return empty (would need embedding service)
    return ""
'''


def format_context_for_prompt(context: str, query: str) -> str:
    """Format RAG context for inclusion in prompt"""
    if not context:
        return ""

    return f"""Relevant context from uploaded documents:

{context}

Please use the above context to help answer the user's question: {query}"""
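
The module emitted by create_rag_module_for_space deliberately stops short: its get_rag_context returns an empty string because the deployed Space has no embedding model to encode queries. One way to close that gap, assuming the Space installs sentence-transformers and reuses the model name recorded in the serialized data, would be a completion like this inside the generated module (a hypothetical sketch, not part of the commit):

# Hypothetical completion of get_rag_context for a deployed Space.
# Assumes sentence-transformers is installed and RAG_DATA['model_name']
# matches the model used at index-build time (all-MiniLM-L6-v2 by default).
from sentence_transformers import SentenceTransformer

_query_encoder = SentenceTransformer(RAG_DATA['model_name'])

def get_rag_context(query):
    if not rag_context:
        return ""
    # Encode the query into the same vector space as the indexed chunks
    embedding = _query_encoder.encode([query], convert_to_numpy=True).astype('float32')
    return rag_context.get_context(embedding, max_chunks=3)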
requirements.txt
CHANGED
@@ -2,4 +2,11 @@ gradio>=4.44.0
 requests>=2.32.3
 beautifulsoup4>=4.12.3
 python-dotenv>=1.0.0
-crawl4ai>=0.4.245
+crawl4ai>=0.4.245
+
+# Vector RAG dependencies (optional)
+sentence-transformers>=2.2.2
+faiss-cpu>=1.7.4
+PyMuPDF>=1.23.0
+python-docx>=0.8.11
+numpy>=1.24.3
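
These ranges cover the helper tool's own environment; each generated Space instead gets a separate, pinned requirements.txt from create_requirements in app.py. The parsing and embedding dependencies stay on the helper side, since documents are processed before the zip is built. For example, with RAG enabled the generated file would read:

# Output of create_requirements(enable_vector_rag=True) as defined in app.py above
print(create_requirements(enable_vector_rag=True))
# gradio==4.44.1
# requests==2.32.3
# crawl4ai==0.4.245
# faiss-cpu==1.7.4
# numpy==1.24.3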
vector_store.py
ADDED
@@ -0,0 +1,246 @@
import numpy as np
import pickle
import base64
from typing import List, Dict, Any, Tuple, Optional
import json
from dataclasses import dataclass

try:
    from sentence_transformers import SentenceTransformer
    HAS_SENTENCE_TRANSFORMERS = True
except ImportError:
    HAS_SENTENCE_TRANSFORMERS = False

try:
    import faiss
    HAS_FAISS = True
except ImportError:
    HAS_FAISS = False


@dataclass
class SearchResult:
    chunk_id: str
    text: str
    score: float
    metadata: Dict[str, Any]


class VectorStore:
    def __init__(self, embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"):
        self.embedding_model_name = embedding_model
        self.embedding_model = None
        self.index = None
        self.chunks = {}  # chunk_id -> chunk data
        self.chunk_ids = []  # Ordered list for FAISS index mapping
        self.dimension = 384  # Default for all-MiniLM-L6-v2

        if HAS_SENTENCE_TRANSFORMERS:
            self._initialize_model()

    def _initialize_model(self):
        """Initialize the embedding model"""
        if not HAS_SENTENCE_TRANSFORMERS:
            raise ImportError("sentence-transformers not installed")

        self.embedding_model = SentenceTransformer(self.embedding_model_name)
        # Update dimension based on model
        self.dimension = self.embedding_model.get_sentence_embedding_dimension()

    def create_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Create embeddings for a list of texts"""
        if not self.embedding_model:
            self._initialize_model()

        # Process in batches for efficiency
        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_embeddings = self.embedding_model.encode(
                batch,
                convert_to_numpy=True,
                show_progress_bar=False
            )
            embeddings.append(batch_embeddings)

        return np.vstack(embeddings) if embeddings else np.array([])

    def build_index(self, chunks: List[Dict[str, Any]], show_progress: bool = True):
        """Build FAISS index from chunks"""
        if not HAS_FAISS:
            raise ImportError("faiss-cpu not installed")

        # Extract texts and build embeddings
        texts = [chunk['text'] for chunk in chunks]

        if show_progress:
            print(f"Creating embeddings for {len(texts)} chunks...")

        embeddings = self.create_embeddings(texts)

        # Build FAISS index
        if show_progress:
            print("Building FAISS index...")

        # Use IndexFlatIP for inner product (cosine similarity with normalized vectors)
        self.index = faiss.IndexFlatIP(self.dimension)

        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(embeddings)

        # Add to index
        self.index.add(embeddings)

        # Store chunks and maintain mapping
        self.chunks = {}
        self.chunk_ids = []

        for chunk in chunks:
            chunk_id = chunk['chunk_id']
            self.chunks[chunk_id] = chunk
            self.chunk_ids.append(chunk_id)

        if show_progress:
            print(f"Index built with {len(chunks)} chunks")

    def search(self, query: str, top_k: int = 5, score_threshold: float = 0.3) -> List[SearchResult]:
        """Search for similar chunks"""
        if not self.index or not self.chunks:
            return []

        # Create query embedding
        query_embedding = self.create_embeddings([query])

        # Normalize for cosine similarity
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, min(top_k, len(self.chunks)))

        # Convert to results
        results = []

        for score, idx in zip(scores[0], indices[0]):
            if idx < 0 or score < score_threshold:
                continue

            chunk_id = self.chunk_ids[idx]
            chunk = self.chunks[chunk_id]

            result = SearchResult(
                chunk_id=chunk_id,
                text=chunk['text'],
                score=float(score),
                metadata=chunk.get('metadata', {})
            )
            results.append(result)

        return results

    def serialize(self) -> Dict[str, Any]:
        """Serialize the vector store for deployment"""
        if not self.index:
            raise ValueError("No index to serialize")

        # Serialize FAISS index
        index_bytes = faiss.serialize_index(self.index)
        index_base64 = base64.b64encode(index_bytes).decode('utf-8')

        return {
            'index_base64': index_base64,
            'chunks': self.chunks,
            'chunk_ids': self.chunk_ids,
            'dimension': self.dimension,
            'model_name': self.embedding_model_name
        }

    @classmethod
    def deserialize(cls, data: Dict[str, Any]) -> 'VectorStore':
        """Deserialize a vector store from deployment data"""
        if not HAS_FAISS:
            raise ImportError("faiss-cpu not installed")

        store = cls(embedding_model=data['model_name'])

        # Deserialize FAISS index
        index_bytes = base64.b64decode(data['index_base64'])
        store.index = faiss.deserialize_index(index_bytes)

        # Restore chunks and mappings
        store.chunks = data['chunks']
        store.chunk_ids = data['chunk_ids']
        store.dimension = data['dimension']

        return store

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the vector store"""
        return {
            'total_chunks': len(self.chunks),
            'index_size': self.index.ntotal if self.index else 0,
            'dimension': self.dimension,
            'model': self.embedding_model_name
        }


class LightweightVectorStore:
    """Lightweight version for deployed spaces without embedding model"""

    def __init__(self, serialized_data: Dict[str, Any]):
        if not HAS_FAISS:
            raise ImportError("faiss-cpu not installed")

        # Deserialize FAISS index
        index_bytes = base64.b64decode(serialized_data['index_base64'])
        self.index = faiss.deserialize_index(index_bytes)

        # Restore chunks and mappings
        self.chunks = serialized_data['chunks']
        self.chunk_ids = serialized_data['chunk_ids']
        self.dimension = serialized_data['dimension']

        # For query embedding, we'll need to include pre-computed embeddings
        # or use a lightweight embedding service
        self.query_embeddings_cache = serialized_data.get('query_embeddings_cache', {})

    def search_with_embedding(self, query_embedding: np.ndarray, top_k: int = 5, score_threshold: float = 0.3) -> List[SearchResult]:
        """Search using pre-computed query embedding"""
        if not self.index or not self.chunks:
            return []

        # Normalize for cosine similarity
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, min(top_k, len(self.chunks)))

        # Convert to results
        results = []

        for score, idx in zip(scores[0], indices[0]):
            if idx < 0 or score < score_threshold:
                continue

            chunk_id = self.chunk_ids[idx]
            chunk = self.chunks[chunk_id]

            result = SearchResult(
                chunk_id=chunk_id,
                text=chunk['text'],
                score=float(score),
                metadata=chunk.get('metadata', {})
            )
            results.append(result)

        return results


# Utility functions
def estimate_index_size(num_chunks: int, dimension: int = 384) -> float:
    """Estimate the size of the index in MB"""
    # Rough estimation: 4 bytes per float * dimension * num_chunks
    bytes_size = 4 * dimension * num_chunks
    # Add overhead for index structure and metadata
    overhead = 1.2
    return (bytes_size * overhead) / (1024 * 1024)
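
VectorStore relies on a standard FAISS idiom: IndexFlatIP computes inner products, which equal cosine similarities once vectors are L2-normalized, and serialize_index/deserialize_index round-trip the index through bytes for the base64 payload. A self-contained sketch of both, using random vectors rather than real embeddings:

# Sketch of the IndexFlatIP + normalize_L2 idiom and the base64 round-trip.
# Random data stands in for sentence-transformer embeddings.
import base64
import numpy as np
import faiss

dim = 384  # matches all-MiniLM-L6-v2
vectors = np.random.rand(10, dim).astype('float32')
faiss.normalize_L2(vectors)  # unit length -> inner product == cosine similarity

index = faiss.IndexFlatIP(dim)
index.add(vectors)

# Round-trip through base64, as serialize() and the deployed-space loaders do
blob = base64.b64encode(faiss.serialize_index(index)).decode('utf-8')
restored = faiss.deserialize_index(
    np.frombuffer(base64.b64decode(blob), dtype=np.uint8)
)

query = vectors[:1].copy()
scores, ids = restored.search(query, 3)
print(ids[0], scores[0])  # the query itself ranks first with score ~1.0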