satyamdev404 committed
Commit e0aa230 · verified · 1 Parent(s): f4cb706

Upload 31 files

.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text

+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,10 @@
+ .env*
+ !.env.example
+
+ __pycache__/
+ *.py[cod]
+
+ logs/
+
+ **__pycache__/
+
API_KEYS_SETUP.md ADDED
@@ -0,0 +1,146 @@
1
+ # 🔑 API Keys Setup Guide
2
+
3
+ ## How to Get Pinecone API Key
4
+
5
+ ### Step 1: Create Pinecone Account
6
+
7
+ 1. Go to [https://www.pinecone.io/](https://www.pinecone.io/)
8
+ 2. Click **"Sign Up"** or **"Get Started Free"**
9
+ 3. Create account with your email or sign up with Google/GitHub
10
+ 4. Verify your email address if required
11
+
12
+ ### Step 2: Access Dashboard
13
+
14
+ 1. Log into your Pinecone account
15
+ 2. You'll be taken to the Pinecone Console/Dashboard
16
+ 3. Look for **"API Keys"** in the left sidebar or navigation menu
17
+
18
+ ### Step 3: Get Your API Key
19
+
20
+ 1. Click on **"API Keys"** in the dashboard
21
+ 2. You'll see your default API key listed
22
+ 3. Click **"Copy"** or the copy icon next to the API key
23
+ 4. Save this key securely - you'll need it for the application
24
+
25
+ ## How to Get Gemini API Key
26
+
27
+ ### Step 1: Go to Google AI Studio
28
+
29
+ 1. Visit [https://aistudio.google.com/](https://aistudio.google.com/)
30
+ 2. Sign in with your Google account
31
+
32
+ ### Step 2: Get API Key
33
+
34
+ 1. Click **"Get API Key"** in the top navigation
35
+ 2. Click **"Create API Key"**
36
+ 3. Select your Google Cloud project (or create a new one)
37
+ 4. Copy the generated API key
38
+
39
+ ## How to Get Tavily API Key
40
+
41
+ ### Step 1: Create Tavily Account
42
+
43
+ 1. Go to [https://app.tavily.com/](https://app.tavily.com/)
44
+ 2. Click **"Sign Up"** and register with your email or use a social login
45
+ 3. Verify your email address if prompted
46
+
47
+ ### Step 2: Access API Keys
48
+
49
+ 1. Log into your Tavily account
50
+ 2. Navigate to the **"API Keys"** section in your dashboard
51
+ 3. Click **"Create API Key"** if you don't have one yet
52
+ 4. Copy the generated API key and store it securely
53
+
54
+ ## 🚀 Quick Start Guide
55
+
56
+ ### Option 1: Set Environment Variables Temporarily
57
+
58
+ **Windows Command Prompt:**
59
+
60
+ ```cmd
61
+ set PINECONE_API_KEY=pc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
62
+ set GEMINI_API_KEY=AIzaSyxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
63
+ set TAVILY_API_KEY=tvly_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
64
+ python app.py
65
+ ```
66
+
67
+ **Windows PowerShell:**
68
+
69
+ ```powershell
70
+ $env:PINECONE_API_KEY="pc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
71
+ $env:GEMINI_API_KEY="AIzaSyxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
72
+ $env:TAVILY_API_KEY="tvly_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
73
+ python app.py
74
+ ```
75
+
76
+ ### Option 2: Create .env File (Recommended)
77
+
78
+ 1. Create a file named `.env` in your project root:
79
+
80
+ ```
81
+ PINECONE_API_KEY=pc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
82
+ GEMINI_API_KEY=AIzaSyxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
83
+ TAVILY_API_KEY=tvly_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
84
+ OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Optional
85
+ ```
86
+
87
+ 2. Run the application:
88
+
89
+ ```cmd
90
+ python app.py
91
+ ```
92
+
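For reference, a minimal sketch of how these keys can be read at startup with `python-dotenv` (this mirrors what `app.py` in this commit does with `load_dotenv()` and `os.getenv`; the `require` helper below is illustrative, not part of the codebase):

```python
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads the .env file from the project root, if present


def require(name: str) -> str:
    """Return the environment variable or fail with a clear message."""
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


PINECONE_API_KEY = require("PINECONE_API_KEY")
GEMINI_API_KEY = require("GEMINI_API_KEY")
TAVILY_API_KEY = require("TAVILY_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # optional fallback LLM
```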
93
+ ## 📋 Free Tier Information
94
+
95
+ ### Pinecone Free Tier:
96
+
97
+ - ✅ 1 project
98
+ - ✅ 1 index
99
+ - ✅ 100K vectors
100
+ - ✅ Perfect for hackathons and testing
101
+
102
+ ### Gemini Free Tier:
103
+
104
+ - ✅ 15 requests per minute
105
+ - ✅ 1 million tokens per day
106
+ - ✅ Sufficient for development and demos
107
+
108
+ ### Tavily Free Tier:
109
+
110
+ - ✅ Generous free tier for testing and development
111
+ - ✅ Check [Tavily Pricing](https://www.tavily.com/pricing) for current limits
112
+
113
+ ## 🔧 Troubleshooting
114
+
115
+ ### If you get "Invalid API Key" errors:
116
+
117
+ 1. Double-check the API key is copied correctly
118
+ 2. Make sure there are no extra spaces
119
+ 3. Verify the environment variable is set: `echo %PINECONE_API_KEY%`
120
+
121
+ ### If Pinecone connection fails:
122
+
123
+ 1. Check your internet connection
124
+ 2. Verify your Pinecone account is active
125
+ 3. Make sure you're using the correct region (default is usually fine)
126
+
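If the checks above don't resolve it, a quick sanity check from a Python shell can confirm that the key is visible to the process and that Pinecone is reachable. This sketch assumes the v3-style `pinecone` client; listing indexes is a cheap call that exercises both authentication and connectivity:

```python
import os
from pinecone import Pinecone

key = os.getenv("PINECONE_API_KEY")
print("PINECONE_API_KEY set:", bool(key))  # should print True

pc = Pinecone(api_key=key)
print(pc.list_indexes())  # raises if the key or network is bad
```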
127
+ ## 🎯 Ready to Launch
128
+
129
+ Once you have all API keys:
130
+
131
+ 1. **Set the environment variables**
132
+ 2. **Run the application:**
133
+ ```cmd
134
+ python app.py
135
+ ```
136
+ 3. **Open your browser to:** `http://localhost:7860`
137
+ 4. **Start uploading documents and asking questions!**
138
+
139
+ The application will now have full functionality with:
140
+
141
+ - ✅ Document processing and embedding
142
+ - ✅ Vector storage in Pinecone
143
+ - ✅ AI-powered question answering
144
+ - ✅ Beautiful Gradio interface
145
+
146
+ **🎉 Your AI Embedded Knowledge Agent will be fully operational!**
README.md CHANGED
@@ -1,12 +1,49 @@
1
- ---
2
- title: Payman
3
- emoji: 📚
4
- colorFrom: pink
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.34.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # 🧠 AI Embedded Knowledge Agent
2
+
3
+ A comprehensive Retrieval-Augmented Generation (RAG) system that allows you to upload documents, process URLs, and ask intelligent questions about your knowledge base. Built with modern AI technologies and optimized for deployment on Hugging Face Spaces.
4
+
5
+ ## 🎥 Demo Video
6
+
7
+ Watch our comprehensive demo showcasing the live search capabilities and RAG system in action:
8
+
9
+ [![RAG AI Demo](https://img.shields.io/badge/🎥_Watch_Demo-Loom-00D4AA?style=for-the-badge)](https://www.loom.com/share/f1a3c79b75ad4b65b528b2973612cdd9?sid=a0932972-2926-42ab-a031-3149e40b1b97)
10
+
11
+ _See how the system processes documents, performs live web searches, and generates intelligent responses with real-time data integration._
12
+
13
+ ## ✨ Features
14
+
15
+ ### 🔥 Core Capabilities
16
+
17
+ - **📄 Document Processing**: Support for PDF, DOCX, CSV, XLSX, PPTX, TXT, MD, and more
18
+ - **🌐 URL Processing**: Extract content from web pages with intelligent crawling
19
+ - **🔍 Live Web Search**: Real-time web search using Tavily API for up-to-date information
20
+ - **🧠 Smart Q&A**: Ask questions and get contextual answers with source attribution
21
+ - **🎯 High Accuracy**: Advanced embedding and similarity search for precise results
22
+ - **⚡ Real-time Processing**: Fast document ingestion and query processing
23
+
24
+ ### 🚀 Advanced Features
25
+
26
+ - **🤖 Multiple LLM Support**: Gemini 2.5 Flash, OpenAI GPT models with automatic fallback
27
+ - **📊 Analytics Dashboard**: Query analytics, system metrics, and performance monitoring
28
+ - **🔍 Smart Query Processing**: Query expansion, caching, and suggestion system
29
+ - **📚 Knowledge Base Management**: View, manage, and export your knowledge base
30
+ - **🛡️ Robust Error Handling**: Graceful degradation and comprehensive error recovery
31
+ - **🎨 Beautiful UI**: Modern Gradio interface optimized for user experience
32
+
33
+ ### 🏗️ Technical Excellence
34
+
35
+ - **🔧 Modular Architecture**: Clean, maintainable, and extensible codebase
36
+ - **⚙️ Configurable**: Comprehensive YAML configuration for all components
37
+ - **🔒 Secure**: Input sanitization, rate limiting, and security best practices
38
+ - **📈 Scalable**: Designed for production deployment with monitoring and health checks
39
+ - **🧪 Well-tested**: Comprehensive test suite and example usage
40
+
41
+ ## 🎯 Project Origin & Agentic Vision
42
+
43
+ This project was initially conceived and developed for the **`agents-mcp-hackathon`** as an **`agent-demo-track`** submission, demonstrating intelligent knowledge retrieval and generation. Our vision was to build an autonomous AI agent capable of:
44
+
45
+ - **🧠 Intelligent Information Retrieval**: Acting as a smart agent to fetch, process, and synthesize information from diverse sources (documents, URLs, live web).
46
+ - **🚀 Dynamic Query Routing**: Intelligently deciding between local knowledge base retrieval and real-time web search based on query intent.
47
+ - **💡 Autonomous Knowledge Management**: Providing a self-contained system for building and querying a dynamic knowledge base.
48
+
49
+ This system embodies the principles of agentic AI, offering a powerful, self-sufficient solution for complex information needs.
app.py ADDED
@@ -0,0 +1,884 @@
1
+ """
2
+ AI Embedded Knowledge Agent - Main Application Entry Point
3
+
4
+ This is the main entry point for the RAG AI system that integrates all components
5
+ and launches the Gradio interface for deployment on Hugging Face.
6
+ """
7
+
8
+ import nltk
9
+
10
+ nltk.download("punkt_tab")
11
+
12
+
13
+ import spacy.cli
14
+
15
+ spacy.cli.download("en_core_web_sm")
16
+ nlp = spacy.load("en_core_web_sm")
17
+
18
+
19
+ import os
20
+ import sys
21
+ import logging
22
+ from pathlib import Path
23
+ from typing import Optional
24
+
25
+ # Load environment variables from .env file
26
+ try:
27
+ from dotenv import load_dotenv
28
+
29
+ load_dotenv()
30
+ except ImportError:
31
+ print(
32
+ "python-dotenv not installed. Please install it with: pip install python-dotenv"
33
+ )
34
+
35
+ # Add src directory to Python path
36
+ src_path = Path(__file__).parent / "src"
37
+ sys.path.insert(0, str(src_path))
38
+
39
+ # Import all components
40
+ from utils.config_manager import ConfigManager
41
+ from utils.error_handler import ErrorHandler, ErrorType
42
+ from ingestion.document_processor import DocumentProcessor
43
+ from ingestion.url_processor import URLProcessor
44
+ from ingestion.text_extractor import TextExtractor
45
+ from embedding.embedding_generator import EmbeddingGenerator
46
+ from storage.vector_db import VectorDB
47
+ from rag.optimized_query_processor import OptimizedQueryProcessor
48
+ from rag.response_generator import ResponseGenerator
49
+ from rag.live_search import LiveSearchProcessor
50
+ from rag.query_router import QueryRouter
51
+ from ui.gradio_app import GradioApp
52
+
53
+
54
+ class RAGSystem:
55
+ """
56
+ Main RAG AI system that orchestrates all components.
57
+
58
+ This class integrates document processing, embedding generation,
59
+ vector storage, and query processing into a unified system.
60
+ """
61
+
62
+ def __init__(self, config_path: Optional[str] = None):
63
+ """
64
+ Initialize the RAG system with all components.
65
+
66
+ Args:
67
+ config_path: Path to configuration file
68
+ """
69
+ # Initialize configuration
70
+ self.config_manager = ConfigManager(config_path)
71
+ self.config = self.config_manager.config
72
+
73
+ # Setup logging
74
+ self._setup_logging()
75
+ self.logger = logging.getLogger(__name__)
76
+ self.logger.info("Initializing RAG AI System...")
77
+
78
+ # Initialize error handler
79
+ self.error_handler = ErrorHandler()
80
+
81
+ # Validate environment and configuration
82
+ self._validate_environment()
83
+
84
+ # Initialize components
85
+ self._initialize_components()
86
+
87
+ # Run health checks
88
+ self._run_startup_health_checks()
89
+
90
+ self.logger.info("RAG AI System initialized successfully! ")
91
+
92
+ def _setup_logging(self):
93
+ """Setup comprehensive logging configuration."""
94
+ log_config = self.config.get("logging", {})
95
+ log_level = getattr(logging, log_config.get("level", "INFO").upper())
96
+ log_format = log_config.get(
97
+ "format", "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
98
+ )
99
+
100
+ # Configure root logger with UTF-8 encoding
101
+ import io
102
+
103
+ utf8_stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
104
+ logging.basicConfig(
105
+ level=log_level,
106
+ format=log_format,
107
+ handlers=[logging.StreamHandler(utf8_stdout)],
108
+ )
109
+
110
+ # Create logs directory if specified
111
+ log_file = log_config.get("file")
112
+ if log_file:
113
+ log_dir = Path(log_file).parent
114
+ log_dir.mkdir(parents=True, exist_ok=True)
115
+
116
+ # Add file handler with rotation
117
+ try:
118
+ from logging.handlers import RotatingFileHandler
119
+
120
+ file_handler = RotatingFileHandler(
121
+ log_file,
122
+ maxBytes=log_config.get("max_file_size_mb", 10) * 1024 * 1024,
123
+ backupCount=log_config.get("backup_count", 5),
124
+ )
125
+ file_handler.setFormatter(logging.Formatter(log_format))
126
+ logging.getLogger().addHandler(file_handler)
127
+ except Exception as e:
128
+ self.logger.warning(f"Could not setup file logging: {e}")
129
+
130
+ def _validate_environment(self):
131
+ """Validate environment variables and configuration."""
132
+ self.logger.info("Validating environment...")
133
+
134
+ # Check required API keys
135
+ required_keys = ["GEMINI_API_KEY"]
136
+ optional_keys = ["PINECONE_API_KEY", "OPENAI_API_KEY"]
137
+
138
+ missing_required = []
139
+ for key in required_keys:
140
+ if not os.getenv(key):
141
+ missing_required.append(key)
142
+
143
+ if missing_required:
144
+ self.logger.error(
145
+ f" Missing required environment variables: {missing_required}"
146
+ )
147
+ self.logger.error(
148
+ "Please set the required API keys as environment variables"
149
+ )
150
+ # Don't raise error in demo mode, just warn
151
+ self.logger.warning("Running in demo mode with limited functionality")
152
+
153
+ # Check optional keys
154
+ missing_optional = []
155
+ for key in optional_keys:
156
+ if not os.getenv(key):
157
+ missing_optional.append(key)
158
+
159
+ if missing_optional:
160
+ self.logger.warning(
161
+ f"Missing optional environment variables: {missing_optional}"
162
+ )
163
+ self.logger.warning("Some features may be limited without these keys")
164
+
165
+ # Validate configuration
166
+ self._validate_configuration()
167
+
168
+ self.logger.info("Environment validation completed")
169
+
170
+ def _validate_configuration(self):
171
+ """Validate configuration settings."""
172
+ try:
173
+ # Check embedding configuration
174
+ embedding_config = self.config.get("embedding", {})
175
+ if not embedding_config.get("model"):
176
+ self.logger.warning("Embedding model not specified, using default")
177
+
178
+ # Check vector database configuration
179
+ vector_db_config = self.config.get("vector_db", {})
180
+ if not vector_db_config.get("provider"):
181
+ self.logger.warning(
182
+ "Vector database provider not specified, using default"
183
+ )
184
+
185
+ # Check RAG configuration
186
+ rag_config = self.config.get("rag", {})
187
+ if rag_config.get("top_k", 5) <= 0:
188
+ self.logger.warning("Invalid top_k value, using default")
189
+
190
+ self.logger.info("Configuration validation completed")
191
+
192
+ except Exception as e:
193
+ self.logger.warning(f"Configuration validation warning: {e}")
194
+
195
+ def _initialize_components(self):
196
+ """Initialize all system components with error handling."""
197
+ try:
198
+ self.logger.info("Initializing system components...")
199
+
200
+ # Document processing components
201
+ self.logger.info(" Initializing document processing components...")
202
+ self.document_processor = DocumentProcessor(
203
+ self.config_manager.get_section("document_processing")
204
+ )
205
+
206
+ self.url_processor = URLProcessor(
207
+ self.config_manager.get_section("url_processing")
208
+ )
209
+
210
+ self.text_extractor = TextExtractor(
211
+ self.config_manager.get_section("document_processing")
212
+ )
213
+
214
+ # Embedding and storage components
215
+ self.logger.info("Initializing embedding and storage components...")
216
+ embedding_config = self.config_manager.get_section("embedding")
217
+ embedding_config["api_key"] = os.getenv("GEMINI_API_KEY")
218
+
219
+ self.embedding_generator = EmbeddingGenerator(embedding_config)
220
+
221
+ vector_db_config = self.config_manager.get_section("vector_db")
222
+ vector_db_config["api_key"] = os.getenv("PINECONE_API_KEY")
223
+
224
+ self.vector_db = VectorDB(vector_db_config)
225
+
226
+ # RAG components
227
+ self.logger.info("Initializing RAG components...")
228
+ self.query_processor = OptimizedQueryProcessor(
229
+ self.embedding_generator,
230
+ self.vector_db,
231
+ self.config_manager.get_section("rag"),
232
+ )
233
+
234
+ rag_config = self.config_manager.get_section("rag")
235
+ # Add API keys to RAG config for LLM initialization
236
+ rag_config["gemini_api_key"] = os.getenv("GEMINI_API_KEY")
237
+ rag_config["openai_api_key"] = os.getenv("OPENAI_API_KEY")
238
+
239
+ self.response_generator = ResponseGenerator(rag_config)
240
+
241
+ # Live Search components
242
+ self.logger.info("Initializing Live Search components...")
243
+ live_search_config = self.config_manager.get_section("live_search") or {}
244
+ self.live_search_processor = LiveSearchProcessor(live_search_config)
245
+
246
+ # Query Router for intelligent routing
247
+ router_config = self.config_manager.get_section("query_router") or {}
248
+ self.query_router = QueryRouter(
249
+ self.query_processor, self.live_search_processor, router_config
250
+ )
251
+
252
+ self.logger.info("All components initialized successfully")
253
+
254
+ except Exception as e:
255
+ self.logger.error(f" Failed to initialize components: {str(e)}")
256
+ # Don't raise in demo mode, continue with limited functionality
257
+ self.logger.warning("Some components may not be fully functional")
258
+
259
+ def _run_startup_health_checks(self):
260
+ """Run health checks on all components."""
261
+ self.logger.info("Running startup health checks...")
262
+
263
+ health_status = {
264
+ "document_processor": False,
265
+ "url_processor": False,
266
+ "text_extractor": False,
267
+ "embedding_generator": False,
268
+ "vector_db": False,
269
+ "query_processor": False,
270
+ "response_generator": False,
271
+ }
272
+
273
+ # Check each component
274
+ try:
275
+ if hasattr(self, "document_processor"):
276
+ health_status["document_processor"] = True
277
+ self.logger.info("Document processor: Healthy")
278
+ except:
279
+ self.logger.warning("Document processor: Not available")
280
+
281
+ try:
282
+ if hasattr(self, "url_processor"):
283
+ health_status["url_processor"] = True
284
+ self.logger.info("URL processor: Healthy")
285
+ except:
286
+ self.logger.warning("URL processor: Not available")
287
+
288
+ try:
289
+ if hasattr(self, "text_extractor"):
290
+ health_status["text_extractor"] = True
291
+ self.logger.info("Text extractor: Healthy")
292
+ except:
293
+ self.logger.warning("Text extractor: Not available")
294
+
295
+ try:
296
+ if hasattr(self, "embedding_generator"):
297
+ health_status["embedding_generator"] = True
298
+ self.logger.info("Embedding generator: Healthy")
299
+ except:
300
+ self.logger.warning("Embedding generator: Not available")
301
+
302
+ try:
303
+ if hasattr(self, "vector_db"):
304
+ health_status["vector_db"] = True
305
+ self.logger.info("Vector database: Healthy")
306
+ except:
307
+ self.logger.warning("Vector database: Not available")
308
+
309
+ try:
310
+ if hasattr(self, "query_processor"):
311
+ health_status["query_processor"] = True
312
+ self.logger.info("Query processor: Healthy")
313
+ except:
314
+ self.logger.warning("Query processor: Not available")
315
+
316
+ try:
317
+ if hasattr(self, "response_generator"):
318
+ health_status["response_generator"] = True
319
+ self.logger.info("Response generator: Healthy")
320
+ except:
321
+ self.logger.warning("Response generator: Not available")
322
+
323
+ # Overall health
324
+ healthy_components = sum(health_status.values())
325
+ total_components = len(health_status)
326
+
327
+ self.logger.info(
328
+ f"Health check complete: {healthy_components}/{total_components} components healthy"
329
+ )
330
+
331
+ if healthy_components < total_components:
332
+ self.logger.warning("Some components are not fully functional")
333
+ self.logger.warning("The system will run with limited capabilities")
334
+
335
+ def process_document(self, file_path: str) -> dict:
336
+ """
337
+ Process a document through the complete pipeline.
338
+
339
+ Args:
340
+ file_path: Path to the document file
341
+
342
+ Returns:
343
+ Dictionary with processing results
344
+ """
345
+ try:
346
+ self.logger.info(f" Processing document: {file_path}")
347
+
348
+ # Check if components are available
349
+ if not all(
350
+ hasattr(self, attr)
351
+ for attr in [
352
+ "document_processor",
353
+ "text_extractor",
354
+ "embedding_generator",
355
+ "vector_db",
356
+ ]
357
+ ):
358
+ return {
359
+ "status": "error",
360
+ "error": "Required components not available",
361
+ "chunks_processed": 0,
362
+ }
363
+
364
+ # Step 1: Extract content from document
365
+ doc_result = self.document_processor.process_document(file_path)
366
+
367
+ if not doc_result or "content" not in doc_result:
368
+ return {
369
+ "status": "error",
370
+ "error": "Failed to extract content from document",
371
+ "chunks_processed": 0,
372
+ }
373
+
374
+ # Step 2: Extract and chunk text
375
+ text_chunks = self.text_extractor.process_text(
376
+ doc_result["content"], doc_result.get("metadata", {})
377
+ )
378
+
379
+ if not text_chunks:
380
+ return {
381
+ "status": "error",
382
+ "error": "No text chunks generated",
383
+ "chunks_processed": 0,
384
+ }
385
+
386
+ # Step 3: Generate embeddings
387
+ embedded_chunks = self.embedding_generator.generate_embeddings(text_chunks)
388
+
389
+ if not embedded_chunks:
390
+ return {
391
+ "status": "error",
392
+ "error": "Failed to generate embeddings",
393
+ "chunks_processed": len(text_chunks),
394
+ }
395
+
396
+ # Step 4: Store in vector database
397
+ storage_success = self.vector_db.store_embeddings(embedded_chunks)
398
+
399
+ return {
400
+ "status": "success" if storage_success else "partial_success",
401
+ "chunks_processed": len(text_chunks),
402
+ "chunks_stored": len(embedded_chunks) if storage_success else 0,
403
+ "source": file_path,
404
+ }
405
+
406
+ except Exception as e:
407
+ self.logger.error(f" Error processing document: {str(e)}")
408
+ error_info = self.error_handler.handle_error(e, {"file_path": file_path})
409
+ return {
410
+ "status": "error",
411
+ "error": str(e),
412
+ "error_info": error_info,
413
+ "chunks_processed": 0,
414
+ }
415
+
416
+ def process_url(
417
+ self, url: str, max_depth: int = 1, follow_links: bool = True
418
+ ) -> dict:
419
+ """
420
+ Process a URL through the complete pipeline with advanced options.
421
+
422
+ Args:
423
+ url: URL to process
424
+ max_depth: Maximum crawling depth
425
+ follow_links: Whether to follow links
426
+
427
+ Returns:
428
+ Dictionary with processing results
429
+ """
430
+ try:
431
+ self.logger.info(f"Processing URL: {url}")
432
+
433
+ # Check if components are available
434
+ if not all(
435
+ hasattr(self, attr)
436
+ for attr in [
437
+ "url_processor",
438
+ "text_extractor",
439
+ "embedding_generator",
440
+ "vector_db",
441
+ ]
442
+ ):
443
+ return {
444
+ "status": "error",
445
+ "error": "Required components not available",
446
+ "chunks_processed": 0,
447
+ }
448
+
449
+ # Step 1: Configure URL processor with advanced options
450
+ # Update URL processor configuration dynamically
451
+ self.url_processor.max_depth = max_depth
452
+ self.url_processor.follow_links = follow_links
453
+
454
+ # Reset processor state for fresh crawl
455
+ self.url_processor.reset()
456
+
457
+ # Extract content from URL
458
+ url_result = self.url_processor.process_url(url)
459
+
460
+ if not url_result or "content" not in url_result:
461
+ return {
462
+ "status": "error",
463
+ "error": "Failed to extract content from URL",
464
+ "chunks_processed": 0,
465
+ }
466
+
467
+ # Step 2: Extract and chunk text
468
+ text_chunks = self.text_extractor.process_text(
469
+ url_result["content"], url_result.get("metadata", {})
470
+ )
471
+
472
+ if not text_chunks:
473
+ return {
474
+ "status": "error",
475
+ "error": "No text chunks generated",
476
+ "chunks_processed": 0,
477
+ }
478
+
479
+ # Step 3: Generate embeddings
480
+ embedded_chunks = self.embedding_generator.generate_embeddings(text_chunks)
481
+
482
+ if not embedded_chunks:
483
+ return {
484
+ "status": "error",
485
+ "error": "Failed to generate embeddings",
486
+ "chunks_processed": len(text_chunks),
487
+ }
488
+
489
+ # Step 4: Store in vector database
490
+ storage_success = self.vector_db.store_embeddings(embedded_chunks)
491
+
492
+ # Process linked documents if any
493
+ linked_processed = 0
494
+ for linked_doc in url_result.get("linked_documents", []):
495
+ if linked_doc and "content" in linked_doc:
496
+ try:
497
+ linked_chunks = self.text_extractor.process_text(
498
+ linked_doc["content"], linked_doc.get("metadata", {})
499
+ )
500
+ if linked_chunks:
501
+ linked_embedded = (
502
+ self.embedding_generator.generate_embeddings(
503
+ linked_chunks
504
+ )
505
+ )
506
+ if linked_embedded and self.vector_db.store_embeddings(
507
+ linked_embedded
508
+ ):
509
+ linked_processed += 1
510
+ except Exception as e:
511
+ self.logger.warning(f"Failed to process linked document: {e}")
512
+
513
+ return {
514
+ "status": "success" if storage_success else "partial_success",
515
+ "chunks_processed": len(text_chunks),
516
+ "chunks_stored": len(embedded_chunks) if storage_success else 0,
517
+ "linked_documents_processed": linked_processed,
518
+ "source": url,
519
+ }
520
+
521
+ except Exception as e:
522
+ self.logger.error(f" Error processing URL: {str(e)}")
523
+ error_info = self.error_handler.handle_error(e, {"url": url})
524
+ return {
525
+ "status": "error",
526
+ "error": str(e),
527
+ "error_info": error_info,
528
+ "chunks_processed": 0,
529
+ }
530
+
531
+ def query(
532
+ self,
533
+ question: str,
534
+ max_results: int = 5,
535
+ use_live_search: bool = False,
536
+ search_mode: str = "auto",
537
+ ) -> dict:
538
+ """
539
+ Process a query and generate a response with enhanced search control.
540
+
541
+ Args:
542
+ question: User question
543
+ max_results: Maximum number of results to retrieve
544
+ use_live_search: Whether to enable live web search (uses hybrid approach)
545
+ search_mode: Search mode - "auto", "local_only", "live_only", "hybrid"
546
+
547
+ Returns:
548
+ Dictionary with response and metadata
549
+ """
550
+ try:
551
+ self.logger.info(
552
+ f"Processing query: {question[:100]}... (live_search: {use_live_search})"
553
+ )
554
+
555
+ # Check if components are available
556
+ if not all(
557
+ hasattr(self, attr)
558
+ for attr in ["query_processor", "response_generator"]
559
+ ):
560
+ return {
561
+ "query": question,
562
+ "response": "Query processing components not available. Please check system configuration.",
563
+ "sources": [],
564
+ "confidence": 0.0,
565
+ "error": "Components not available",
566
+ }
567
+
568
+ # Use Query Router for intelligent routing if available
569
+ if hasattr(self, "query_router") and (
570
+ use_live_search or search_mode != "auto"
571
+ ):
572
+ self.logger.info(f" Using Query Router with mode: {search_mode}")
573
+
574
+ search_options = {"search_depth": "basic", "time_range": "month"}
575
+
576
+ router_result = self.query_router.route_query(
577
+ question,
578
+ use_live_search=use_live_search,
579
+ max_results=max_results,
580
+ search_options=search_options,
581
+ search_mode=search_mode,
582
+ )
583
+
584
+ # Convert router result to standard format
585
+ if router_result.get("results"):
586
+ # Format sources from router results
587
+ sources = []
588
+ for result in router_result["results"]:
589
+ sources.append(
590
+ {
591
+ "title": result.get("title", ""),
592
+ "source": result.get("source", ""),
593
+ "content": result.get("content", ""),
594
+ "score": result.get("score", 0.0),
595
+ "type": result.get("type", "unknown"),
596
+ }
597
+ )
598
+
599
+ # Generate response using response generator
600
+ context_items = []
601
+ for result in router_result["results"]:
602
+ context_items.append(
603
+ {
604
+ "text": result.get("content", ""),
605
+ "source": result.get("source", ""),
606
+ "score": result.get("score", 0.0),
607
+ "metadata": result.get("metadata", {}),
608
+ }
609
+ )
610
+
611
+ response_result = self.response_generator.generate_response(
612
+ question, context_items
613
+ )
614
+
615
+ return {
616
+ "query": question,
617
+ "response": response_result.get(
618
+ "response", "No response generated"
619
+ ),
620
+ "sources": sources,
621
+ "confidence": response_result.get("confidence", 0.0),
622
+ "context_items": len(context_items),
623
+ "processing_time": router_result.get("processing_time", 0),
624
+ "generation_time": response_result.get("generation_time", 0),
625
+ "model_used": response_result.get("model_used", "unknown"),
626
+ "routing_decision": router_result.get(
627
+ "routing_decision", "unknown"
628
+ ),
629
+ "search_type": "routed_search",
630
+ }
631
+ else:
632
+ # Fallback to local search if router fails
633
+ self.logger.warning(
634
+ "Router returned no results, falling back to local search"
635
+ )
636
+
637
+ # Traditional local search path
638
+ # Step 1: Process query and retrieve context with max_results
639
+ # Update query processor config temporarily
640
+ original_top_k = self.query_processor.top_k
641
+ self.query_processor.top_k = max_results
642
+
643
+ query_result = self.query_processor.process_query(question)
644
+
645
+ # Restore original top_k
646
+ self.query_processor.top_k = original_top_k
647
+
648
+ if query_result.get("error"):
649
+ return {
650
+ "query": question,
651
+ "response": f"Query processing failed: {query_result['error']}",
652
+ "sources": [],
653
+ "confidence": 0.0,
654
+ "error": query_result["error"],
655
+ }
656
+
657
+ # Step 2: Generate response
658
+ response_result = self.response_generator.generate_response(
659
+ question, query_result.get("context", [])
660
+ )
661
+
662
+ # Combine results
663
+ return {
664
+ "query": question,
665
+ "response": response_result.get("response", "No response generated"),
666
+ "sources": response_result.get("sources", []),
667
+ "confidence": response_result.get("confidence", 0.0),
668
+ "context_items": query_result.get("total_results", 0),
669
+ "processing_time": query_result.get("processing_time", 0),
670
+ "generation_time": response_result.get("generation_time", 0),
671
+ "model_used": response_result.get("model_used", "unknown"),
672
+ "search_type": "local_search",
673
+ }
674
+
675
+ except Exception as e:
676
+ self.logger.error(f"Error processing query: {str(e)}")
677
+ error_info = self.error_handler.handle_error(e, {"query": question})
678
+ return {
679
+ "query": question,
680
+ "response": "I encountered an error while processing your question. Please try again.",
681
+ "sources": [],
682
+ "confidence": 0.0,
683
+ "error": str(e),
684
+ "error_info": error_info,
685
+ }
686
+
687
+ def get_system_status(self) -> dict:
688
+ """
689
+ Get comprehensive system status.
690
+
691
+ Returns:
692
+ Dictionary with system status information
693
+ """
694
+ try:
695
+ status = {
696
+ "overall_status": "healthy",
697
+ "components": {},
698
+ "configuration": {},
699
+ "environment": {},
700
+ }
701
+
702
+ # Check component status
703
+ components = [
704
+ "document_processor",
705
+ "url_processor",
706
+ "text_extractor",
707
+ "embedding_generator",
708
+ "vector_db",
709
+ "query_processor",
710
+ "response_generator",
711
+ ]
712
+
713
+ for component in components:
714
+ status["components"][component] = hasattr(self, component)
715
+
716
+ # Configuration info
717
+ status["configuration"] = {
718
+ "embedding_model": self.config.get("embedding", {}).get(
719
+ "model", "unknown"
720
+ ),
721
+ "vector_db_provider": self.config.get("vector_db", {}).get(
722
+ "provider", "unknown"
723
+ ),
724
+ "rag_top_k": self.config.get("rag", {}).get("top_k", 5),
725
+ }
726
+
727
+ # Environment info
728
+ status["environment"] = {
729
+ "gemini_api_available": bool(os.getenv("GEMINI_API_KEY")),
730
+ "pinecone_api_available": bool(os.getenv("PINECONE_API_KEY")),
731
+ "openai_api_available": bool(os.getenv("OPENAI_API_KEY")),
732
+ }
733
+
734
+ # Overall status
735
+ healthy_components = sum(status["components"].values())
736
+ total_components = len(status["components"])
737
+
738
+ if healthy_components < total_components * 0.8:
739
+ status["overall_status"] = "degraded"
740
+ elif healthy_components < total_components * 0.5:
741
+ status["overall_status"] = "unhealthy"
742
+
743
+ return status
744
+
745
+ except Exception as e:
746
+ self.logger.error(f" Error getting system status: {e}")
747
+ return {"overall_status": "error", "error": str(e)}
748
+
749
+
750
+ def create_app():
751
+ """
752
+ Create and configure the RAG application.
753
+
754
+ Returns:
755
+ Tuple of (RAG system instance, Gradio app instance)
756
+ """
757
+ try:
758
+ # Initialize the RAG system
759
+ rag_system = RAGSystem()
760
+
761
+ # Create Gradio interface
762
+ ui_config = rag_system.config_manager.get_section("ui")
763
+ gradio_app = GradioApp(rag_system, ui_config)
764
+
765
+ return rag_system, gradio_app
766
+
767
+ except Exception as e:
768
+ print(f" Failed to create application: {str(e)}")
769
+ # Create a minimal system for demo purposes
770
+ print("Creating minimal demo system...")
771
+
772
+ # Create minimal config
773
+ minimal_config = {
774
+ "ui": {
775
+ "title": "AI Embedded Knowledge Agent (Demo Mode)",
776
+ "description": "Demo mode - some features may be limited. Please configure API keys for full functionality.",
777
+ }
778
+ }
779
+
780
+ # Create minimal RAG system
781
+ class MinimalRAGSystem:
782
+ def __init__(self):
783
+ self.config_manager = type(
784
+ "ConfigManager",
785
+ (),
786
+ {
787
+ "get_section": lambda self, section: minimal_config.get(
788
+ section, {}
789
+ )
790
+ },
791
+ )()
792
+
793
+ def process_document(self, file_path):
794
+ return {
795
+ "status": "error",
796
+ "error": "Demo mode - document processing not available",
797
+ }
798
+
799
+ def process_url(self, url):
800
+ return {
801
+ "status": "error",
802
+ "error": "Demo mode - URL processing not available",
803
+ }
804
+
805
+ def query(self, question):
806
+ return {
807
+ "query": question,
808
+ "response": "Demo mode: Please configure your API keys (GEMINI_API_KEY, PINECONE_API_KEY) to enable full functionality.",
809
+ "sources": [],
810
+ "confidence": 0.0,
811
+ }
812
+
813
+ rag_system = MinimalRAGSystem()
814
+ gradio_app = GradioApp(rag_system, minimal_config.get("ui", {}))
815
+
816
+ return rag_system, gradio_app
817
+
818
+
819
+ def main():
820
+ """Main function to run the application."""
821
+ try:
822
+ print("Starting AI Embedded Knowledge Agent...")
823
+ print("=" * 50)
824
+
825
+ # Create the application
826
+ rag_system, gradio_app = create_app()
827
+
828
+ # Get launch configuration
829
+ try:
830
+ ui_config = rag_system.config_manager.get_section("ui")
831
+ except:
832
+ ui_config = {}
833
+
834
+ # Launch the Gradio interface
835
+ base_port = ui_config.get("port", 7860)
836
+ launch_config = {
837
+ "server_name": ui_config.get("server_name", "0.0.0.0"),
838
+ "server_port": base_port,
839
+ "share": ui_config.get("share", False),
840
+ "show_error": True,
841
+ "quiet": False,
842
+ }
843
+
844
+ # Try different ports if the default is in use
845
+ for port_offset in range(10): # Try ports 7860-7869
846
+ try:
847
+ current_port = base_port + port_offset
848
+ launch_config["server_port"] = current_port
849
+
850
+ print(
851
+ f"Launching interface on {launch_config['server_name']}:{current_port}"
852
+ )
853
+ print("=" * 50)
854
+
855
+ gradio_app.launch(**launch_config)
856
+ break # If successful, break out of the loop
857
+
858
+ except Exception as e:
859
+ if (
860
+ "bind" in str(e).lower()
861
+ or "address already in use" in str(e).lower()
862
+ ):
863
+ print(f"Port {current_port} is in use, trying next port...")
864
+ continue
865
+ else:
866
+ # If it's a different error, re-raise it
867
+ raise e
868
+ else:
869
+ # If we've tried all ports without success
870
+ print(
871
+ "Could not find an available port. Please close other applications using ports 7860-7869."
872
+ )
873
+ raise Exception("No available ports found")
874
+
875
+ except KeyboardInterrupt:
876
+ print("\n👋 Shutting down gracefully...")
877
+ except Exception as e:
878
+ print(f" Failed to start application: {str(e)}")
879
+ print("Please check your configuration and API keys.")
880
+ sys.exit(1)
881
+
882
+
883
+ if __name__ == "__main__":
884
+ main()
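For orientation, a short sketch of how the `RAGSystem` class above could be driven programmatically, outside the Gradio UI. This is hypothetical usage based only on the method signatures shown in this commit; the file path and question are placeholders, and the required API keys are assumed to be set in the environment:

```python
from app import RAGSystem  # importing app also triggers the NLTK/spaCy downloads above

rag = RAGSystem()  # loads config, initializes components, runs health checks

ingest = rag.process_document("data/sample_documents/example.pdf")  # placeholder path
print(ingest["status"], ingest.get("chunks_processed"))

answer = rag.query(
    "What are the key findings in the uploaded document?",
    max_results=5,
    use_live_search=False,
)
print(answer["response"])
for src in answer.get("sources", []):
    print("-", src.get("source"))
```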
config/config.yaml ADDED
@@ -0,0 +1,269 @@
1
+ api_keys:
2
+ gemini_api_key: ""
3
+ openai_api_key: ""
4
+ pinecone_api_key: ""
5
+ backup:
6
+ enabled: false
7
+ include_configuration: true
8
+ include_documents: true
9
+ include_logs: false
10
+ include_vector_db: true
11
+ interval_hours: 24
12
+ retention_days: 30
13
+ storage_path: backups/
14
+ customization:
15
+ custom_css: ""
16
+ default_query_examples:
17
+ - What is the main topic of the uploaded documents?
18
+ - Can you summarize the key points?
19
+ - What are the important findings mentioned?
20
+ favicon_url: ""
21
+ footer_text: ""
22
+ help_text: ""
23
+ logo_url: ""
24
+ welcome_message: ""
25
+ deployment:
26
+ auto_scale: true
27
+ development:
28
+ debug_mode: true
29
+ enable_profiling: true
30
+ log_level: DEBUG
31
+ enable_metrics: true
32
+ graceful_shutdown_timeout: 30
33
+ health_check_interval: 60
34
+ health_endpoint: /health
35
+ max_cpu_percent: 80
36
+ max_disk_usage_mb: 5120
37
+ max_memory_mb: 2048
38
+ metrics_endpoint: /metrics
39
+ platform: huggingface
40
+ production:
41
+ debug_mode: false
42
+ enable_profiling: false
43
+ log_level: WARNING
44
+ staging:
45
+ debug_mode: true
46
+ enable_profiling: true
47
+ log_level: INFO
48
+ development:
49
+ debug_mode: false
50
+ enable_test_endpoints: false
51
+ mock_apis: false
52
+ profiling_enabled: false
53
+ save_intermediate_results: false
54
+ test_data_path: data/test_data
55
+ test_mode: false
56
+ document_processing:
57
+ chunk_overlap: 200
58
+ chunk_size: 1000
59
+ detect_language: true
60
+ extract_images: false
61
+ extract_metadata: true
62
+ max_file_size_mb: 50
63
+ min_chunk_size: 100
64
+ preserve_formatting: true
65
+ supported_formats:
66
+ - .pdf
67
+ - .docx
68
+ - .doc
69
+ - .csv
70
+ - .xlsx
71
+ - .xls
72
+ - .pptx
73
+ - .txt
74
+ - .md
75
+ supported_languages:
76
+ - en
77
+ - es
78
+ - fr
79
+ - de
80
+ - it
81
+ - pt
82
+ - ru
83
+ - zh
84
+ - ja
85
+ - ko
86
+ embedding:
87
+ batch_size: 1
88
+ cache_embeddings: true
89
+ fallback_model: sentence-transformers
90
+ max_retries: 3
91
+ max_tokens: 8192
92
+ model: gemini-embedding-exp-03-07
93
+ output_dimensionality: 3072
94
+ rate_limit_delay: 1.0
95
+ retry_delay: 2
96
+ task_type: RETRIEVAL_DOCUMENT
97
+ title: ""
98
+ features:
99
+ async_processing: false
100
+ audio_processing: false
101
+ auto_summarization: false
102
+ batch_processing: true
103
+ content_recommendation: false
104
+ document_upload: true
105
+ image_processing: false
106
+ live_search: true
107
+ multi_language_support: false
108
+ query_processing: true
109
+ question_generation: false
110
+ real_time_updates: false
111
+ url_processing: true
112
+ video_processing: false
113
+ integrations:
114
+ aws_s3:
115
+ access_key: ""
116
+ bucket_name: ""
117
+ enabled: false
118
+ secret_key: ""
119
+ google_analytics:
120
+ enabled: false
121
+ tracking_id: ""
122
+ huggingface:
123
+ api_key: ""
124
+ enabled: false
125
+ models: []
126
+ postgresql:
127
+ connection_string: ""
128
+ enabled: false
129
+ sentry:
130
+ dsn: ""
131
+ enabled: false
132
+ logging:
133
+ backup_count: 5
134
+ component_levels:
135
+ document_processing: INFO
136
+ embedding: INFO
137
+ rag: INFO
138
+ ui: INFO
139
+ url_processing: INFO
140
+ vector_db: INFO
141
+ file: logs/rag_ai.log
142
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
143
+ level: INFO
144
+ max_file_size_mb: 10
145
+ notifications:
146
+ email:
147
+ enabled: false
148
+ from_address: ""
149
+ password: ""
150
+ smtp_port: 587
151
+ smtp_server: ""
152
+ to_addresses: []
153
+ username: ""
154
+ enabled: false
155
+ webhook:
156
+ enabled: false
157
+ events:
158
+ - error
159
+ - system_health
160
+ - processing_complete
161
+ url: ""
162
+ performance:
163
+ batch_processing_size: 10
164
+ cache_ttl: 3600
165
+ enable_caching: true
166
+ enable_parallel_processing: true
167
+ garbage_collection_interval: 300
168
+ max_concurrent_requests: 5
169
+ max_memory_usage_mb: 1024
170
+ max_worker_threads: 4
171
+ request_timeout: 30
172
+ rag:
173
+ confidence_threshold: 0.3
174
+ context_window_overlap: 0.1
175
+ deduplicate_results: true
176
+ enable_query_caching: true
177
+ enable_query_expansion: true
178
+ fallback_model: gpt-3.5-turbo
179
+ include_sources: true
180
+ max_context_length: 8000
181
+ max_response_length: 2000
182
+ max_tokens: 500
183
+ model: gemini-2.5-flash-preview-05-20
184
+ query_cache_ttl: 7200
185
+ rerank_results: true
186
+ similarity_threshold: 0.4
187
+ temperature: 0.7
188
+ top_k: 10
189
+ top_p: 0.9
190
+ live_search:
191
+ enabled: true
192
+ enable_caching: true
193
+ include_raw_content: true
194
+ max_results: 10
195
+ search_depth: basic
196
+ time_range: month
197
+ query_router:
198
+ confidence_threshold: 0.5
199
+ enable_hybrid_search: true
200
+ live_weight: 0.4
201
+ local_weight: 0.6
202
+ max_hybrid_results: 10
203
+ security:
204
+ allowed_domains: []
205
+ blocked_content_types:
206
+ - executable
207
+ - script
208
+ blocked_domains:
209
+ - localhost
210
+ - 127.0.0.1
211
+ - 0.0.0.0
212
+ enable_content_filtering: true
213
+ enable_rate_limiting: true
214
+ max_text_length: 1000000
215
+ max_upload_size_mb: 100
216
+ requests_per_hour: 1000
217
+ requests_per_minute: 60
218
+ sanitize_input: true
219
+ ui:
220
+ demo_mode: false
221
+ description:
222
+ Upload documents or provide URLs to build your knowledge base, then
223
+ ask questions!
224
+ features:
225
+ analytics_dashboard: true
226
+ confidence_display: true
227
+ file_upload: true
228
+ knowledge_base_management: true
229
+ query_interface: true
230
+ source_display: true
231
+ system_health_monitoring: true
232
+ url_input: true
233
+ max_file_uploads: 10
234
+ max_query_length: 1000
235
+ port: 7860
236
+ sample_documents: []
237
+ server_name: 0.0.0.0
238
+ share: false
239
+ show_advanced_options: true
240
+ theme: default
241
+ title: "\xF0\u0178\xA7\_ AI Embedded Knowledge Agent"
242
+ url_processing:
243
+ allowed_domains: []
244
+ blocked_domains:
245
+ - localhost
246
+ - 127.0.0.1
247
+ - 0.0.0.0
248
+ delay_between_requests: 0.5
249
+ extract_main_content: true
250
+ follow_links: true
251
+ max_depth: 1
252
+ max_pages: 10
253
+ remove_ads: true
254
+ remove_navigation: true
255
+ requests_per_second: 2
256
+ respect_robots_txt: true
257
+ timeout: 10
258
+ user_agent: RAG-AI-Bot/1.0
259
+ vector_db:
260
+ batch_size: 100
261
+ create_index_if_not_exists: true
262
+ dimension: 3072
263
+ environment: us-east-1
264
+ fallback_provider: memory
265
+ index_name: rag-ai-index
266
+ max_retries: 3
267
+ metric: cosine
268
+ provider: pinecone
269
+ retry_delay: 1
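The `ConfigManager` that `app.py` uses to read this file is not shown in this commit, but a minimal sketch of what a section lookup presumably amounts to, using PyYAML, is:

```python
import yaml

with open("config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

rag_cfg = config.get("rag", {})
print(rag_cfg.get("model"), rag_cfg.get("top_k"))      # gemini-2.5-flash-preview-05-20 10
print(config.get("vector_db", {}).get("index_name"))   # rag-ai-index
```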
docs/architecture.md ADDED
@@ -0,0 +1,291 @@
1
+ # AI Embedded Knowledge Agent - Architecture Document
2
+
3
+ ## 1. System Overview
4
+
5
+ The AI Embedded Knowledge Agent is a versatile knowledge management system designed to ingest, process, and retrieve information from various document types and web sources. Built for a hackathon and deployable on Hugging Face, this system enables users to upload documents or provide URLs, which are then processed, embedded, and stored for intelligent retrieval.
6
+
7
+ ```mermaid
8
+ graph TD
9
+ A[User Interface - Gradio] --> B[Document Processor]
10
+ A --> C[URL Processor]
11
+ B --> D[Text Extractor]
12
+ C --> D
13
+ D --> E[Embedding Generator - Gemini]
14
+ E --> F[Vector Database - Pinecone]
15
+ A --> G[Query Processor]
16
+ G --> E
17
+ G --> F
18
+ G --> H[Response Generator - LangChain RAG]
19
+ H --> A
20
+ ```
21
+
22
+ ## 2. Core Components
23
+
24
+ ### 2.1 Document Ingestion System
25
+
26
+ This component handles the intake of various document formats and web content.
27
+
28
+ #### Document Processor
29
+
30
+ - **Responsibility**: Process uploaded documents (PDF, DOCX, CSV, PPTX, Excel)
31
+ - **Technologies**: PyMuPDF, python-docx, pandas, python-pptx, pdfplumber
32
+ - **Input**: Raw document files
33
+ - **Output**: Extracted text content
34
+
35
+ #### URL Processor
36
+
37
+ - **Responsibility**: Crawl and extract content from provided URLs, including nested documents and links
38
+ - **Technologies**: BeautifulSoup, requests, trafilatura
39
+ - **Input**: URLs
40
+ - **Output**: Extracted text content from web pages and linked documents
41
+
42
+ ### 2.2 Knowledge Processing System
43
+
44
+ This component transforms raw text into queryable knowledge.
45
+
46
+ #### Text Extractor
47
+
48
+ - **Responsibility**: Clean, normalize, and chunk text from various sources
49
+ - **Technologies**: NLTK, spaCy, regex
50
+ - **Input**: Raw text from documents and web pages
51
+ - **Output**: Cleaned, normalized text chunks ready for embedding
52
+
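A minimal sketch of the chunking step described above, using the `chunk_size: 1000` / `chunk_overlap: 200` defaults from `config/config.yaml`. This is illustrative only; the actual `text_extractor.py` (not shown here) also cleans and sentence-splits the text with NLTK/spaCy:

```python
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """Split text into overlapping character windows."""
    step = chunk_size - chunk_overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
```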
53
+ #### Embedding Generator
54
+
55
+ - **Responsibility**: Generate vector embeddings for text chunks
56
+ - **Technology**: Gemini Embedding v3 (gemini-embedding-exp-03-07)
57
+ - **Input**: Processed text chunks
58
+ - **Output**: Vector embeddings
59
+
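A hedged sketch of what the embedding call looks like with the `google-generativeai` SDK, using the model and task types named in `config/config.yaml` (the actual `embedding_generator.py` may differ; the model name is prefixed with `models/` as the SDK usually expects):

```python
import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Documents and queries use different task types
# (RETRIEVAL_DOCUMENT at ingestion time, RETRIEVAL_QUERY at query time).
doc_vec = genai.embed_content(
    model="models/gemini-embedding-exp-03-07",
    content="Text chunk produced by the extractor.",
    task_type="RETRIEVAL_DOCUMENT",
)["embedding"]

query_vec = genai.embed_content(
    model="models/gemini-embedding-exp-03-07",
    content="What does the document say about X?",
    task_type="RETRIEVAL_QUERY",
)["embedding"]

print(len(doc_vec))  # expected to match the 3072-dim vector_db setting
```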
60
+ ### 2.3 Knowledge Storage System
61
+
62
+ This component manages the storage and retrieval of vector embeddings.
63
+
64
+ #### Vector Database
65
+
66
+ - **Responsibility**: Store and index vector embeddings for efficient retrieval
67
+ - **Technology**: Pinecone
68
+ - **Input**: Vector embeddings with metadata
69
+ - **Output**: Retrieved relevant vectors based on similarity
70
+
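A minimal sketch of storing and querying vectors with the v3-style Pinecone client, using the `rag-ai-index` name from the config. This is illustrative; `vector_db.py` is not shown in this commit, and the placeholder embedding stands in for a real 3072-dimensional Gemini vector:

```python
import os
from pinecone import Pinecone

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("rag-ai-index")  # index name from config/config.yaml

embedding = [0.0] * 3072  # placeholder; a real Gemini embedding goes here

# Store one embedded chunk together with its source metadata.
index.upsert(vectors=[{
    "id": "doc-1-chunk-0",
    "values": embedding,
    "metadata": {"source": "example.pdf", "text": "Text of the chunk..."},
}])

# Retrieve the most similar chunks for a query embedding.
results = index.query(vector=embedding, top_k=10, include_metadata=True)
for match in results.matches:
    print(match.score, match.metadata.get("source"))
```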
71
+ ### 2.4 Query Processing System
72
+
73
+ This component handles user queries and generates responses.
74
+
75
+ #### Query Processor
76
+
77
+ - **Responsibility**: Process user queries and convert them to vector embeddings
78
+ - **Technologies**: Gemini Embedding v3, LangChain
79
+ - **Input**: User queries
80
+ - **Output**: Query vector embeddings
81
+
82
+ #### Response Generator
83
+
84
+ - **Responsibility**: Generate coherent responses based on retrieved knowledge
85
+ - **Technology**: LangChain RAG (Retrieval Augmented Generation)
86
+ - **Input**: Retrieved relevant text chunks
87
+ - **Output**: Natural language responses
88
+
89
+ ### 2.5 User Interface System
90
+
91
+ This component provides the user-facing interface.
92
+
93
+ #### Gradio UI
94
+
95
+ - **Responsibility**: Provide intuitive interface for document upload, URL input, and querying
96
+ - **Technology**: Gradio
97
+ - **Features**:
98
+ - Document upload area
99
+ - URL input field
100
+ - Query input and response display
101
+ - System status indicators
102
+
103
+ ## 3. Data Flow
104
+
105
+ ```mermaid
106
+ sequenceDiagram
107
+ participant User
108
+ participant UI as Gradio UI
109
+ participant DP as Document Processor
110
+ participant UP as URL Processor
111
+ participant TE as Text Extractor
112
+ participant EG as Embedding Generator
113
+ participant VDB as Vector Database
114
+ participant QP as Query Processor
115
+ participant RG as Response Generator
116
+
117
+ %% Document Upload Flow
118
+ User->>UI: Upload Document
119
+ UI->>DP: Process Document
120
+ DP->>TE: Extract Text
121
+ TE->>EG: Generate Embeddings
122
+ EG->>VDB: Store Embeddings
123
+
124
+ %% URL Processing Flow
125
+ User->>UI: Input URL
126
+ UI->>UP: Process URL
127
+ UP->>TE: Extract Text
128
+ TE->>EG: Generate Embeddings
129
+ EG->>VDB: Store Embeddings
130
+
131
+ %% Query Flow
132
+ User->>UI: Submit Query
133
+ UI->>QP: Process Query
134
+ QP->>EG: Generate Query Embedding
135
+ QP->>VDB: Retrieve Relevant Embeddings
136
+ VDB->>QP: Return Relevant Chunks
137
+ QP->>RG: Generate Response
138
+ RG->>UI: Display Response
139
+ UI->>User: Show Answer
140
+ ```
141
+
142
+ ## 4. Technical Architecture
143
+
144
+ ### 4.1 Technology Stack
145
+
146
+ | Component | Technology | Purpose |
147
+ | ------------------ | ----------------------------------------------------- | ------------------------------------------ |
148
+ | Document Parsing | PyMuPDF, python-docx, pandas, python-pptx, pdfplumber | Extract text from various document formats |
149
+ | Web Scraping | BeautifulSoup, requests, trafilatura | Extract content from web pages |
150
+ | Text Processing | NLTK, spaCy, regex | Clean and chunk text |
151
+ | Embedding | Gemini Embedding v3 (gemini-embedding-exp-03-07) | Generate vector embeddings |
152
+ | Vector Storage | Pinecone | Store and retrieve vector embeddings |
153
+ | RAG Implementation | LangChain | Implement retrieval augmented generation |
154
+ | User Interface | Gradio | Provide user-friendly interface |
155
+
156
+ ### 4.2 Integration Points
157
+
158
+ - **Document Processing → Text Extraction**: Raw text extraction from documents
159
+ - **URL Processing → Text Extraction**: Raw text extraction from web pages
160
+ - **Text Extraction → Embedding Generation**: Processed text chunks for embedding
161
+ - **Embedding Generation → Vector Database**: Storage of embeddings
162
+ - **Query Processing → Embedding Generation**: Query embedding generation
163
+ - **Query Processing → Vector Database**: Retrieval of relevant embeddings
164
+ - **Query Processing → Response Generation**: Generation of coherent responses
165
+ - **Response Generation → UI**: Display of responses to user
166
+
167
+ ## 5. Deployment Architecture
168
+
169
+ The system is designed to be deployed on Hugging Face using their Spaces feature, which supports Gradio applications.
170
+
171
+ ```mermaid
172
+ graph TD
173
+ A[User] --> B[Hugging Face Space]
174
+ B --> C[Gradio Application]
175
+ C --> D[Document Processing]
176
+ C --> E[URL Processing]
177
+ C --> F[Query Processing]
178
+ D --> G[Gemini API]
179
+ E --> G
180
+ F --> G
181
+ D --> H[Pinecone API]
182
+ E --> H
183
+ F --> H
184
+ ```
185
+
186
+ ### 5.1 Deployment Considerations
187
+
188
+ - **API Keys**: Secure storage of Gemini and Pinecone API keys
189
+ - **Rate Limiting**: Handling API rate limits for both Gemini and Pinecone
190
+ - **Memory Management**: Efficient memory usage within Hugging Face constraints
191
+ - **Statelessness**: Designing components to be stateless where possible
192
+ - **Error Handling**: Robust error handling for API failures and timeouts
193
+
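+ For the API-key consideration above, components read their keys from environment variables rather than from code, which maps directly onto Hugging Face Spaces secrets. A minimal sketch follows; GEMINI_API_KEY matches the name used by the embedding generator, while PINECONE_API_KEY is the assumed counterpart for Pinecone.
+
+ ```python
+ import os
+
+ # Keys are injected as Space secrets; never commit them to the repository.
+ gemini_key = os.environ.get("GEMINI_API_KEY")
+ pinecone_key = os.environ.get("PINECONE_API_KEY")
+
+ if not gemini_key or not pinecone_key:
+     raise RuntimeError("Set GEMINI_API_KEY and PINECONE_API_KEY as Space secrets")
+ ```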
194
+ ## 6. Scalability and Performance
195
+
196
+ For the hackathon version, the focus is on functionality rather than scalability. However, the architecture is designed with the following considerations in mind:
197
+
198
+ - **Document Size Limits**: Implement reasonable limits on document sizes
199
+ - **Chunking Strategy**: Optimize text chunking for better retrieval performance
200
+ - **Caching**: Implement basic caching for frequently accessed embeddings
201
+ - **Asynchronous Processing**: Use asynchronous processing where appropriate
202
+
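+ To make the chunking consideration above concrete, a simple fixed-size chunker with overlap is sketched below. The sizes are placeholders rather than tuned values; the actual chunking logic lives in the text extractor.
+
+ ```python
+ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list:
+     """Split text into overlapping chunks so retrieval keeps local context."""
+     chunks = []
+     start = 0
+     while start < len(text):
+         end = min(start + chunk_size, len(text))
+         chunks.append(text[start:end])
+         if end == len(text):
+             break
+         start = end - overlap  # overlap preserves continuity across chunk boundaries
+     return chunks
+ ```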
203
+ ## 7. Future Enhancements
204
+
205
+ While not implemented in the hackathon version, the architecture supports future enhancements:
206
+
207
+ - **Authentication**: User authentication and document access control
208
+ - **Document Versioning**: Track changes to documents over time
209
+ - **Advanced RAG Techniques**: Implement more sophisticated RAG approaches
210
+ - **Multi-Modal Support**: Add support for images and other non-text content
211
+ - **Collaborative Features**: Allow multiple users to collaborate on knowledge bases
212
+ - **Custom Training**: Fine-tune models for specific domains
213
+
214
+ ## 8. Folder Structure
215
+
216
+ ```
217
+ rag-ai/
218
+ ├── src/
219
+ │ ├── ingestion/
220
+ │ │ ├── document_processor.py
221
+ │ │ ├── url_processor.py
222
+ │ │ └── text_extractor.py
223
+ │ ├── embedding/
224
+ │ │ └── embedding_generator.py
225
+ │ ├── storage/
226
+ │ │ └── vector_db.py
227
+ │ ├── rag/
228
+ │ │ ├── query_processor.py
229
+ │ │ └── response_generator.py
230
+ │ ├── ui/
231
+ │ │ └── gradio_app.py
232
+ │ └── utils/
233
+ │ ├── config_manager.py
234
+ │ └── error_handler.py
235
+ ├── config/
236
+ │ └── config.yaml
237
+ ├── docs/
238
+ │ ├── architecture.md
239
+ │ └── api_documentation.md
240
+ ├── tests/
241
+ │ ├── test_document_processor.py
242
+ │ ├── test_url_processor.py
243
+ │ └── ...
244
+ ├── scripts/
245
+ │ ├── setup.py
246
+ │ └── deploy_to_huggingface.py
247
+ ├── data/
248
+ │ ├── sample_documents/
249
+ │ └── test_data/
250
+ ├── .gitignore
251
+ ├── requirements.txt
252
+ ├── README.md
253
+ └── app.py
254
+ ```
255
+
256
+ ## 9. Implementation Roadmap
257
+
258
+ 1. **Phase 1: Core Infrastructure**
259
+
260
+ - Set up project structure
261
+ - Implement basic document processing
262
+ - Set up Pinecone integration
263
+
264
+ 2. **Phase 2: Knowledge Processing**
265
+
266
+ - Implement text extraction and chunking
267
+ - Integrate Gemini embedding API
268
+ - Develop vector storage and retrieval
269
+
270
+ 3. **Phase 3: Query System**
271
+
272
+ - Implement query processing
273
+ - Develop RAG response generation
274
+ - Integrate components
275
+
276
+ 4. **Phase 4: User Interface**
277
+
278
+ - Develop Gradio UI
279
+ - Integrate UI with backend components
280
+ - Add error handling and user feedback
281
+
282
+ 5. **Phase 5: URL Processing**
283
+
284
+ - Implement URL crawling
285
+ - Add nested document extraction
286
+ - Integrate with existing components
287
+
288
+ 6. **Phase 6: Testing and Deployment**
289
+ - Comprehensive testing
290
+ - Optimization for Hugging Face deployment
291
+ - Documentation and demo preparation
requirements.txt ADDED
@@ -0,0 +1,103 @@
1
+ # Core Dependencies
2
+ gradio
3
+ pyyaml
4
+ python-dotenv
5
+
6
+ # Document Processing
7
+ PyPDF2
8
+ PyMuPDF
9
+ pdfplumber
10
+ python-docx
11
+ pandas
12
+ openpyxl
13
+ python-pptx
14
+
15
+ # Web Scraping and URL Processing
16
+ requests
17
+ beautifulsoup4
18
+ lxml
19
+ html2text
20
+ trafilatura
21
+
22
+ # Text Processing
23
+ nltk
24
+ spacy
25
+ textstat
26
+ langdetect
27
+
28
+ # Embedding and Vector Database
29
+ google-generativeai
30
+ pinecone
31
+ sentence-transformers
+ cachetools
32
+
33
+ # LangChain and LLM Integration
34
+ langchain
35
+ langchain-google-genai
36
+ langchain-openai
37
+ langchain-community
38
+ openai
39
+
40
+ # Live Search Integration
41
+ tavily-python
42
+
43
+ # Vector Operations and ML
44
+ numpy
45
+ scikit-learn
46
+ faiss-cpu
47
+
48
+ # Async and Performance
49
+ aiohttp
50
+ # asyncio is part of the Python standard library; installing the PyPI package is unnecessary
51
+
52
+ # Logging and Monitoring
53
+ structlog
54
+ prometheus-client
55
+
56
+ # Development and Testing
57
+ pytest
58
+ pytest-asyncio
59
+ black
60
+ flake8
61
+ mypy
62
+
63
+ # Optional Dependencies for Enhanced Features
64
+ # Uncomment if needed:
65
+
66
+ # Advanced NLP
67
+ # transformers
68
+ # torch
69
+
70
+ # Image Processing (if document images need processing)
71
+ # Pillow
72
+ # pytesseract
73
+
74
+ # Audio Processing (for future features)
75
+ # librosa
76
+ # soundfile
77
+
78
+ # Database Support
79
+ # psycopg2-binary
80
+ # sqlalchemy
81
+
82
+ # Cloud Storage
83
+ # boto3
84
+ # google-cloud-storage
85
+
86
+ # Monitoring and Analytics
87
+ # sentry-sdk
88
+
89
+ # Additional Text Processing
90
+ # langdetect
91
+ # polyglot
92
+
93
+ # Web Framework (if API endpoints needed)
94
+ # fastapi
95
+ # uvicorn
96
+
97
+ # Caching
98
+ # redis
99
+ # diskcache
100
+
101
+ # Configuration Management
102
+ # hydra-core
103
+ # omegaconf
src/embedding/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Embedding module for generating vector embeddings.
3
+
4
+ This module contains components for generating vector embeddings
5
+ from text chunks using Gemini Embedding v3.
6
+ """
src/embedding/embedding_generator.py ADDED
@@ -0,0 +1,462 @@
1
+ """
2
+ Embedding Generator Module
3
+
4
+ This module is responsible for generating vector embeddings for text chunks
5
+ using Gemini Embedding v3 with complete API integration.
6
+
7
+ Technology: Gemini Embedding v3 (gemini-embedding-exp-03-07)
8
+ """
9
+
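+ # Example usage (illustrative; the config keys below are the ones read in __init__):
+ #
+ #     generator = EmbeddingGenerator({"api_key": "...", "batch_size": 5})
+ #     embedded = generator.generate_embeddings([{"content": "some text", "metadata": {}}])
+ #     query_vector = generator.generate_query_embedding("What does this document cover?")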
10
+ import logging
11
+ import os
12
+ import time
13
+ import hashlib
14
+ from datetime import datetime, timedelta
15
+ from typing import Dict, List, Any, Optional, Union
16
+ import json
17
+
18
+ # Import Gemini API and caching libraries
19
+ try:
20
+ import google.generativeai as genai
21
+ from cachetools import TTLCache
22
+ except ImportError as e:
23
+ logging.warning(f"Some embedding libraries are not installed: {e}")
24
+
25
+ from utils.error_handler import EmbeddingError, error_handler, ErrorType
26
+
27
+
28
+ class EmbeddingGenerator:
29
+ """
30
+ Generates vector embeddings for text chunks using Gemini Embedding v3 with full functionality.
31
+
32
+ Features:
33
+ - Gemini Embedding v3 API integration
34
+ - Batch processing with rate limiting
35
+ - Intelligent retry logic with exponential backoff
36
+ - Embedding caching mechanism
37
+ - Cost optimization
38
+ """
39
+
40
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
41
+ """
42
+ Initialize the EmbeddingGenerator with configuration.
43
+
44
+ Args:
45
+ config: Configuration dictionary with API parameters
46
+ """
47
+ self.config = config or {}
48
+ self.logger = logging.getLogger(__name__)
49
+
50
+ # API Configuration
51
+ self.api_key = self.config.get("api_key", os.environ.get("GEMINI_API_KEY"))
52
+ self.model = self.config.get("model", "gemini-embedding-exp-03-07")
53
+ self.batch_size = self.config.get("batch_size", 5)
54
+ self.max_retries = self.config.get("max_retries", 3)
55
+ self.retry_delay = self.config.get("retry_delay", 1)
56
+
57
+ # Performance settings
58
+ self.rate_limit_delay = self.config.get("rate_limit_delay", 0.1)
59
+ self.max_text_length = self.config.get(
60
+ "max_text_length", 8192
61
+ )  # maximum input length in characters; longer texts are truncated
62
+ self.enable_caching = self.config.get("enable_caching", True)
63
+ self.cache_ttl = self.config.get("cache_ttl", 3600) # 1 hour
64
+
65
+ # Statistics tracking
66
+ self.stats = {
67
+ "total_requests": 0,
68
+ "successful_requests": 0,
69
+ "failed_requests": 0,
70
+ "cache_hits": 0,
71
+ "total_tokens_processed": 0,
72
+ "start_time": datetime.now(),
73
+ }
74
+
75
+ # Initialize cache
76
+ if self.enable_caching:
77
+ self.cache = TTLCache(maxsize=1000, ttl=self.cache_ttl)
78
+ else:
79
+ self.cache = None
80
+
81
+ # Validate and initialize API client
82
+ self._initialize_client()
83
+
84
+ def _initialize_client(self):
85
+ """Initialize Gemini API client with validation."""
86
+ if not self.api_key:
87
+ self.logger.warning(
88
+ "No Gemini API key provided. Embeddings will not be generated."
89
+ )
90
+ self.client = None
91
+ return
92
+
93
+ try:
94
+ # Configure Gemini API
95
+ genai.configure(api_key=self.api_key)
96
+
97
+ # Test API connection
98
+ self._test_api_connection()
99
+
100
+ self.client = genai
101
+ self.logger.info("Gemini API client initialized successfully")
102
+
103
+ except Exception as e:
104
+ self.logger.error(f"Failed to initialize Gemini API client: {str(e)}")
105
+ self.client = None
106
+
107
+ def _test_api_connection(self):
108
+ """Test API connection with a simple request."""
109
+ try:
110
+ # Test with a simple embedding request
111
+ test_result = genai.embed_content(
112
+ model=self.model,
113
+ content="test connection",
114
+ task_type="retrieval_document",
115
+ )
116
+
117
+ if not test_result.get("embedding"):
118
+ raise Exception("No embedding returned from test request")
119
+
120
+ self.logger.info("API connection test successful")
121
+
122
+ except Exception as e:
123
+ raise EmbeddingError(f"API connection test failed: {str(e)}")
124
+
125
+ @error_handler(ErrorType.EMBEDDING_GENERATION)
126
+ def generate_embeddings(self, texts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
127
+ """
128
+ Generate embeddings for a list of text chunks with full functionality.
129
+
130
+ Args:
131
+ texts: List of dictionaries containing text chunks and metadata
132
+ Each dict should have 'content' and 'metadata' keys
133
+
134
+ Returns:
135
+ List of dictionaries with original content, metadata, and embeddings
136
+ """
137
+ if not self.client or not texts:
138
+ self.logger.warning("No API client or empty text list")
139
+ return texts
140
+
141
+ self.logger.info(f"Generating embeddings for {len(texts)} text chunks")
142
+ start_time = time.time()
143
+
144
+ # Filter and validate texts
145
+ valid_texts = self._validate_texts(texts)
146
+ if not valid_texts:
147
+ self.logger.warning("No valid texts to process")
148
+ return texts
149
+
150
+ # Process in batches to respect API limits
151
+ results = []
152
+ total_batches = (len(valid_texts) + self.batch_size - 1) // self.batch_size
153
+
154
+ for i in range(0, len(valid_texts), self.batch_size):
155
+ batch_num = (i // self.batch_size) + 1
156
+ batch = valid_texts[i : i + self.batch_size]
157
+
158
+ self.logger.info(
159
+ f"Processing batch {batch_num}/{total_batches} ({len(batch)} items)"
160
+ )
161
+
162
+ try:
163
+ batch_results = self._process_batch(batch)
164
+ results.extend(batch_results)
165
+
166
+ # Rate limiting between batches
167
+ if i + self.batch_size < len(valid_texts):
168
+ time.sleep(self.rate_limit_delay)
169
+
170
+ except Exception as e:
171
+ self.logger.error(f"Batch {batch_num} failed: {str(e)}")
172
+ # Add original items without embeddings
173
+ for item in batch:
174
+ item_copy = item.copy()
175
+ item_copy["embedding"] = []
176
+ item_copy["embedding_error"] = str(e)
177
+ results.append(item_copy)
178
+
179
+ # Update statistics
180
+ processing_time = time.time() - start_time
181
+ self.logger.info(f"Embedding generation completed in {processing_time:.2f}s")
182
+
183
+ return results
184
+
185
+ def _validate_texts(self, texts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
186
+ """
187
+ Validate and filter text inputs.
188
+
189
+ Args:
190
+ texts: List of text dictionaries
191
+
192
+ Returns:
193
+ List of valid text dictionaries
194
+ """
195
+ valid_texts = []
196
+
197
+ for i, item in enumerate(texts):
198
+ if not isinstance(item, dict) or "content" not in item:
199
+ self.logger.warning(f"Invalid item at index {i}: missing 'content' key")
200
+ continue
201
+
202
+ content = item["content"]
203
+ if not content or not isinstance(content, str):
204
+ self.logger.warning(
205
+ f"Invalid content at index {i}: empty or non-string"
206
+ )
207
+ continue
208
+
209
+ # Truncate if too long
210
+ if len(content) > self.max_text_length:
211
+ self.logger.warning(
212
+ f"Truncating text at index {i}: {len(content)} -> {self.max_text_length} chars"
213
+ )
214
+ item = item.copy()
215
+ item["content"] = content[: self.max_text_length]
216
+ item["metadata"] = item.get("metadata", {})
217
+ item["metadata"]["truncated"] = True
218
+ item["metadata"]["original_length"] = len(content)
219
+
220
+ valid_texts.append(item)
221
+
222
+ return valid_texts
223
+
224
+ def _process_batch(self, batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
225
+ """
226
+ Process a batch of text chunks to generate embeddings.
227
+
228
+ Args:
229
+ batch: List of dictionaries containing text chunks and metadata
230
+
231
+ Returns:
232
+ List of dictionaries with original content, metadata, and embeddings
233
+ """
234
+ # Extract content and check cache
235
+ contents = []
236
+ cache_results = {}
237
+
238
+ for i, item in enumerate(batch):
239
+ content = item["content"]
240
+
241
+ # Check cache first
242
+ if self.cache is not None:
243
+ cache_key = self._get_cache_key(content)
244
+ if cache_key in self.cache:
245
+ cache_results[i] = self.cache[cache_key]
246
+ self.stats["cache_hits"] += 1
247
+ continue
248
+
249
+ contents.append((i, content))
250
+
251
+ # Generate embeddings for non-cached content
252
+ embeddings_map = {}
253
+ if contents:
254
+ content_texts = [content for _, content in contents]
255
+ embeddings = self._generate_with_retry(content_texts)
256
+
257
+ # Map embeddings back to indices
258
+ for j, (original_index, content) in enumerate(contents):
259
+ if j < len(embeddings):
260
+ embedding = embeddings[j]
261
+ embeddings_map[original_index] = embedding
262
+
263
+ # Cache the result
264
+ if self.cache is not None:
265
+ cache_key = self._get_cache_key(content)
266
+ self.cache[cache_key] = embedding
267
+
268
+ # 🔗 Combine results
269
+ results = []
270
+ for i, item in enumerate(batch):
271
+ result = item.copy()
272
+
273
+ # Add embedding from cache or new generation
274
+ if i in cache_results:
275
+ result["embedding"] = cache_results[i]
276
+ result["embedding_source"] = "cache"
277
+ elif i in embeddings_map:
278
+ result["embedding"] = embeddings_map[i]
279
+ result["embedding_source"] = "api"
280
+ else:
281
+ result["embedding"] = []
282
+ result["embedding_source"] = "failed"
283
+ self.logger.warning(f"Missing embedding for batch item {i}")
284
+
285
+ # Add embedding metadata
286
+ if result["embedding"]:
287
+ result["metadata"] = result.get("metadata", {})
288
+ result["metadata"].update(
289
+ {
290
+ "embedding_model": self.model,
291
+ "embedding_dimension": len(result["embedding"]),
292
+ "embedding_generated_at": datetime.now().isoformat(),
293
+ }
294
+ )
295
+
296
+ results.append(result)
297
+
298
+ return results
299
+
300
+ def _generate_with_retry(self, texts: List[str]) -> List[List[float]]:
301
+ """
302
+ Generate embeddings with intelligent retry logic.
303
+
304
+ Args:
305
+ texts: List of text strings to embed
306
+
307
+ Returns:
308
+ List of embedding vectors (each is a list of floats)
309
+ """
310
+ for attempt in range(self.max_retries):
311
+ try:
312
+ self.stats["total_requests"] += 1
313
+
314
+ # Generate embeddings using Gemini API
315
+ embeddings = []
316
+
317
+ for text in texts:
318
+ try:
319
+ # Track tokens
320
+ self.stats["total_tokens_processed"] += len(text.split())
321
+
322
+ # Call Gemini API
323
+ result = self.client.embed_content(
324
+ model=self.model,
325
+ content=text,
326
+ task_type="retrieval_document",
327
+ title="Document chunk for RAG system",
328
+ )
329
+
330
+ if result and "embedding" in result:
331
+ embeddings.append(result["embedding"])
332
+ else:
333
+ self.logger.warning(
334
+ f"No embedding in API response for text: {text[:50]}..."
335
+ )
336
+ embeddings.append([])
337
+
338
+ except Exception as e:
339
+ self.logger.warning(
340
+ f"Failed to embed individual text: {str(e)}"
341
+ )
342
+ embeddings.append([])
343
+
344
+ self.stats["successful_requests"] += 1
345
+ return embeddings
346
+
347
+ except Exception as e:
348
+ self.stats["failed_requests"] += 1
349
+ self.logger.warning(
350
+ f"Embedding generation failed (attempt {attempt+1}/{self.max_retries}): {str(e)}"
351
+ )
352
+
353
+ if attempt < self.max_retries - 1:
354
+ # Exponential backoff with jitter
355
+ delay = self.retry_delay * (2**attempt) + (time.time() % 1)
356
+ self.logger.info(f"Retrying in {delay:.1f} seconds...")
357
+ time.sleep(delay)
358
+
359
+ # All retries failed
360
+ self.logger.error("All embedding generation attempts failed")
361
+ return [[] for _ in texts]
362
+
363
+ @error_handler(ErrorType.EMBEDDING_GENERATION)
364
+ def generate_query_embedding(self, query: str) -> List[float]:
365
+ """
366
+ Generate embedding for a single query string.
367
+
368
+ Args:
369
+ query: Query text to embed
370
+
371
+ Returns:
372
+ Embedding vector as a list of floats
373
+ """
374
+ if not self.client or not query:
375
+ return []
376
+
377
+ self.logger.info(f"Generating embedding for query: {query[:50]}...")
378
+
379
+ # Check cache first
380
+ if self.cache is not None:
381
+ cache_key = self._get_cache_key(query, "query")
382
+ if cache_key in self.cache:
383
+ self.stats["cache_hits"] += 1
384
+ return self.cache[cache_key]
385
+
386
+ # Generate embedding
387
+ embeddings = self._generate_with_retry([query])
388
+ embedding = embeddings[0] if embeddings else []
389
+
390
+ # Cache the result
391
+ if embedding and self.cache is not None:
392
+ cache_key = self._get_cache_key(query, "query")
393
+ self.cache[cache_key] = embedding
394
+
395
+ return embedding
396
+
397
+ def _get_cache_key(self, text: str, prefix: str = "doc") -> str:
398
+ """
399
+ Generate cache key for text.
400
+
401
+ Args:
402
+ text: Text content
403
+ prefix: Key prefix
404
+
405
+ Returns:
406
+ Cache key string
407
+ """
408
+ # 🔐 Create hash of text + model for unique key
409
+ content_hash = hashlib.md5(f"{self.model}:{text}".encode()).hexdigest()
410
+ return f"{prefix}:{content_hash}"
411
+
412
+ def get_statistics(self) -> Dict[str, Any]:
413
+ """
414
+ Get embedding generation statistics.
415
+
416
+ Returns:
417
+ Dictionary with statistics
418
+ """
419
+ runtime = datetime.now() - self.stats["start_time"]
420
+
421
+ return {
422
+ **self.stats,
423
+ "runtime_seconds": runtime.total_seconds(),
424
+ "cache_hit_rate": (
425
+ self.stats["cache_hits"] / max(1, self.stats["total_requests"]) * 100
426
+ ),
427
+ "success_rate": (
428
+ self.stats["successful_requests"]
429
+ / max(1, self.stats["total_requests"])
430
+ * 100
431
+ ),
432
+ "avg_tokens_per_request": (
433
+ self.stats["total_tokens_processed"]
434
+ / max(1, self.stats["total_requests"])
435
+ ),
436
+ "cache_size": len(self.cache) if self.cache else 0,
437
+ "model": self.model,
438
+ "batch_size": self.batch_size,
439
+ }
440
+
441
+ def clear_cache(self):
442
+ """Clear the embedding cache."""
443
+ if self.cache:
444
+ self.cache.clear()
445
+ self.logger.info("Embedding cache cleared")
446
+
447
+ def warm_up_cache(self, sample_texts: List[str]):
448
+ """
449
+ 🔥 Warm up the cache with sample texts.
450
+
451
+ Args:
452
+ sample_texts: List of sample texts to pre-generate embeddings
453
+ """
454
+ if not sample_texts:
455
+ return
456
+
457
+ self.logger.info(f"🔥 Warming up cache with {len(sample_texts)} sample texts")
458
+
459
+ sample_items = [{"content": text, "metadata": {}} for text in sample_texts]
460
+ self.generate_embeddings(sample_items)
461
+
462
+ self.logger.info("Cache warm-up completed")
src/ingestion/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Ingestion module for processing documents and URLs.
3
+
4
+ This module contains components for processing various document formats
5
+ and extracting content from web URLs.
6
+ """
src/ingestion/document_processor.py ADDED
@@ -0,0 +1,668 @@
1
+ """
2
+ Document Processor Module
3
+
4
+ This module is responsible for processing various document formats including
5
+ PDF, DOCX, CSV, PPTX, and Excel files with complete functionality.
6
+
7
+ Technologies: PyMuPDF, python-docx, pandas, python-pptx, pdfplumber
8
+ """
9
+
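+ # Example usage (illustrative; the sample path is a placeholder):
+ #
+ #     processor = DocumentProcessor({"max_file_size_mb": 50})
+ #     result = processor.process_document("data/sample_documents/example.pdf")
+ #     print(result["document_type"], result["metadata"]["total_words"])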
10
+ import os
11
+ import time
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Dict, List, Any, Optional, Union
15
+ import logging
16
+
17
+ # Import document processing libraries
18
+ try:
19
+ import fitz # PyMuPDF
20
+ import docx
21
+ import pandas as pd
22
+ import pptx
23
+ import pdfplumber
24
+ from openpyxl import load_workbook
25
+ except ImportError as e:
26
+ logging.warning(f"Some document processing libraries are not installed: {e}")
27
+
28
+ from utils.error_handler import DocumentProcessingError, error_handler, ErrorType
29
+
30
+
31
+ class DocumentProcessor:
32
+ """
33
+ Processes various document formats and extracts text content with full functionality.
34
+
35
+ Supported formats:
36
+ - PDF (using PyMuPDF and pdfplumber)
37
+ - DOCX (using python-docx)
38
+ - CSV/Excel (using pandas)
39
+ - PPTX (using python-pptx)
40
+ """
41
+
42
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
43
+ """
44
+ Initialize the DocumentProcessor with configuration.
45
+
46
+ Args:
47
+ config: Configuration dictionary with processing parameters
48
+ """
49
+ self.config = config or {}
50
+ self.logger = logging.getLogger(__name__)
51
+
52
+ # Configuration settings
53
+ self.max_file_size_mb = self.config.get("max_file_size_mb", 50)
54
+ self.supported_formats = self.config.get(
55
+ "supported_formats",
56
+ [".pdf", ".docx", ".csv", ".xlsx", ".xls", ".pptx", ".txt", ".md"],
57
+ )
58
+
59
+ @error_handler(ErrorType.DOCUMENT_PROCESSING)
60
+ def process_document(self, file_path: str) -> Dict[str, Any]:
61
+ """
62
+ Process a document and extract its text content with metadata.
63
+
64
+ Args:
65
+ file_path: Path to the document file
66
+
67
+ Returns:
68
+ Dictionary containing extracted text and metadata
69
+ """
70
+ if not os.path.exists(file_path):
71
+ raise DocumentProcessingError(f"Document not found: {file_path}", file_path)
72
+
73
+ # Validate file size
74
+ file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
75
+ if file_size_mb > self.max_file_size_mb:
76
+ raise DocumentProcessingError(
77
+ f"File too large: {file_size_mb:.1f}MB (max: {self.max_file_size_mb}MB)",
78
+ file_path,
79
+ )
80
+
81
+ file_extension = os.path.splitext(file_path)[1].lower()
82
+
83
+ # Validate file format
84
+ if file_extension not in self.supported_formats:
85
+ raise DocumentProcessingError(
86
+ f"Unsupported file format: {file_extension}", file_path
87
+ )
88
+
89
+ self.logger.info(f"Processing document: {file_path} ({file_size_mb:.1f}MB)")
90
+
91
+ try:
92
+ if file_extension == ".pdf":
93
+ return self._process_pdf(file_path)
94
+ elif file_extension == ".docx":
95
+ return self._process_docx(file_path)
96
+ elif file_extension in [".csv", ".xlsx", ".xls"]:
97
+ return self._process_spreadsheet(file_path)
98
+ elif file_extension == ".pptx":
99
+ return self._process_pptx(file_path)
100
+ elif file_extension in [".txt", ".md"]:
101
+ return self._process_text_file(file_path)
102
+ except Exception as e:
103
+ raise DocumentProcessingError(
104
+ f"Error processing document: {str(e)}", file_path
105
+ )
106
+
107
+ def process_batch(self, file_paths: List[str]) -> List[Dict[str, Any]]:
108
+ """
109
+ Process multiple documents in batch.
110
+
111
+ Args:
112
+ file_paths: List of file paths to process
113
+
114
+ Returns:
115
+ List of processed document results
116
+ """
117
+ results = []
118
+ self.logger.info(f"Processing batch of {len(file_paths)} documents")
119
+
120
+ for i, file_path in enumerate(file_paths):
121
+ try:
122
+ result = self.process_document(file_path)
123
+ results.append(result)
124
+ self.logger.info(f"Processed {i+1}/{len(file_paths)}: {file_path}")
125
+ except Exception as e:
126
+ self.logger.error(f"❌ Failed to process {file_path}: {str(e)}")
127
+ # Continue with other files
128
+ continue
129
+
130
+ return results
131
+
132
+ def _extract_metadata(self, file_path: str) -> Dict[str, Any]:
133
+ """
134
+ Extract common metadata from file.
135
+
136
+ Args:
137
+ file_path: Path to the file
138
+
139
+ Returns:
140
+ Dictionary containing file metadata
141
+ """
142
+ file_stat = os.stat(file_path)
143
+ file_path_obj = Path(file_path)
144
+
145
+ return {
146
+ "filename": file_path_obj.name,
147
+ "file_extension": file_path_obj.suffix.lower(),
148
+ "file_size_bytes": file_stat.st_size,
149
+ "file_size_mb": round(file_stat.st_size / (1024 * 1024), 2),
150
+ "created_time": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
151
+ "modified_time": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
152
+ "processed_time": datetime.now().isoformat(),
153
+ }
154
+
155
+ def _process_pdf(self, file_path: str) -> Dict[str, Any]:
156
+ """
157
+ 📄 Extract text from a PDF document using PyMuPDF with fallback to pdfplumber.
158
+
159
+ Args:
160
+ file_path: Path to the PDF file
161
+
162
+ Returns:
163
+ Dictionary with extracted text and metadata
164
+ """
165
+ self.logger.info(f"Processing PDF: {file_path}")
166
+
167
+ text_content = []
168
+ metadata = self._extract_metadata(file_path)
169
+
170
+ try:
171
+ # Primary method: PyMuPDF (faster)
172
+ doc = fitz.open(file_path)
173
+ metadata.update(
174
+ {
175
+ "page_count": doc.page_count,
176
+ "title": doc.metadata.get("title", ""),
177
+ "author": doc.metadata.get("author", ""),
178
+ "subject": doc.metadata.get("subject", ""),
179
+ "creator": doc.metadata.get("creator", ""),
180
+ }
181
+ )
182
+
183
+ for page_num in range(doc.page_count):
184
+ page = doc[page_num]
185
+ text = page.get_text()
186
+ if text.strip(): # Only add non-empty pages
187
+ text_content.append({"page": page_num + 1, "content": text.strip()})
188
+
189
+ doc.close()
190
+
191
+ except Exception as e:
192
+ self.logger.warning(f"PyMuPDF failed, trying pdfplumber: {str(e)}")
193
+
194
+ # Fallback method: pdfplumber (more robust for complex PDFs)
+ # Discard any partial PyMuPDF output so pages are not duplicated
+ text_content = []
195
+ try:
196
+ with pdfplumber.open(file_path) as pdf:
197
+ metadata["page_count"] = len(pdf.pages)
198
+
199
+ for page_num, page in enumerate(pdf.pages):
200
+ text = page.extract_text()
201
+ if text and text.strip():
202
+ text_content.append(
203
+ {"page": page_num + 1, "content": text.strip()}
204
+ )
205
+
206
+ except Exception as fallback_error:
207
+ raise DocumentProcessingError(
208
+ f"Both PDF extraction methods failed: {str(fallback_error)}",
209
+ file_path,
210
+ )
211
+
212
+ # Final content processing
213
+ full_text = "\n\n".join([item["content"] for item in text_content])
214
+ metadata["total_characters"] = len(full_text)
215
+ metadata["total_words"] = len(full_text.split())
216
+
217
+ return {
218
+ "content": full_text,
219
+ "pages": text_content,
220
+ "metadata": metadata,
221
+ "source": file_path,
222
+ "document_type": "pdf",
223
+ }
224
+
225
+ def _process_docx(self, file_path: str) -> Dict[str, Any]:
226
+ """
227
+ Extract text from a DOCX document using python-docx.
228
+
229
+ Args:
230
+ file_path: Path to the DOCX file
231
+
232
+ Returns:
233
+ Dictionary with extracted text and metadata
234
+ """
235
+ self.logger.info(f"Processing DOCX: {file_path}")
236
+
237
+ try:
238
+ doc = docx.Document(file_path)
239
+ metadata = self._extract_metadata(file_path)
240
+
241
+ # Extract document properties
242
+ core_props = doc.core_properties
243
+ metadata.update(
244
+ {
245
+ "title": core_props.title or "",
246
+ "author": core_props.author or "",
247
+ "subject": core_props.subject or "",
248
+ "created": (
249
+ core_props.created.isoformat() if core_props.created else ""
250
+ ),
251
+ "modified": (
252
+ core_props.modified.isoformat() if core_props.modified else ""
253
+ ),
254
+ "paragraph_count": len(doc.paragraphs),
255
+ }
256
+ )
257
+
258
+ # Extract text content
259
+ paragraphs = []
260
+ full_text_parts = []
261
+
262
+ for i, paragraph in enumerate(doc.paragraphs):
263
+ text = paragraph.text.strip()
264
+ if text: # Only include non-empty paragraphs
265
+ paragraphs.append({"paragraph": i + 1, "content": text})
266
+ full_text_parts.append(text)
267
+
268
+ # Extract tables if present
269
+ tables_content = []
270
+ for table_idx, table in enumerate(doc.tables):
271
+ table_data = []
272
+ for row in table.rows:
273
+ row_data = [cell.text.strip() for cell in row.cells]
274
+ if any(row_data): # Only include non-empty rows
275
+ table_data.append(row_data)
276
+
277
+ if table_data:
278
+ tables_content.append({"table": table_idx + 1, "data": table_data})
279
+ # Add table content to full text
280
+ table_text = "\n".join([" | ".join(row) for row in table_data])
281
+ full_text_parts.append(f"\n[Table {table_idx + 1}]\n{table_text}")
282
+
283
+ full_text = "\n\n".join(full_text_parts)
284
+ metadata.update(
285
+ {
286
+ "total_characters": len(full_text),
287
+ "total_words": len(full_text.split()),
288
+ "table_count": len(tables_content),
289
+ }
290
+ )
291
+
292
+ return {
293
+ "content": full_text,
294
+ "paragraphs": paragraphs,
295
+ "tables": tables_content,
296
+ "metadata": metadata,
297
+ "source": file_path,
298
+ "document_type": "docx",
299
+ }
300
+
301
+ except Exception as e:
302
+ raise DocumentProcessingError(f"Error processing DOCX: {str(e)}", file_path)
303
+
304
+ def _process_spreadsheet(self, file_path: str) -> Dict[str, Any]:
305
+ """
306
+ Extract text from a CSV or Excel file using pandas.
307
+
308
+ Args:
309
+ file_path: Path to the spreadsheet file
310
+
311
+ Returns:
312
+ Dictionary with extracted text and metadata
313
+ """
314
+ file_extension = os.path.splitext(file_path)[1].lower()
315
+ self.logger.info(f"Processing spreadsheet: {file_path}")
316
+
317
+ try:
318
+ metadata = self._extract_metadata(file_path)
319
+ sheets_data = []
320
+
321
+ if file_extension == ".csv":
322
+ # 📄 Process CSV file
323
+ df = pd.read_csv(file_path, encoding="utf-8")
324
+ sheet_content = self._process_dataframe(df, "Sheet1")
325
+ sheets_data.append(sheet_content)
326
+ metadata["sheet_count"] = 1
327
+
328
+ else:
329
+ # Process Excel file
330
+ excel_file = pd.ExcelFile(file_path)
331
+ metadata["sheet_count"] = len(excel_file.sheet_names)
332
+
333
+ for sheet_name in excel_file.sheet_names:
334
+ df = pd.read_excel(file_path, sheet_name=sheet_name)
335
+ sheet_content = self._process_dataframe(df, sheet_name)
336
+ sheets_data.append(sheet_content)
337
+
338
+ # 🔗 Combine all sheets content
339
+ full_text_parts = []
340
+ for sheet in sheets_data:
341
+ full_text_parts.append(f"[{sheet['sheet_name']}]\n{sheet['content']}")
342
+
343
+ full_text = "\n\n".join(full_text_parts)
344
+ metadata.update(
345
+ {
346
+ "total_characters": len(full_text),
347
+ "total_words": len(full_text.split()),
348
+ "total_rows": sum(sheet["row_count"] for sheet in sheets_data),
349
+ "total_columns": (
350
+ max(sheet["column_count"] for sheet in sheets_data)
351
+ if sheets_data
352
+ else 0
353
+ ),
354
+ }
355
+ )
356
+
357
+ return {
358
+ "content": full_text,
359
+ "sheets": sheets_data,
360
+ "metadata": metadata,
361
+ "source": file_path,
362
+ "document_type": "spreadsheet",
363
+ }
364
+
365
+ except Exception as e:
366
+ raise DocumentProcessingError(
367
+ f"Error processing spreadsheet: {str(e)}", file_path
368
+ )
369
+
370
+ def _process_dataframe(self, df: pd.DataFrame, sheet_name: str) -> Dict[str, Any]:
371
+ """
372
+ Process a pandas DataFrame into text content.
373
+
374
+ Args:
375
+ df: Pandas DataFrame
376
+ sheet_name: Name of the sheet
377
+
378
+ Returns:
379
+ Dictionary with processed sheet data
380
+ """
381
+ # Clean the dataframe
382
+ df = df.dropna(how="all") # Remove completely empty rows
383
+ df = df.fillna("") # Fill NaN with empty strings
384
+
385
+ # Create text representation
386
+ content_parts = []
387
+
388
+ # Add headers
389
+ headers = df.columns.tolist()
390
+ content_parts.append(" | ".join(str(h) for h in headers))
391
+ content_parts.append("-" * 50) # Separator
392
+
393
+ # Add data rows
394
+ for _, row in df.iterrows():
395
+ row_text = " | ".join(str(cell) for cell in row.values)
396
+ content_parts.append(row_text)
397
+
398
+ content = "\n".join(content_parts)
399
+
400
+ return {
401
+ "sheet_name": sheet_name,
402
+ "content": content,
403
+ "headers": headers,
404
+ "row_count": len(df),
405
+ "column_count": len(df.columns),
406
+ "data": df.to_dict("records"), # For structured access
407
+ }
408
+
409
+ def _process_pptx(self, file_path: str) -> Dict[str, Any]:
410
+ """
411
+ 🎯 Extract text from a PowerPoint presentation using python-pptx.
412
+
413
+ Args:
414
+ file_path: Path to the PPTX file
415
+
416
+ Returns:
417
+ Dictionary with extracted text and metadata
418
+ """
419
+ self.logger.info(f" Processing PPTX: {file_path}")
420
+
421
+ try:
422
+ presentation = pptx.Presentation(file_path)
423
+ metadata = self._extract_metadata(file_path)
424
+
425
+ # Extract presentation metadata
426
+ core_props = presentation.core_properties
427
+ metadata.update(
428
+ {
429
+ "title": core_props.title or "",
430
+ "author": core_props.author or "",
431
+ "subject": core_props.subject or "",
432
+ "created": (
433
+ core_props.created.isoformat() if core_props.created else ""
434
+ ),
435
+ "modified": (
436
+ core_props.modified.isoformat() if core_props.modified else ""
437
+ ),
438
+ "slide_count": len(presentation.slides),
439
+ }
440
+ )
441
+
442
+ # 🎯 Extract content from slides
443
+ slides_content = []
444
+ full_text_parts = []
445
+
446
+ for slide_idx, slide in enumerate(presentation.slides):
447
+ slide_text_parts = []
448
+
449
+ # Extract text from all shapes in the slide
450
+ for shape in slide.shapes:
451
+ if hasattr(shape, "text") and shape.text.strip():
452
+ slide_text_parts.append(shape.text.strip())
453
+
454
+ if slide_text_parts:
455
+ slide_content = "\n".join(slide_text_parts)
456
+ slides_content.append(
457
+ {"slide": slide_idx + 1, "content": slide_content}
458
+ )
459
+ full_text_parts.append(f"[Slide {slide_idx + 1}]\n{slide_content}")
460
+
461
+ full_text = "\n\n".join(full_text_parts)
462
+ metadata.update(
463
+ {
464
+ "total_characters": len(full_text),
465
+ "total_words": len(full_text.split()),
466
+ "slides_with_content": len(slides_content),
467
+ }
468
+ )
469
+
470
+ return {
471
+ "content": full_text,
472
+ "slides": slides_content,
473
+ "metadata": metadata,
474
+ "source": file_path,
475
+ "document_type": "pptx",
476
+ }
477
+
478
+ except Exception as e:
479
+ raise DocumentProcessingError(f"Error processing PPTX: {str(e)}", file_path)
480
+
481
+ def _process_text_file(self, file_path: str) -> Dict[str, Any]:
482
+ """
483
+ 📝 Extract text from plain text files (.txt, .md).
484
+
485
+ Args:
486
+ file_path: Path to the text file
487
+
488
+ Returns:
489
+ Dictionary with extracted text and metadata
490
+ """
491
+ file_extension = os.path.splitext(file_path)[1].lower()
492
+ self.logger.info(f" Processing text file: {file_path}")
493
+
494
+ try:
495
+ metadata = self._extract_metadata(file_path)
496
+
497
+ # Try different encodings for robust text reading
498
+ encodings = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
499
+ content = None
500
+
501
+ for encoding in encodings:
502
+ try:
503
+ with open(file_path, "r", encoding=encoding) as file:
504
+ content = file.read()
505
+ self.logger.info(
506
+ f" Successfully read file with {encoding} encoding"
507
+ )
508
+ break
509
+ except UnicodeDecodeError:
510
+ continue
511
+ except Exception as e:
512
+ self.logger.warning(f"Failed to read with {encoding}: {str(e)}")
513
+ continue
514
+
515
+ if content is None:
516
+ raise DocumentProcessingError(
517
+ f"Could not read file with any supported encoding", file_path
518
+ )
519
+
520
+ # Clean and process content
521
+ content = content.strip()
522
+ if not content:
523
+ raise DocumentProcessingError(
524
+ f"File is empty or contains no readable text", file_path
525
+ )
526
+
527
+ # Split content into logical sections for better processing
528
+ sections = []
529
+ if file_extension == ".md":
530
+ # 📋 For Markdown files, split by headers
531
+ sections = self._split_markdown_content(content)
532
+ else:
533
+ # 📄 For plain text, split by paragraphs
534
+ sections = self._split_text_content(content)
535
+
536
+ # Update metadata with text-specific information
537
+ lines = content.split("\n")
538
+ metadata.update(
539
+ {
540
+ "file_type": (
541
+ "markdown" if file_extension == ".md" else "plain_text"
542
+ ),
543
+ "line_count": len(lines),
544
+ "paragraph_count": len(
545
+ [p for p in content.split("\n\n") if p.strip()]
546
+ ),
547
+ "total_characters": len(content),
548
+ "total_words": len(content.split()),
549
+ "encoding_used": encoding if "encoding" in locals() else "utf-8",
550
+ "sections_count": len(sections),
551
+ }
552
+ )
553
+
554
+ return {
555
+ "content": content,
556
+ "sections": sections,
557
+ "metadata": metadata,
558
+ "source": file_path,
559
+ "document_type": "markdown" if file_extension == ".md" else "text",
560
+ }
561
+
562
+ except Exception as e:
563
+ raise DocumentProcessingError(
564
+ f"Error processing text file: {str(e)}", file_path
565
+ )
566
+
567
+ def _split_markdown_content(self, content: str) -> List[Dict[str, Any]]:
568
+ """
569
+ Split Markdown content by headers for better organization.
570
+
571
+ Args:
572
+ content: Markdown content
573
+
574
+ Returns:
575
+ List of sections with headers and content
576
+ """
577
+ sections = []
578
+ lines = content.split("\n")
579
+ current_section = {"header": "", "content": [], "level": 0}
580
+
581
+ for line in lines:
582
+ # Check for markdown headers
583
+ if line.strip().startswith("#"):
584
+ # Save previous section if it has content
585
+ if current_section["content"] or current_section["header"]:
586
+ section_content = "\n".join(current_section["content"]).strip()
587
+ if section_content or current_section["header"]:
588
+ sections.append(
589
+ {
590
+ "header": current_section["header"],
591
+ "content": section_content,
592
+ "level": current_section["level"],
593
+ "section_index": len(sections),
594
+ }
595
+ )
596
+
597
+ # Start new section
598
+ header_level = len(line) - len(line.lstrip("#"))
599
+ header_text = line.lstrip("#").strip()
600
+ current_section = {
601
+ "header": header_text,
602
+ "content": [],
603
+ "level": header_level,
604
+ }
605
+ else:
606
+ current_section["content"].append(line)
607
+
608
+ # Add the last section
609
+ if current_section["content"] or current_section["header"]:
610
+ section_content = "\n".join(current_section["content"]).strip()
611
+ if section_content or current_section["header"]:
612
+ sections.append(
613
+ {
614
+ "header": current_section["header"],
615
+ "content": section_content,
616
+ "level": current_section["level"],
617
+ "section_index": len(sections),
618
+ }
619
+ )
620
+
621
+ # If no headers found, treat entire content as one section
622
+ if not sections:
623
+ sections.append(
624
+ {
625
+ "header": "Document Content",
626
+ "content": content.strip(),
627
+ "level": 1,
628
+ "section_index": 0,
629
+ }
630
+ )
631
+
632
+ return sections
633
+
634
+ def _split_text_content(self, content: str) -> List[Dict[str, Any]]:
635
+ """
636
+ Split plain text content by paragraphs.
637
+
638
+ Args:
639
+ content: Plain text content
640
+
641
+ Returns:
642
+ List of paragraph sections
643
+ """
644
+ sections = []
645
+ paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
646
+
647
+ for i, paragraph in enumerate(paragraphs):
648
+ sections.append(
649
+ {
650
+ "header": f"Paragraph {i + 1}",
651
+ "content": paragraph,
652
+ "level": 1,
653
+ "section_index": i,
654
+ }
655
+ )
656
+
657
+ # If no clear paragraphs, treat as single section
658
+ if not sections:
659
+ sections.append(
660
+ {
661
+ "header": "Document Content",
662
+ "content": content.strip(),
663
+ "level": 1,
664
+ "section_index": 0,
665
+ }
666
+ )
667
+
668
+ return sections
src/ingestion/pipeline.py ADDED
@@ -0,0 +1,495 @@
1
+ """
2
+ Ingestion Pipeline Module
3
+
4
+ This module orchestrates the complete document ingestion process,
5
+ integrating all components for a seamless workflow.
6
+
7
+ Components: DocumentProcessor, URLProcessor, TextExtractor, EmbeddingGenerator, VectorDB
8
+ """
9
+
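+ # Example usage (illustrative; see create_pipeline() at the end of this module):
+ #
+ #     pipeline = IngestionPipeline("config/config.yaml")
+ #     report = pipeline.process_documents(["data/sample_documents/example.pdf"])
+ #     print(report["success_rate"], report["statistics"]["chunks_created"])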
10
+ import logging
11
+ from typing import Dict, List, Any, Optional, Union
12
+ from pathlib import Path
13
+ import asyncio
14
+ from datetime import datetime
15
+
16
+ from .document_processor import DocumentProcessor
17
+ from .url_processor import URLProcessor
18
+ from .text_extractor import TextExtractor
19
+ from embedding.embedding_generator import EmbeddingGenerator
20
+ from storage.vector_db import VectorDB
21
+ from utils.config_manager import ConfigManager
22
+ from utils.error_handler import error_handler, ErrorType, RAGError
23
+
24
+
25
+ class IngestionPipeline:
26
+ """
27
+ Complete ingestion pipeline that orchestrates document processing, text extraction,
28
+ embedding generation, and vector storage.
29
+
30
+ Features:
31
+ - End-to-end document ingestion
32
+ - URL content processing
33
+ - Batch processing capabilities
34
+ - Progress tracking and statistics
35
+ - Error handling and recovery
36
+ """
37
+
38
+ def __init__(self, config_path: Optional[str] = None):
39
+ """
40
+ Initialize the ingestion pipeline.
41
+
42
+ Args:
43
+ config_path: Path to configuration file
44
+ """
45
+ self.logger = logging.getLogger(__name__)
46
+
47
+ # Load configuration
48
+ self.config_manager = ConfigManager(config_path)
49
+ self.config = self.config_manager.config
50
+
51
+ # Initialize statistics
52
+ self.stats = {
53
+ "documents_processed": 0,
54
+ "urls_processed": 0,
55
+ "chunks_created": 0,
56
+ "embeddings_generated": 0,
57
+ "vectors_stored": 0,
58
+ "errors_encountered": 0,
59
+ "start_time": None,
60
+ "end_time": None,
61
+ }
62
+
63
+ # Initialize components
64
+ self._initialize_components()
65
+
66
+ def _initialize_components(self):
67
+ """Initialize all pipeline components."""
68
+ try:
69
+ # 📄 Document processor
70
+ doc_config = self.config.get("document_processing", {})
71
+ self.document_processor = DocumentProcessor(doc_config)
72
+
73
+ # URL processor
74
+ url_config = self.config.get("url_processing", {})
75
+ self.url_processor = URLProcessor(url_config)
76
+
77
+ # Text extractor
78
+ text_config = self.config.get("document_processing", {})
79
+ self.text_extractor = TextExtractor(text_config)
80
+
81
+ # 🔮 Embedding generator
82
+ embedding_config = self.config.get("embedding", {})
83
+ embedding_config["api_key"] = self.config.get("api_keys", {}).get(
84
+ "gemini_api_key"
85
+ )
86
+ self.embedding_generator = EmbeddingGenerator(embedding_config)
87
+
88
+ # Vector database
89
+ vector_config = self.config.get("vector_db", {})
90
+ vector_config["api_key"] = self.config.get("api_keys", {}).get(
91
+ "pinecone_api_key"
92
+ )
93
+ self.vector_db = VectorDB(vector_config)
94
+
95
+ self.logger.info("All pipeline components initialized successfully")
96
+
97
+ except Exception as e:
98
+ self.logger.error(f"❌ Failed to initialize pipeline components: {str(e)}")
99
+ raise RAGError(f"Pipeline initialization failed: {str(e)}")
100
+
101
+ @error_handler(ErrorType.DOCUMENT_PROCESSING)
102
+ def process_documents(self, file_paths: List[str]) -> Dict[str, Any]:
103
+ """
104
+ Process multiple documents through the complete pipeline.
105
+
106
+ Args:
107
+ file_paths: List of document file paths
108
+
109
+ Returns:
110
+ Processing results and statistics
111
+ """
112
+ self.logger.info(
113
+ f"Starting document processing pipeline for {len(file_paths)} files"
114
+ )
115
+ self.stats["start_time"] = datetime.now()
116
+
117
+ all_results = []
118
+
119
+ for i, file_path in enumerate(file_paths):
120
+ try:
121
+ self.logger.info(
122
+ f"📄 Processing document {i+1}/{len(file_paths)}: {file_path}"
123
+ )
124
+
125
+ # 📄 Step 1: Process document
126
+ doc_result = self.document_processor.process_document(file_path)
127
+ self.stats["documents_processed"] += 1
128
+
129
+ # Step 2: Extract and chunk text
130
+ text_chunks = self.text_extractor.process_text(
131
+ doc_result["content"], doc_result["metadata"]
132
+ )
133
+ self.stats["chunks_created"] += len(text_chunks)
134
+
135
+ # 🔮 Step 3: Generate embeddings
136
+ embedded_chunks = self.embedding_generator.generate_embeddings(
137
+ text_chunks
138
+ )
139
+ valid_embeddings = [
140
+ chunk for chunk in embedded_chunks if chunk.get("embedding")
141
+ ]
142
+ self.stats["embeddings_generated"] += len(valid_embeddings)
143
+
144
+ # Step 4: Store in vector database
145
+ if valid_embeddings:
146
+ storage_success = self.vector_db.store_embeddings(valid_embeddings)
147
+ if storage_success:
148
+ self.stats["vectors_stored"] += len(valid_embeddings)
149
+
150
+ # Compile results
151
+ result = {
152
+ "file_path": file_path,
153
+ "document_type": doc_result.get("document_type"),
154
+ "chunks_created": len(text_chunks),
155
+ "embeddings_generated": len(valid_embeddings),
156
+ "storage_success": storage_success if valid_embeddings else False,
157
+ "metadata": doc_result["metadata"],
158
+ }
159
+
160
+ all_results.append(result)
161
+ self.logger.info(
162
+ f"Document processed: {len(text_chunks)} chunks, {len(valid_embeddings)} embeddings"
163
+ )
164
+
165
+ except Exception as e:
166
+ self.stats["errors_encountered"] += 1
167
+ self.logger.error(f"❌ Error processing {file_path}: {str(e)}")
168
+
169
+ all_results.append(
170
+ {
171
+ "file_path": file_path,
172
+ "error": str(e),
173
+ "chunks_created": 0,
174
+ "embeddings_generated": 0,
175
+ "storage_success": False,
176
+ }
177
+ )
178
+
179
+ self.stats["end_time"] = datetime.now()
180
+
181
+ return {
182
+ "results": all_results,
183
+ "statistics": self.get_statistics(),
184
+ "success_rate": self._calculate_success_rate(all_results),
185
+ }
186
+
187
+ @error_handler(ErrorType.URL_PROCESSING)
188
+ def process_urls(self, urls: List[str]) -> Dict[str, Any]:
189
+ """
190
+ Process multiple URLs through the complete pipeline.
191
+
192
+ Args:
193
+ urls: List of URLs to process
194
+
195
+ Returns:
196
+ Processing results and statistics
197
+ """
198
+ self.logger.info(f"Starting URL processing pipeline for {len(urls)} URLs")
199
+ self.stats["start_time"] = datetime.now()
200
+
201
+ all_results = []
202
+
203
+ for i, url in enumerate(urls):
204
+ try:
205
+ self.logger.info(f"Processing URL {i+1}/{len(urls)}: {url}")
206
+
207
+ # Step 1: Process URL
208
+ url_result = self.url_processor.process_url(url)
209
+ if not url_result:
210
+ self.logger.warning(f"No content extracted from URL: {url}")
211
+ continue
212
+
213
+ self.stats["urls_processed"] += 1
214
+
215
+ # Step 2: Extract and chunk text
216
+ text_chunks = self.text_extractor.process_text(
217
+ url_result["content"], url_result["metadata"]
218
+ )
219
+ self.stats["chunks_created"] += len(text_chunks)
220
+
221
+ # 🔮 Step 3: Generate embeddings
222
+ embedded_chunks = self.embedding_generator.generate_embeddings(
223
+ text_chunks
224
+ )
225
+ valid_embeddings = [
226
+ chunk for chunk in embedded_chunks if chunk.get("embedding")
227
+ ]
228
+ self.stats["embeddings_generated"] += len(valid_embeddings)
229
+
230
+ # Step 4: Store in vector database
231
+ storage_success = False
232
+ if valid_embeddings:
233
+ storage_success = self.vector_db.store_embeddings(valid_embeddings)
234
+ if storage_success:
235
+ self.stats["vectors_stored"] += len(valid_embeddings)
236
+
237
+ # Process linked documents if any
238
+ linked_results = []
239
+ for linked_doc in url_result.get("linked_documents", []):
240
+ if linked_doc.get("content"):
241
+ linked_chunks = self.text_extractor.process_text(
242
+ linked_doc["content"], linked_doc["metadata"]
243
+ )
244
+ linked_embedded = self.embedding_generator.generate_embeddings(
245
+ linked_chunks
246
+ )
247
+ linked_valid = [
248
+ chunk for chunk in linked_embedded if chunk.get("embedding")
249
+ ]
250
+
251
+ if linked_valid:
252
+ self.vector_db.store_embeddings(linked_valid)
253
+ linked_results.append(
254
+ {
255
+ "url": linked_doc["source"],
256
+ "chunks": len(linked_chunks),
257
+ "embeddings": len(linked_valid),
258
+ }
259
+ )
260
+
261
+ # Compile results
262
+ result = {
263
+ "url": url,
264
+ "chunks_created": len(text_chunks),
265
+ "embeddings_generated": len(valid_embeddings),
266
+ "storage_success": storage_success,
267
+ "linked_documents": linked_results,
268
+ "metadata": url_result["metadata"],
269
+ }
270
+
271
+ all_results.append(result)
272
+ self.logger.info(
273
+ f"URL processed: {len(text_chunks)} chunks, {len(valid_embeddings)} embeddings"
274
+ )
275
+
276
+ except Exception as e:
277
+ self.stats["errors_encountered"] += 1
278
+ self.logger.error(f"❌ Error processing {url}: {str(e)}")
279
+
280
+ all_results.append(
281
+ {
282
+ "url": url,
283
+ "error": str(e),
284
+ "chunks_created": 0,
285
+ "embeddings_generated": 0,
286
+ "storage_success": False,
287
+ }
288
+ )
289
+
290
+ self.stats["end_time"] = datetime.now()
291
+
292
+ return {
293
+ "results": all_results,
294
+ "statistics": self.get_statistics(),
295
+ "success_rate": self._calculate_success_rate(all_results),
296
+ }
297
+
298
+ def process_mixed_sources(
299
+ self, file_paths: Optional[List[str]] = None, urls: Optional[List[str]] = None
300
+ ) -> Dict[str, Any]:
301
+ """
302
+ Process both documents and URLs in a single pipeline run.
303
+
304
+ Args:
305
+ file_paths: Optional list of document file paths
306
+ urls: Optional list of URLs
307
+
308
+ Returns:
309
+ Combined processing results
310
+ """
311
+ self.logger.info("Starting mixed source processing pipeline")
312
+
313
+ results = {
314
+ "document_results": [],
315
+ "url_results": [],
316
+ "combined_statistics": {},
317
+ "overall_success_rate": 0.0,
318
+ }
319
+
320
+ # 📄 Process documents
321
+ if file_paths:
322
+ doc_results = self.process_documents(file_paths)
323
+ results["document_results"] = doc_results["results"]
324
+
325
+ # Process URLs
326
+ if urls:
327
+ url_results = self.process_urls(urls)
328
+ results["url_results"] = url_results["results"]
329
+
330
+ # Combine statistics
331
+ results["combined_statistics"] = self.get_statistics()
332
+
333
+ # 🎯 Calculate overall success rate
334
+ all_items = results["document_results"] + results["url_results"]
335
+ results["overall_success_rate"] = self._calculate_success_rate(all_items)
336
+
337
+ return results
338
+
339
+ def _calculate_success_rate(self, results: List[Dict[str, Any]]) -> float:
340
+ """
341
+ Calculate success rate from results.
342
+
343
+ Args:
344
+ results: List of processing results
345
+
346
+ Returns:
347
+ Success rate as percentage
348
+ """
349
+ if not results:
350
+ return 0.0
351
+
352
+ successful = sum(
353
+ 1 for result in results if result.get("storage_success", False)
354
+ )
355
+ return (successful / len(results)) * 100
356
+
357
+ def get_statistics(self) -> Dict[str, Any]:
358
+ """
359
+ Get comprehensive pipeline statistics.
360
+
361
+ Returns:
362
+ Statistics dictionary
363
+ """
364
+ stats = self.stats.copy()
365
+
366
+ if stats["start_time"] and stats["end_time"]:
367
+ runtime = stats["end_time"] - stats["start_time"]
368
+ stats["runtime_seconds"] = runtime.total_seconds()
369
+ stats["processing_rate"] = (
370
+ stats["documents_processed"] + stats["urls_processed"]
371
+ ) / max(1, runtime.total_seconds())
372
+
373
+ # 🔮 Add component statistics
374
+ stats["embedding_stats"] = self.embedding_generator.get_statistics()
375
+ stats["vector_db_stats"] = self.vector_db.get_index_stats()
376
+ stats["url_processor_stats"] = self.url_processor.get_statistics()
377
+
378
+ return stats
379
+
380
+ def health_check(self) -> Dict[str, Any]:
381
+ """
382
+ Perform comprehensive health check on all components.
383
+
384
+ Returns:
385
+ Health check results
386
+ """
387
+ health = {
388
+ "overall_status": "healthy",
389
+ "timestamp": datetime.now().isoformat(),
390
+ "components": {},
391
+ }
392
+
393
+ try:
394
+ # 🔮 Check embedding generator
395
+ if self.embedding_generator.client:
396
+ health["components"]["embedding_generator"] = "Ready"
397
+ else:
398
+ health["components"]["embedding_generator"] = "❌ Not configured"
399
+ health["overall_status"] = "degraded"
400
+
401
+ # Check vector database
402
+ vector_health = self.vector_db.health_check()
403
+ health["components"]["vector_database"] = vector_health["status"]
404
+ if vector_health["status"] != "healthy":
405
+ health["overall_status"] = "degraded"
406
+
407
+ # Add component details
408
+ health["details"] = {
409
+ "vector_db_health": vector_health,
410
+ "embedding_stats": self.embedding_generator.get_statistics(),
411
+ "pipeline_stats": self.get_statistics(),
412
+ }
413
+
414
+ except Exception as e:
415
+ health["overall_status"] = "unhealthy"
416
+ health["error"] = str(e)
417
+
418
+ return health
419
+
420
+ def reset_statistics(self):
421
+ """Reset pipeline statistics."""
422
+ self.stats = {
423
+ "documents_processed": 0,
424
+ "urls_processed": 0,
425
+ "chunks_created": 0,
426
+ "embeddings_generated": 0,
427
+ "vectors_stored": 0,
428
+ "errors_encountered": 0,
429
+ "start_time": None,
430
+ "end_time": None,
431
+ }
432
+
433
+ # Reset component statistics
434
+ self.embedding_generator.stats = {
435
+ "total_requests": 0,
436
+ "successful_requests": 0,
437
+ "failed_requests": 0,
438
+ "cache_hits": 0,
439
+ "total_tokens_processed": 0,
440
+ "start_time": datetime.now(),
441
+ }
442
+
443
+ self.vector_db.reset_stats()
444
+ self.url_processor.reset()
445
+
446
+ self.logger.info("All pipeline statistics reset")
447
+
448
+
449
+ # Convenience function for quick pipeline usage
450
+ def create_pipeline(config_path: Optional[str] = None) -> IngestionPipeline:
451
+ """
452
+ Create and return a configured ingestion pipeline.
453
+
454
+ Args:
455
+ config_path: Optional path to configuration file
456
+
457
+ Returns:
458
+ Configured IngestionPipeline instance
459
+ """
460
+ return IngestionPipeline(config_path)
461
+
462
+
463
+ # 📄 Example usage functions
464
+ def process_documents_simple(
465
+ file_paths: List[str], config_path: Optional[str] = None
466
+ ) -> Dict[str, Any]:
467
+ """
468
+ 📄 Simple function to process documents with default configuration.
469
+
470
+ Args:
471
+ file_paths: List of document file paths
472
+ config_path: Optional configuration file path
473
+
474
+ Returns:
475
+ Processing results
476
+ """
477
+ pipeline = create_pipeline(config_path)
478
+ return pipeline.process_documents(file_paths)
479
+
480
+
481
+ def process_urls_simple(
482
+ urls: List[str], config_path: Optional[str] = None
483
+ ) -> Dict[str, Any]:
484
+ """
485
+ Simple function to process URLs with default configuration.
486
+
487
+ Args:
488
+ urls: List of URLs to process
489
+ config_path: Optional configuration file path
490
+
491
+ Returns:
492
+ Processing results
493
+ """
494
+ pipeline = create_pipeline(config_path)
495
+ return pipeline.process_urls(urls)
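A minimal usage sketch for the two convenience functions above. The file paths and URL are placeholders, a working default configuration (embedding client plus vector store) is assumed, and only the "results" key is relied on because the pipeline code above populates it:

    # Sketch only: paths and URL are placeholders; default config assumed.
    doc_results = process_documents_simple(["docs/report.pdf", "docs/notes.md"])
    print(len(doc_results.get("results", [])), "document results")

    url_results = process_urls_simple(["https://example.com/article"])
    print(len(url_results.get("results", [])), "URL results")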
src/ingestion/text_extractor.py ADDED
@@ -0,0 +1,526 @@
1
+ """
2
+ Text Extractor Module
3
+
4
+ This module is responsible for cleaning, normalizing, and chunking text
5
+ from various sources with complete NLP functionality.
6
+
7
+ Technologies: NLTK, spaCy, regex, langdetect
8
+ """
9
+
10
+ import re
11
+ import logging
12
+ from datetime import datetime
13
+ from typing import Dict, List, Any, Optional, Union
14
+ import unicodedata
15
+
16
+ # Import NLP libraries
17
+ try:
18
+ import nltk
19
+ from nltk.tokenize import sent_tokenize, word_tokenize
20
+ from nltk.corpus import stopwords
21
+ from nltk.stem import PorterStemmer
22
+ import spacy
23
+ from langdetect import detect
24
+ from langdetect.lang_detect_exception import LangDetectException as LangDetectError
25
+
26
+ # Download required NLTK data
27
+ try:
28
+ nltk.data.find("tokenizers/punkt")
29
+ except LookupError:
30
+ nltk.download("punkt", quiet=True)
31
+
32
+ try:
33
+ nltk.data.find("corpora/stopwords")
34
+ except LookupError:
35
+ nltk.download("stopwords", quiet=True)
36
+
37
+ except ImportError as e:
38
+ logging.warning(f"Some NLP libraries are not installed: {e}")
39
+
40
+ from utils.error_handler import error_handler, ErrorType
41
+
42
+
43
+ class TextExtractor:
44
+ """
45
+ Cleans, normalizes, and chunks text from various sources with intelligent processing.
46
+
47
+ Features:
48
+ - Advanced text cleaning and normalization
49
+ - Language detection
50
+ - Intelligent sentence segmentation
51
+ - Smart text chunking with overlap
52
+ - Metadata preservation
53
+ """
54
+
55
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
56
+ """
57
+ Initialize the TextExtractor with configuration.
58
+
59
+ Args:
60
+ config: Configuration dictionary with processing parameters
61
+ """
62
+ self.config = config or {}
63
+ self.logger = logging.getLogger(__name__)
64
+
65
+ # Configuration settings
66
+ self.chunk_size = self.config.get("chunk_size", 1000)
67
+ self.chunk_overlap = self.config.get("chunk_overlap", 200)
68
+ self.min_chunk_size = self.config.get("min_chunk_size", 100)
69
+ self.max_chunk_size = self.config.get("max_chunk_size", 2000)
70
+
71
+ # NLP settings
72
+ self.enable_language_detection = self.config.get(
73
+ "enable_language_detection", True
74
+ )
75
+ self.preserve_formatting = self.config.get("preserve_formatting", False)
76
+ self.remove_stopwords = self.config.get("remove_stopwords", False)
77
+
78
+ # Initialize NLP components
79
+ self.nlp = None
80
+ self.stemmer = None
81
+ self.stop_words = set()
82
+
83
+ self._initialize_nlp_components()
84
+
85
+ def _initialize_nlp_components(self):
86
+ """Initialize NLP components with error handling."""
87
+ try:
88
+ # Load spaCy model for advanced processing
89
+ self.nlp = spacy.load("en_core_web_sm")
90
+ self.logger.info("spaCy model loaded successfully")
91
+ except Exception as e:
92
+ self.logger.warning(f"Could not load spaCy model: {str(e)}")
93
+
94
+ try:
95
+ # Initialize NLTK components
96
+ self.stemmer = PorterStemmer()
97
+ self.stop_words = set(stopwords.words("english"))
98
+ self.logger.info("NLTK components initialized")
99
+ except Exception as e:
100
+ self.logger.warning(f"Could not initialize NLTK components: {str(e)}")
101
+
102
+ @error_handler(ErrorType.DOCUMENT_PROCESSING)
103
+ def process_text(
104
+ self,
105
+ text: Union[str, List[str]],
106
+ metadata: Optional[Dict[str, Any]] = None,
107
+ preserve_structure: bool = False,
108
+ ) -> List[Dict[str, Any]]:
109
+ """
110
+ Process text by cleaning, normalizing, and chunking with intelligence.
111
+
112
+ Args:
113
+ text: Raw text content (string or list of strings)
114
+ metadata: Optional metadata to include with each chunk
115
+ preserve_structure: Whether to preserve original text structure
116
+
117
+ Returns:
118
+ List of dictionaries containing processed text chunks and metadata
119
+ """
120
+ if not text:
121
+ return []
122
+
123
+ # Convert list to string if needed
124
+ if isinstance(text, list):
125
+ text = "\n".join(str(item) for item in text if item)
126
+
127
+ if not text.strip():
128
+ return []
129
+
130
+ self.logger.info(f"Processing text: {len(text)} characters")
131
+
132
+ # Detect language
133
+ language = self._detect_language(text)
134
+
135
+ # Clean and normalize the text
136
+ cleaned_text = self._clean_text(text, preserve_structure)
137
+
138
+ if len(cleaned_text.strip()) < self.min_chunk_size:
139
+ self.logger.warning(
140
+ f"Text too short after cleaning: {len(cleaned_text)} chars"
141
+ )
142
+ return []
143
+
144
+ # Split text into chunks
145
+ chunks = self._chunk_text(cleaned_text)
146
+
147
+ # Prepare result with enhanced metadata
148
+ result = []
149
+ base_metadata = metadata.copy() if metadata else {}
150
+ base_metadata.update(
151
+ {
152
+ "language": language,
153
+ "original_length": len(text),
154
+ "cleaned_length": len(cleaned_text),
155
+ "chunk_count": len(chunks),
156
+ "processing_time": datetime.now().isoformat(),
157
+ "chunk_size_config": self.chunk_size,
158
+ "chunk_overlap_config": self.chunk_overlap,
159
+ }
160
+ )
161
+
162
+ for i, chunk in enumerate(chunks):
163
+ chunk_metadata = base_metadata.copy()
164
+ chunk_stats = self._analyze_chunk(chunk)
165
+
166
+ chunk_metadata.update(
167
+ {
168
+ "chunk_index": i,
169
+ "chunk_id": f"chunk_{i}_{hash(chunk) % 10000}",
170
+ **chunk_stats,
171
+ }
172
+ )
173
+
174
+ result.append({"content": chunk, "metadata": chunk_metadata})
175
+
176
+ self.logger.info(f"Processed text into {len(chunks)} chunks")
177
+ return result
178
+
179
+ def _detect_language(self, text: str) -> str:
180
+ """
181
+ Detect the language of the text.
182
+
183
+ Args:
184
+ text: Text to analyze
185
+
186
+ Returns:
187
+ Language code (e.g., 'en', 'es', 'fr')
188
+ """
189
+ if not self.enable_language_detection:
190
+ return "en" # Default to English
191
+
192
+ try:
193
+ # Use a sample of text for detection (first 1000 chars)
194
+ sample = text[:1000].strip()
195
+ if len(sample) < 50: # Too short for reliable detection
196
+ return "en"
197
+
198
+ language = detect(sample)
199
+ self.logger.info(f"Detected language: {language}")
200
+ return language
201
+
202
+ except Exception as e:
203
+ self.logger.warning(f"Language detection failed: {str(e)}")
204
+ return "en" # Default to English
205
+
206
+ def _clean_text(self, text: str, preserve_structure: bool = False) -> str:
207
+ """
208
+ Clean and normalize text with advanced processing.
209
+
210
+ Args:
211
+ text: Raw text to clean
212
+ preserve_structure: Whether to preserve formatting
213
+
214
+ Returns:
215
+ Cleaned and normalized text
216
+ """
217
+ # Unicode normalization
218
+ text = unicodedata.normalize("NFKC", text)
219
+
220
+ if not preserve_structure:
221
+ # Basic cleaning operations
222
+ # Remove excessive whitespace but preserve paragraph breaks
223
+ text = re.sub(r"[ \t]+", " ", text) # Multiple spaces/tabs to single space
224
+ text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text) # Multiple newlines to double
225
+
226
+ # Remove or normalize special characters
227
+ # Keep basic punctuation and common symbols
228
+ text = re.sub(r'[^\w\s.,;:!?\'"\-()[\]{}/@#$%&*+=<>|\\~`\n]', " ", text)
229
+
230
+ # Clean up whitespace again
231
+ text = re.sub(r"[ \t]+", " ", text)
232
+ text = re.sub(r"\n\s*\n+", "\n\n", text)
233
+
234
+ # Remove common artifacts
235
+ # Remove page numbers and headers/footers patterns
236
+ text = re.sub(r"\n\s*\d+\s*\n", "\n", text) # Standalone page numbers
237
+ text = re.sub(r"\n\s*Page \d+.*?\n", "\n", text, flags=re.IGNORECASE)
238
+
239
+ # Remove excessive punctuation
240
+ text = re.sub(r"[.]{3,}", "...", text) # Multiple dots
241
+ text = re.sub(r"[-]{3,}", "---", text) # Multiple dashes
242
+
243
+ # Final cleanup
244
+ text = text.strip()
245
+
246
+ return text
247
+
248
+ def _chunk_text(self, text: str) -> List[str]:
249
+ """
250
+ Split text into chunks with intelligent boundary detection.
251
+
252
+ Args:
253
+ text: Cleaned text to chunk
254
+
255
+ Returns:
256
+ List of text chunks
257
+ """
258
+ if len(text) <= self.chunk_size:
259
+ return [text]
260
+
261
+ chunks = []
262
+
263
+ # Try intelligent chunking with spaCy first
264
+ if self.nlp:
265
+ try:
266
+ return self._chunk_with_spacy(text)
267
+ except Exception as e:
268
+ self.logger.warning(f"spaCy chunking failed: {str(e)}")
269
+
270
+ # Fallback to NLTK sentence-based chunking
271
+ try:
272
+ return self._chunk_with_sentences(text)
273
+ except Exception as e:
274
+ self.logger.warning(f"Sentence chunking failed: {str(e)}")
275
+
276
+ # Final fallback to character-based chunking
277
+ return self._chunk_by_characters(text)
278
+
279
+ def _chunk_with_spacy(self, text: str) -> List[str]:
280
+ """
281
+ Intelligent chunking using spaCy for better semantic boundaries.
282
+
283
+ Args:
284
+ text: Text to chunk
285
+
286
+ Returns:
287
+ List of text chunks
288
+ """
289
+ doc = self.nlp(text)
290
+ chunks = []
291
+ current_chunk = []
292
+ current_size = 0
293
+
294
+ for sent in doc.sents:
295
+ sent_text = sent.text.strip()
296
+ sent_size = len(sent_text)
297
+
298
+ # 📏 Check if adding this sentence exceeds chunk size
299
+ if current_size + sent_size > self.chunk_size and current_chunk:
300
+ # 📦 Finalize current chunk
301
+ chunk_text = " ".join(current_chunk)
302
+ chunks.append(chunk_text)
303
+
304
+ # Start new chunk with overlap
305
+ overlap_chunk, overlap_size = self._create_overlap(current_chunk)
306
+ current_chunk = overlap_chunk
307
+ current_size = overlap_size
308
+
309
+ current_chunk.append(sent_text)
310
+ current_size += sent_size
311
+
312
+ # 📦 Add the last chunk
313
+ if current_chunk:
314
+ chunk_text = " ".join(current_chunk)
315
+ if len(chunk_text.strip()) >= self.min_chunk_size:
316
+ chunks.append(chunk_text)
317
+
318
+ return chunks
319
+
320
+ def _chunk_with_sentences(self, text: str) -> List[str]:
321
+ """
322
+ Chunk text using NLTK sentence tokenization.
323
+
324
+ Args:
325
+ text: Text to chunk
326
+
327
+ Returns:
328
+ List of text chunks
329
+ """
330
+ sentences = sent_tokenize(text)
331
+ chunks = []
332
+ current_chunk = []
333
+ current_size = 0
334
+
335
+ for sentence in sentences:
336
+ sentence = sentence.strip()
337
+ sentence_size = len(sentence)
338
+
339
+ # 📏 Check chunk size limit
340
+ if current_size + sentence_size > self.chunk_size and current_chunk:
341
+ # 📦 Finalize current chunk
342
+ chunk_text = " ".join(current_chunk)
343
+ chunks.append(chunk_text)
344
+
345
+ # Create overlap
346
+ overlap_chunk, overlap_size = self._create_overlap(current_chunk)
347
+ current_chunk = overlap_chunk
348
+ current_size = overlap_size
349
+
350
+ current_chunk.append(sentence)
351
+ current_size += sentence_size
352
+
353
+ # 📦 Add final chunk
354
+ if current_chunk:
355
+ chunk_text = " ".join(current_chunk)
356
+ if len(chunk_text.strip()) >= self.min_chunk_size:
357
+ chunks.append(chunk_text)
358
+
359
+ return chunks
360
+
361
+ def _chunk_by_characters(self, text: str) -> List[str]:
362
+ """
363
+ Fallback character-based chunking with boundary detection.
364
+
365
+ Args:
366
+ text: Text to chunk
367
+
368
+ Returns:
369
+ List of text chunks
370
+ """
371
+ chunks = []
372
+ start = 0
373
+
374
+ while start < len(text):
375
+ end = start + self.chunk_size
376
+
377
+ # Try to find a good boundary
378
+ if end < len(text):
379
+ # Look for sentence boundaries first
380
+ for boundary in [". ", "! ", "? ", "\n\n", "\n"]:
381
+ boundary_pos = text.rfind(boundary, start, end)
382
+ if boundary_pos > start + self.min_chunk_size:
383
+ end = boundary_pos + len(boundary)
384
+ break
385
+
386
+ chunk = text[start:end].strip()
387
+ if len(chunk) >= self.min_chunk_size:
388
+ chunks.append(chunk)
389
+
390
+ # Move start position with overlap
391
+ start = max(start + 1, end - self.chunk_overlap)
392
+
393
+ return chunks
394
+
395
+ def _create_overlap(self, sentences: List[str]) -> tuple:
396
+ """
397
+ Create overlap from previous chunk sentences.
398
+
399
+ Args:
400
+ sentences: List of sentences from previous chunk
401
+
402
+ Returns:
403
+ Tuple of (overlap_sentences, overlap_size)
404
+ """
405
+ overlap_sentences = []
406
+ overlap_size = 0
407
+
408
+ # Add sentences from the end for overlap
409
+ for sentence in reversed(sentences):
410
+ if overlap_size + len(sentence) <= self.chunk_overlap:
411
+ overlap_sentences.insert(0, sentence)
412
+ overlap_size += len(sentence)
413
+ else:
414
+ break
415
+
416
+ return overlap_sentences, overlap_size
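To make the overlap rule concrete with the default chunk_overlap of 200: walking backwards over previous-chunk sentences of 120, 90, and 60 characters keeps the 60- and 90-character sentences (150 characters in total) and stops before the 120-character one would exceed the budget. A small sketch with synthetic sentences (the optional spaCy/NLTK models only log a warning if they are missing):

    extractor = TextExtractor({"chunk_overlap": 200})
    tail = ["A" * 120, "B" * 90, "C" * 60]   # sentences that ended the previous chunk
    overlap, size = extractor._create_overlap(tail)
    # overlap == ["B" * 90, "C" * 60]; size == 150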
417
+
418
+ def _analyze_chunk(self, chunk: str) -> Dict[str, Any]:
419
+ """
420
+ Analyze chunk statistics and properties.
421
+
422
+ Args:
423
+ chunk: Text chunk to analyze
424
+
425
+ Returns:
426
+ Dictionary with chunk statistics
427
+ """
428
+ words = chunk.split()
429
+
430
+ stats = {
431
+ "character_count": len(chunk),
432
+ "word_count": len(words),
433
+ "sentence_count": len(sent_tokenize(chunk)) if chunk else 0,
434
+ "avg_word_length": (
435
+ sum(len(word) for word in words) / len(words) if words else 0
436
+ ),
437
+ }
438
+
439
+ # Advanced analysis with spaCy if available
440
+ if self.nlp:
441
+ try:
442
+ doc = self.nlp(chunk)
443
+ stats.update(
444
+ {
445
+ "entity_count": len(doc.ents),
446
+ "noun_count": len(
447
+ [token for token in doc if token.pos_ == "NOUN"]
448
+ ),
449
+ "verb_count": len(
450
+ [token for token in doc if token.pos_ == "VERB"]
451
+ ),
452
+ }
453
+ )
454
+ except Exception:
455
+ pass # Skip advanced analysis if it fails
456
+
457
+ return stats
458
+
459
+ def extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
460
+ """
461
+ Extract keywords from text using NLP techniques.
462
+
463
+ Args:
464
+ text: Text to extract keywords from
465
+ max_keywords: Maximum number of keywords to return
466
+
467
+ Returns:
468
+ List of extracted keywords
469
+ """
470
+ if not self.nlp:
471
+ return []
472
+
473
+ try:
474
+ doc = self.nlp(text)
475
+
476
+ # Extract keywords based on POS tags and frequency
477
+ keywords = []
478
+ word_freq = {}
479
+
480
+ for token in doc:
481
+ if (
482
+ token.pos_ in ["NOUN", "PROPN", "ADJ"]
483
+ and not token.is_stop
484
+ and not token.is_punct
485
+ and len(token.text) > 2
486
+ ):
487
+
488
+ word = token.lemma_.lower()
489
+ word_freq[word] = word_freq.get(word, 0) + 1
490
+
491
+ # Sort by frequency and return top keywords
492
+ sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
493
+ keywords = [word for word, freq in sorted_words[:max_keywords]]
494
+
495
+ return keywords
496
+
497
+ except Exception as e:
498
+ self.logger.warning(f"Keyword extraction failed: {str(e)}")
499
+ return []
500
+
501
+ def get_text_statistics(self, text: str) -> Dict[str, Any]:
502
+ """
503
+ Get comprehensive text statistics.
504
+
505
+ Args:
506
+ text: Text to analyze
507
+
508
+ Returns:
509
+ Dictionary with text statistics
510
+ """
511
+ words = text.split()
512
+ sentences = sent_tokenize(text) if text else []
513
+
514
+ stats = {
515
+ "character_count": len(text),
516
+ "word_count": len(words),
517
+ "sentence_count": len(sentences),
518
+ "paragraph_count": len([p for p in text.split("\n\n") if p.strip()]),
519
+ "avg_words_per_sentence": len(words) / len(sentences) if sentences else 0,
520
+ "avg_chars_per_word": (
521
+ sum(len(word) for word in words) / len(words) if words else 0
522
+ ),
523
+ "language": self._detect_language(text),
524
+ }
525
+
526
+ return stats
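A standalone usage sketch for the extractor. The configuration values are illustrative, long_raw_text stands in for any input longer than min_chunk_size (shorter inputs return an empty list), and NLTK's punkt tokenizer is assumed to be available for the per-chunk statistics:

    extractor = TextExtractor({"chunk_size": 800, "chunk_overlap": 150})
    chunks = extractor.process_text(long_raw_text, metadata={"source": "example.txt"})
    for chunk in chunks:
        meta = chunk["metadata"]
        print(meta["chunk_id"], meta["word_count"], meta["language"])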
src/ingestion/url_processor.py ADDED
@@ -0,0 +1,603 @@
1
+ """
2
+ URL Processor Module
3
+
4
+ This module is responsible for crawling and extracting content from provided URLs,
5
+ including nested documents and links with complete web scraping functionality.
6
+
7
+ Technologies: BeautifulSoup, requests, trafilatura
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ import re
13
+ from datetime import datetime
14
+ from typing import Dict, List, Any, Optional, Set
15
+ from urllib.parse import urlparse, urljoin, urlunparse
16
+ from urllib.robotparser import RobotFileParser
17
+
18
+ # Import web scraping libraries
19
+ try:
20
+ import requests
21
+ from bs4 import BeautifulSoup
22
+ import trafilatura
23
+ from requests.adapters import HTTPAdapter
24
+ from urllib3.util.retry import Retry
25
+ except ImportError as e:
26
+ logging.warning(f"Some web scraping libraries are not installed: {e}")
27
+
28
+ from utils.error_handler import URLProcessingError, error_handler, ErrorType
29
+
30
+
31
+ class URLProcessor:
32
+ """
33
+ Processes URLs to extract content from web pages and linked documents with full functionality.
34
+
35
+ Features:
36
+ - Web page content extraction with trafilatura
37
+ - Recursive link following with depth control
38
+ - Rate limiting and retry logic
39
+ - Robots.txt respect
40
+ - Multiple content type handling
41
+ """
42
+
43
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
44
+ """
45
+ Initialize the URLProcessor with configuration.
46
+
47
+ Args:
48
+ config: Configuration dictionary with processing parameters
49
+ """
50
+ self.config = config or {}
51
+ self.logger = logging.getLogger(__name__)
52
+
53
+ # Configuration settings
54
+ self.max_depth = self.config.get("max_depth", 1)
55
+ self.follow_links = self.config.get("follow_links", True)
56
+ self.max_pages = self.config.get("max_pages", 10)
57
+ self.timeout = self.config.get("timeout", 10)
58
+ self.user_agent = self.config.get("user_agent", "RAG-AI-Bot/1.0")
59
+ self.respect_robots_txt = self.config.get("respect_robots_txt", True)
60
+ self.rate_limit_delay = self.config.get("rate_limit_delay", 1.0)
61
+
62
+ # Retry configuration
63
+ self.max_retries = 3
64
+ self.backoff_factor = 0.3
65
+
66
+ # Track visited URLs and robots.txt cache
67
+ self.visited_urls: Set[str] = set()
68
+ self.robots_cache: Dict[str, RobotFileParser] = {}
69
+ self.last_request_time: Dict[str, float] = {}
70
+
71
+ # Setup session with retry strategy
72
+ self.session = self._setup_session()
73
+
74
+ def _setup_session(self) -> requests.Session:
75
+ """
76
+ Setup requests session with retry strategy and headers.
77
+
78
+ Returns:
79
+ Configured requests session
80
+ """
81
+ session = requests.Session()
82
+
83
+ # Retry strategy
84
+ retry_strategy = Retry(
85
+ total=self.max_retries,
86
+ backoff_factor=self.backoff_factor,
87
+ status_forcelist=[429, 500, 502, 503, 504],
88
+ )
89
+
90
+ adapter = HTTPAdapter(max_retries=retry_strategy)
91
+ session.mount("http://", adapter)
92
+ session.mount("https://", adapter)
93
+
94
+ # 🏷️ Default headers
95
+ session.headers.update(
96
+ {
97
+ "User-Agent": self.user_agent,
98
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
99
+ "Accept-Language": "en-US,en;q=0.5",
100
+ "Accept-Encoding": "gzip, deflate",
101
+ "Connection": "keep-alive",
102
+ }
103
+ )
104
+
105
+ return session
106
+
107
+ @error_handler(ErrorType.URL_PROCESSING)
108
+ def process_url(self, url: str, depth: int = 0) -> Dict[str, Any]:
109
+ """
110
+ Process a URL and extract its content with full functionality.
111
+
112
+ Args:
113
+ url: The URL to process
114
+ depth: Current crawling depth
115
+
116
+ Returns:
117
+ Dictionary containing extracted text and metadata
118
+ """
119
+ # Validation checks
120
+ if not url or not self._is_valid_url(url):
121
+ raise URLProcessingError(f"Invalid URL: {url}", url)
122
+
123
+ if depth > self.max_depth:
124
+ self.logger.info(f"🛑 Max depth reached for: {url}")
125
+ return {}
126
+
127
+ if len(self.visited_urls) >= self.max_pages:
128
+ self.logger.info("🛑 Max pages limit reached")
129
+ return {}
130
+
131
+ if url in self.visited_urls:
132
+ self.logger.info(f"Already visited: {url}")
133
+ return {}
134
+
135
+ # Check robots.txt if enabled
136
+ if self.respect_robots_txt and not self._can_fetch(url):
137
+ self.logger.info(f"Robots.txt disallows: {url}")
138
+ return {}
139
+
140
+ self.visited_urls.add(url)
141
+ self.logger.info(f"Processing URL: {url} (depth: {depth})")
142
+
143
+ try:
144
+ # Rate limiting
145
+ self._apply_rate_limit(url)
146
+
147
+ # Fetch and extract content
148
+ content, metadata = self._extract_content(url)
149
+
150
+ if not content:
151
+ self.logger.warning(f"No content extracted from: {url}")
152
+ return {}
153
+
154
+ result = {
155
+ "content": content,
156
+ "metadata": metadata,
157
+ "source": url,
158
+ "depth": depth,
159
+ "linked_documents": [],
160
+ "document_type": "webpage",
161
+ "crawl_stats": {
162
+ "max_depth_configured": self.max_depth,
163
+ "follow_links_enabled": self.follow_links,
164
+ "current_depth": depth,
165
+ },
166
+ }
167
+
168
+ # Follow links if configured and not at max depth
169
+ if (
170
+ self.follow_links
171
+ and depth < self.max_depth
172
+ and len(self.visited_urls) < self.max_pages
173
+ ):
174
+ links = self._extract_links(url, content)
175
+ self.logger.info(f"Found {len(links)} links on {url}")
176
+
177
+ for link in links[:5]: # Limit links per page
178
+ try:
179
+ linked_content = self.process_url(link, depth + 1)
180
+ if linked_content:
181
+ result["linked_documents"].append(linked_content)
182
+ except Exception as e:
183
+ self.logger.warning(
184
+ f"Failed to process linked URL {link}: {str(e)}"
185
+ )
186
+ continue
187
+
188
+ return result
189
+
190
+ except Exception as e:
191
+ raise URLProcessingError(f"Error processing URL: {str(e)}", url)
192
+
193
+ def process_batch(self, urls: List[str]) -> List[Dict[str, Any]]:
194
+ """
195
+ Process multiple URLs in batch.
196
+
197
+ Args:
198
+ urls: List of URLs to process
199
+
200
+ Returns:
201
+ List of processed URL results
202
+ """
203
+ results = []
204
+ self.logger.info(f"Processing batch of {len(urls)} URLs")
205
+
206
+ for i, url in enumerate(urls):
207
+ try:
208
+ result = self.process_url(url)
209
+ if result:
210
+ results.append(result)
211
+ self.logger.info(f"Processed {i+1}/{len(urls)}: {url}")
212
+ except Exception as e:
213
+ self.logger.error(f"❌ Failed to process {url}: {str(e)}")
214
+ continue
215
+
216
+ return results
217
+
218
+ def _is_valid_url(self, url: str) -> bool:
219
+ """
220
+ Validate URL format and scheme.
221
+
222
+ Args:
223
+ url: URL to validate
224
+
225
+ Returns:
226
+ True if URL is valid
227
+ """
228
+ try:
229
+ parsed = urlparse(url)
230
+ return bool(parsed.netloc) and parsed.scheme in ["http", "https"]
231
+ except Exception:
232
+ return False
233
+
234
+ def _can_fetch(self, url: str) -> bool:
235
+ """
236
+ Check if URL can be fetched according to robots.txt.
237
+
238
+ Args:
239
+ url: URL to check
240
+
241
+ Returns:
242
+ True if URL can be fetched
243
+ """
244
+ try:
245
+ parsed_url = urlparse(url)
246
+ base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
247
+
248
+ if base_url not in self.robots_cache:
249
+ robots_url = urljoin(base_url, "/robots.txt")
250
+ rp = RobotFileParser()
251
+ rp.set_url(robots_url)
252
+
253
+ try:
254
+ rp.read()
255
+ self.robots_cache[base_url] = rp
256
+ except Exception:
257
+ # If robots.txt can't be fetched, assume allowed
258
+ return True
259
+
260
+ return self.robots_cache[base_url].can_fetch(self.user_agent, url)
261
+
262
+ except Exception:
263
+ # If robots.txt check fails, assume allowed
264
+ return True
265
+
266
+ def _apply_rate_limit(self, url: str) -> None:
267
+ """
268
+ Apply rate limiting between requests to the same domain.
269
+
270
+ Args:
271
+ url: URL being processed
272
+ """
273
+ domain = urlparse(url).netloc
274
+ current_time = time.time()
275
+
276
+ if domain in self.last_request_time:
277
+ time_since_last = current_time - self.last_request_time[domain]
278
+ if time_since_last < self.rate_limit_delay:
279
+ sleep_time = self.rate_limit_delay - time_since_last
280
+ self.logger.info(
281
+ f"Rate limiting: sleeping {sleep_time:.1f}s for {domain}"
282
+ )
283
+ time.sleep(sleep_time)
284
+
285
+ self.last_request_time[domain] = time.time()
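As a quick sketch of how this throttle behaves with the default rate_limit_delay of 1.0 seconds (timings are approximate, and the scraping dependencies are assumed installed so the session can be built):

    processor = URLProcessor({"rate_limit_delay": 1.0})
    processor._apply_rate_limit("https://example.com/a")  # first hit on the domain, no wait
    processor._apply_rate_limit("https://example.com/b")  # sleeps roughly the remaining ~1s
    processor._apply_rate_limit("https://other.org/")     # different domain, no wait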
286
+
287
+ def _extract_content(self, url: str) -> tuple:
288
+ """
289
+ Extract content from a web page using trafilatura with BeautifulSoup fallback.
290
+
291
+ Args:
292
+ url: The URL to extract content from
293
+
294
+ Returns:
295
+ Tuple of (content, metadata)
296
+ """
297
+ self.logger.info(f"Extracting content from: {url}")
298
+
299
+ try:
300
+ # Fetch the page
301
+ response = self.session.get(url, timeout=self.timeout)
302
+ response.raise_for_status()
303
+
304
+ # Basic metadata
305
+ metadata = {
306
+ "url": url,
307
+ "status_code": response.status_code,
308
+ "content_type": response.headers.get("content-type", ""),
309
+ "content_length": len(response.content),
310
+ "extracted_time": datetime.now().isoformat(),
311
+ "encoding": response.encoding or "utf-8",
312
+ }
313
+
314
+ # Check content type
315
+ content_type = response.headers.get("content-type", "").lower()
316
+
317
+ if "application/pdf" in content_type:
318
+ return self._handle_pdf_url(response, metadata)
319
+ elif "text/html" not in content_type and "text/plain" not in content_type:
320
+ self.logger.warning(f"Unsupported content type: {content_type}")
321
+ return "", metadata
322
+
323
+ # Primary method: trafilatura (best for content extraction)
324
+ try:
325
+ content = trafilatura.extract(
326
+ response.text,
327
+ include_comments=False,
328
+ include_tables=True,
329
+ include_formatting=False,
330
+ favor_precision=True,
331
+ )
332
+
333
+ if content and len(content.strip()) > 50: # Minimum content threshold
334
+ # Extract additional metadata with trafilatura
335
+ metadata_extracted = trafilatura.extract_metadata(response.text)
336
+ if metadata_extracted:
337
+ metadata.update(
338
+ {
339
+ "title": metadata_extracted.title or "",
340
+ "author": metadata_extracted.author or "",
341
+ "description": metadata_extracted.description or "",
342
+ "sitename": metadata_extracted.sitename or "",
343
+ "date": metadata_extracted.date or "",
344
+ }
345
+ )
346
+
347
+ metadata.update(
348
+ {
349
+ "extraction_method": "trafilatura",
350
+ "word_count": len(content.split()),
351
+ "character_count": len(content),
352
+ }
353
+ )
354
+
355
+ return content.strip(), metadata
356
+
357
+ except Exception as e:
358
+ self.logger.warning(f"Trafilatura failed: {str(e)}")
359
+
360
+ # Fallback method: BeautifulSoup
361
+ return self._extract_with_beautifulsoup(response.text, metadata)
362
+
363
+ except requests.RequestException as e:
364
+ raise URLProcessingError(f"Failed to fetch URL: {str(e)}", url)
365
+ except Exception as e:
366
+ raise URLProcessingError(f"Content extraction failed: {str(e)}", url)
367
+
368
+ def _extract_with_beautifulsoup(self, html: str, metadata: Dict[str, Any]) -> tuple:
369
+ """
370
+ Fallback content extraction using BeautifulSoup.
371
+
372
+ Args:
373
+ html: HTML content
374
+ metadata: Existing metadata dictionary
375
+
376
+ Returns:
377
+ Tuple of (content, metadata)
378
+ """
379
+ try:
380
+ soup = BeautifulSoup(html, "html.parser")
381
+
382
+ # Extract metadata
383
+ title_tag = soup.find("title")
384
+ if title_tag:
385
+ metadata["title"] = title_tag.get_text().strip()
386
+
387
+ # Meta tags
388
+ for meta in soup.find_all("meta"):
389
+ name = meta.get("name", "").lower()
390
+ content = meta.get("content", "")
391
+ if name == "description":
392
+ metadata["description"] = content
393
+ elif name == "author":
394
+ metadata["author"] = content
395
+
396
+ # Remove unwanted elements
397
+ for element in soup(
398
+ ["script", "style", "nav", "header", "footer", "aside"]
399
+ ):
400
+ element.decompose()
401
+
402
+ # Extract main content
403
+ content_selectors = [
404
+ "main",
405
+ "article",
406
+ ".content",
407
+ "#content",
408
+ ".post",
409
+ ".entry",
410
+ ]
411
+
412
+ content = ""
413
+ for selector in content_selectors:
414
+ content_elem = soup.select_one(selector)
415
+ if content_elem:
416
+ content = content_elem.get_text(separator="\n", strip=True)
417
+ break
418
+
419
+ # Fallback to body if no main content found
420
+ if not content:
421
+ body = soup.find("body")
422
+ if body:
423
+ content = body.get_text(separator="\n", strip=True)
424
+
425
+ # Clean and validate content
426
+ content = re.sub(r"\n\s*\n", "\n\n", content) # Clean multiple newlines
427
+ content = content.strip()
428
+
429
+ metadata.update(
430
+ {
431
+ "extraction_method": "beautifulsoup",
432
+ "word_count": len(content.split()),
433
+ "character_count": len(content),
434
+ }
435
+ )
436
+
437
+ return content, metadata
438
+
439
+ except Exception as e:
440
+ self.logger.error(f"❌ BeautifulSoup extraction failed: {str(e)}")
441
+ return "", metadata
442
+
443
+ def _handle_pdf_url(
444
+ self, response: requests.Response, metadata: Dict[str, Any]
445
+ ) -> tuple:
446
+ """
447
+ 📄 Handle PDF content from URL.
448
+
449
+ Args:
450
+ response: HTTP response containing PDF
451
+ metadata: Existing metadata
452
+
453
+ Returns:
454
+ Tuple of (content, metadata)
455
+ """
456
+ self.logger.info("📄 Detected PDF content, extracting text...")
457
+
458
+ try:
459
+ # Save PDF temporarily and process with document processor
460
+ import tempfile
461
+ import os
462
+ from .document_processor import DocumentProcessor
463
+
464
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
465
+ tmp_file.write(response.content)
466
+ tmp_file.flush()
467
+
468
+ # Process PDF
469
+ doc_processor = DocumentProcessor(self.config)
470
+ result = doc_processor.process_document(tmp_file.name)
471
+
472
+ # Cleanup
473
+ os.unlink(tmp_file.name)
474
+
475
+ metadata.update(
476
+ {
477
+ "document_type": "pdf_from_url",
478
+ "extraction_method": "document_processor",
479
+ }
480
+ )
481
+ metadata.update(result.get("metadata", {}))
482
+
483
+ return result.get("content", ""), metadata
484
+
485
+ except Exception as e:
486
+ self.logger.error(f"❌ PDF extraction failed: {str(e)}")
487
+ return "", metadata
488
+
489
+ def _extract_links(self, url: str, content: str) -> List[str]:
490
+ """
491
+ Extract links from a web page.
492
+
493
+ Args:
494
+ url: The source URL
495
+ content: Page content (for context)
496
+
497
+ Returns:
498
+ List of discovered URLs
499
+ """
500
+ self.logger.info(f"Extracting links from: {url}")
501
+
502
+ try:
503
+ response = self.session.get(url, timeout=self.timeout)
504
+ soup = BeautifulSoup(response.text, "html.parser")
505
+
506
+ links = []
507
+ base_domain = urlparse(url).netloc
508
+
509
+ for a_tag in soup.find_all("a", href=True):
510
+ href = a_tag.get("href")
511
+ if not href:
512
+ continue
513
+
514
+ # Convert relative URLs to absolute
515
+ absolute_url = urljoin(url, href)
516
+
517
+ # Filter links
518
+ if self._should_follow_link(absolute_url, base_domain):
519
+ links.append(absolute_url)
520
+
521
+ # 🎯 Remove duplicates and limit
522
+ unique_links = list(dict.fromkeys(links)) # Preserve order
523
+ return unique_links[:20] # Limit to prevent explosion
524
+
525
+ except Exception as e:
526
+ self.logger.error(f"❌ Link extraction failed: {str(e)}")
527
+ return []
528
+
529
+ def _should_follow_link(self, url: str, base_domain: str) -> bool:
530
+ """
531
+ Determine if a link should be followed.
532
+
533
+ Args:
534
+ url: URL to check
535
+ base_domain: Base domain of the source page
536
+
537
+ Returns:
538
+ True if link should be followed
539
+ """
540
+ try:
541
+ parsed = urlparse(url)
542
+
543
+ # Skip non-HTTP(S) links
544
+ if parsed.scheme not in ["http", "https"]:
545
+ return False
546
+
547
+ # Skip already visited
548
+ if url in self.visited_urls:
549
+ return False
550
+
551
+ # Skip file downloads (basic check)
552
+ path = parsed.path.lower()
553
+ skip_extensions = [
554
+ ".pdf",
555
+ ".doc",
556
+ ".docx",
557
+ ".zip",
558
+ ".exe",
559
+ ".dmg",
560
+ ".jpg",
561
+ ".png",
562
+ ".gif",
563
+ ]
564
+ if any(path.endswith(ext) for ext in skip_extensions):
565
+ return False
566
+
567
+ # Skip fragments and query-heavy URLs
568
+ if parsed.fragment or len(parsed.query) > 100:
569
+ return False
570
+
571
+ # Prefer same domain (but allow subdomains)
572
+ link_domain = parsed.netloc
573
+ if not (
574
+ link_domain == base_domain or link_domain.endswith("." + base_domain)
575
+ ):
576
+ return False
577
+
578
+ return True
579
+
580
+ except Exception:
581
+ return False
582
+
583
+ def reset(self):
584
+ """Reset the processor state, clearing visited URLs and caches."""
585
+ self.visited_urls.clear()
586
+ self.robots_cache.clear()
587
+ self.last_request_time.clear()
588
+ self.logger.info("URL processor state reset")
589
+
590
+ def get_statistics(self) -> Dict[str, Any]:
591
+ """
592
+ Get processing statistics.
593
+
594
+ Returns:
595
+ Dictionary with processing statistics
596
+ """
597
+ return {
598
+ "urls_processed": len(self.visited_urls),
599
+ "domains_cached": len(self.robots_cache),
600
+ "rate_limited_domains": len(self.last_request_time),
601
+ "max_pages_limit": self.max_pages,
602
+ "max_depth_limit": self.max_depth,
603
+ }
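A usage sketch for the processor as a whole; the URL is a placeholder, and live network access plus requests, BeautifulSoup, and trafilatura are assumed to be available:

    processor = URLProcessor({"max_depth": 1, "max_pages": 5, "follow_links": True})
    page = processor.process_url("https://example.com/docs")
    if page:
        print(page["metadata"].get("title", ""), "-", len(page["content"]), "characters")
        print("linked pages captured:", len(page["linked_documents"]))
    print(processor.get_statistics())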
src/integrations/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """
2
+ Integrations Module
3
+
4
+ This module contains integrations with external services and APIs
5
+ for enhanced RAG functionality.
6
+
7
+ Available Integrations:
8
+ - MCP Tavily: Live web search via Model Context Protocol
9
+ """
10
+
11
+ from .mcp_tavily_integration import MCPTavilyIntegration, create_mcp_tavily_client
12
+
13
+ __all__ = ["MCPTavilyIntegration", "create_mcp_tavily_client"]
src/integrations/mcp_tavily_integration.py ADDED
@@ -0,0 +1,308 @@
1
+ """
2
+ MCP Tavily Integration Module
3
+
4
+ This module demonstrates how to integrate Tavily API via MCP (Model Context Protocol)
5
+ for live web search functionality in the RAG system.
6
+
7
+ Technology: MCP + Tavily API
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ from typing import Dict, List, Any, Optional
13
+ from datetime import datetime
14
+
15
+
16
+ class MCPTavilyIntegration:
17
+ """
18
+ Handles MCP integration with Tavily API for live web search.
19
+
20
+ This class provides the bridge between the RAG system and Tavily's
21
+ search capabilities through the Model Context Protocol.
22
+ """
23
+
24
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
25
+ """
26
+ Initialize MCP Tavily integration.
27
+
28
+ Args:
29
+ config: Configuration dictionary
30
+ """
31
+ self.config = config or {}
32
+ self.logger = logging.getLogger(__name__)
33
+
34
+ # 🔧 MCP Configuration
35
+ self.server_name = self.config.get("mcp_server_name", "tavily-mcp")
36
+ self.tool_name = self.config.get("mcp_tool_name", "tavily-search")
37
+ self.timeout = self.config.get("timeout", 30)
38
+
39
+ self.logger.info("MCP Tavily Integration initialized")
40
+
41
+ def search_web(
42
+ self,
43
+ query: str,
44
+ max_results: int = 5,
45
+ search_depth: str = "basic",
46
+ time_range: str = "month",
47
+ topic: str = "general",
48
+ ) -> Dict[str, Any]:
49
+ """
50
+ Perform web search using Tavily API via MCP.
51
+
52
+ Args:
53
+ query: Search query
54
+ max_results: Maximum number of results
55
+ search_depth: Search depth (basic/advanced)
56
+ time_range: Time range for results
57
+ topic: Search topic category
58
+
59
+ Returns:
60
+ Dictionary with search results
61
+ """
62
+ try:
63
+ self.logger.info(f" MCP Tavily search: '{query}' (depth: {search_depth})")
64
+
65
+ # 🚀 Prepare MCP arguments
66
+ mcp_arguments = {
67
+ "query": query,
68
+ "max_results": min(max_results, 20), # Tavily limit
69
+ "search_depth": search_depth,
70
+ "topic": topic,
71
+ "include_raw_content": True,
72
+ "time_range": time_range,
73
+ }
74
+
75
+ # 🌐 This is where the actual MCP call would be made
76
+ # In a real implementation, this would use the MCP client:
77
+
78
+ """
79
+ Example MCP call structure:
80
+
81
+ result = use_mcp_tool(
82
+ server_name=self.server_name,
83
+ tool_name=self.tool_name,
84
+ arguments=mcp_arguments
85
+ )
86
+ """
87
+
88
+ # 🚧 For demonstration, we'll simulate the MCP response structure
89
+ simulated_result = self._simulate_tavily_response(query, max_results)
90
+
91
+ # 🔄 Process and validate MCP response
92
+ processed_result = self._process_mcp_response(simulated_result, query)
93
+
94
+ self.logger.info(
95
+ f" MCP search completed: {processed_result.get('total_results', 0)} results"
96
+ )
97
+ return processed_result
98
+
99
+ except Exception as e:
100
+ self.logger.error(f" MCP Tavily search failed: {str(e)}")
101
+ return {
102
+ "query": query,
103
+ "results": [],
104
+ "total_results": 0,
105
+ "error": str(e),
106
+ "status": "mcp_error",
107
+ }
108
+
109
+ def _simulate_tavily_response(self, query: str, max_results: int) -> Dict[str, Any]:
110
+ """
111
+ Simulate Tavily API response for demonstration.
112
+
113
+ In production, this would be replaced by actual MCP call results.
114
+ """
115
+ # 🚧 Simulated response structure matching Tavily API
116
+ return {
117
+ "query": query,
118
+ "follow_up_questions": None,
119
+ "answer": f"Based on web search for '{query}'...",
120
+ "images": [],
121
+ "results": [
122
+ {
123
+ "title": f"Example Result 1 for {query}",
124
+ "url": "https://example.com/result1",
125
+ "content": f"This is example content related to {query}. It provides comprehensive information about the topic.",
126
+ "raw_content": f"Raw content for {query} with additional details...",
127
+ "published_date": "2024-01-15",
128
+ "score": 0.95,
129
+ },
130
+ {
131
+ "title": f"Example Result 2 for {query}",
132
+ "url": "https://example.com/result2",
133
+ "content": f"Another relevant result for {query} with different perspective and insights.",
134
+ "raw_content": f"Extended raw content for {query}...",
135
+ "published_date": "2024-01-14",
136
+ "score": 0.87,
137
+ },
138
+ ][:max_results],
139
+ "response_time": 1.2,
140
+ }
141
+
142
+ def _process_mcp_response(
143
+ self, mcp_result: Dict[str, Any], original_query: str
144
+ ) -> Dict[str, Any]:
145
+ """
146
+ Process and validate MCP response from Tavily.
147
+
148
+ Args:
149
+ mcp_result: Raw MCP response
150
+ original_query: Original search query
151
+
152
+ Returns:
153
+ Processed search results
154
+ """
155
+ try:
156
+ # 🔍 Extract results from MCP response
157
+ raw_results = mcp_result.get("results", [])
158
+
159
+ # 🔄 Process each result
160
+ processed_results = []
161
+ for i, result in enumerate(raw_results):
162
+ processed_result = {
163
+ "title": result.get("title", f"Web Result {i+1}"),
164
+ "url": result.get("url", ""),
165
+ "content": result.get("content", ""),
166
+ "raw_content": result.get("raw_content", ""),
167
+ "score": result.get("score", 0.0),
168
+ "published_date": result.get("published_date", ""),
169
+ "rank": i + 1,
170
+ "source": "tavily_web_search",
171
+ "search_engine": "tavily",
172
+ "metadata": {
173
+ "title": result.get("title", ""),
174
+ "url": result.get("url", ""),
175
+ "content_length": len(result.get("content", "")),
176
+ "has_raw_content": bool(result.get("raw_content")),
177
+ "search_rank": i + 1,
178
+ "published_date": result.get("published_date", ""),
179
+ },
180
+ }
181
+ processed_results.append(processed_result)
182
+
183
+ # 📊 Prepare final response
184
+ return {
185
+ "query": original_query,
186
+ "results": processed_results,
187
+ "total_results": len(processed_results),
188
+ "answer": mcp_result.get("answer", ""),
189
+ "follow_up_questions": mcp_result.get("follow_up_questions", []),
190
+ "response_time": mcp_result.get("response_time", 0),
191
+ "timestamp": datetime.now(),
192
+ "status": "success",
193
+ "source": "mcp_tavily",
194
+ }
195
+
196
+ except Exception as e:
197
+ self.logger.error(f" Error processing MCP response: {str(e)}")
198
+ return {
199
+ "query": original_query,
200
+ "results": [],
201
+ "total_results": 0,
202
+ "error": f"Response processing failed: {str(e)}",
203
+ "status": "processing_error",
204
+ }
205
+
206
+ def test_connection(self) -> Dict[str, Any]:
207
+ """
208
+ Test MCP connection to Tavily.
209
+
210
+ Returns:
211
+ Connection test results
212
+ """
213
+ try:
214
+ self.logger.info(" Testing MCP Tavily connection...")
215
+
216
+ # 🔍 Simple test query
217
+ test_result = self.search_web(
218
+ query="test connection", max_results=1, search_depth="basic"
219
+ )
220
+
221
+ if test_result.get("status") == "success":
222
+ return {
223
+ "status": "success",
224
+ "message": " MCP Tavily connection successful",
225
+ "server_name": self.server_name,
226
+ "tool_name": self.tool_name,
227
+ "response_time": test_result.get("response_time", 0),
228
+ }
229
+ else:
230
+ return {
231
+ "status": "error",
232
+ "message": " MCP Tavily connection failed",
233
+ "error": test_result.get("error", "Unknown error"),
234
+ }
235
+
236
+ except Exception as e:
237
+ self.logger.error(f" MCP connection test failed: {str(e)}")
238
+ return {
239
+ "status": "error",
240
+ "message": " MCP connection test failed",
241
+ "error": str(e),
242
+ }
243
+
244
+ def get_server_info(self) -> Dict[str, Any]:
245
+ """
246
+ Get MCP server information.
247
+
248
+ Returns:
249
+ Server information dictionary
250
+ """
251
+ return {
252
+ "server_name": self.server_name,
253
+ "tool_name": self.tool_name,
254
+ "timeout": self.timeout,
255
+ "status": "configured",
256
+ "description": "MCP integration for Tavily web search API",
257
+ }
258
+
259
+
260
+ # 🔧 Helper function for easy integration
261
+ def create_mcp_tavily_client(
262
+ config: Optional[Dict[str, Any]] = None,
263
+ ) -> MCPTavilyIntegration:
264
+ """
265
+ Create and configure MCP Tavily client.
266
+
267
+ Args:
268
+ config: Optional configuration dictionary
269
+
270
+ Returns:
271
+ Configured MCPTavilyIntegration instance
272
+ """
273
+ return MCPTavilyIntegration(config)
274
+
275
+
276
+ # 📝 Example usage and integration guide
277
+ if __name__ == "__main__":
278
+ """
279
+ Example usage of MCP Tavily Integration
280
+
281
+ This demonstrates how to use the MCP integration in your RAG system.
282
+ """
283
+
284
+ # 🔧 Configure MCP client
285
+ config = {
286
+ "mcp_server_name": "tavily-mcp",
287
+ "mcp_tool_name": "tavily-search",
288
+ "timeout": 30,
289
+ }
290
+
291
+ # 🚀 Create client
292
+ mcp_client = create_mcp_tavily_client(config)
293
+
294
+ # 🧪 Test connection
295
+ connection_test = mcp_client.test_connection()
296
+ print(f"Connection test: {connection_test}")
297
+
298
+ # 🔍 Example search
299
+ search_result = mcp_client.search_web(
300
+ query="latest AI developments 2024",
301
+ max_results=5,
302
+ search_depth="basic",
303
+ time_range="month",
304
+ )
305
+
306
+ print(f"Search results: {search_result.get('total_results', 0)} found")
307
+ for result in search_result.get("results", []):
308
+ print(f"- {result['title']}: {result['url']}")
src/rag/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ RAG (Retrieval Augmented Generation) module.
3
+
4
+ This module contains components for processing queries and
5
+ generating responses using retrieved knowledge.
6
+ """
src/rag/live_search.py ADDED
@@ -0,0 +1,523 @@
1
+ """
2
+ Live Search Processor using Tavily Python Client.
3
+ Provides real-time web search capabilities for the RAG system.
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ import time
9
+ from typing import Dict, List, Any, Optional
10
+ from datetime import datetime, timedelta
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class LiveSearchProcessor:
16
+ """Handles live web search using Tavily Python Client."""
17
+
18
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
19
+ """
20
+ Initialize the LiveSearchProcessor.
21
+
22
+ Args:
23
+ config: Configuration dictionary containing live search settings
24
+ """
25
+ self.config = config or {}
26
+ self.logger = logging.getLogger(__name__)
27
+
28
+ # Search configuration
29
+ self.enabled = self.config.get("enabled", False)
30
+ self.max_results = self.config.get("max_results", 5)
31
+ self.search_depth = self.config.get("search_depth", "basic")
32
+ self.include_answer = self.config.get("include_answer", True)
33
+ self.include_raw_content = self.config.get("include_raw_content", False)
34
+ self.include_images = self.config.get("include_images", False)
35
+ self.topic = self.config.get("topic", "general")
36
+ self.enable_caching = self.config.get("enable_caching", True)
37
+
38
+ # Search cache and analytics
39
+ self.search_cache = {}
40
+ self.search_history = []
41
+
42
+ # Initialize Tavily client
43
+ self.tavily_client = None
44
+ self._initialize_client()
45
+
46
+ self.logger.info(f"LiveSearchProcessor initialized - Enabled: {self.enabled}")
47
+
48
+ def _initialize_client(self):
49
+ """Initialize the Tavily client."""
50
+ try:
51
+ # Get API key from environment variable
52
+ api_key = os.getenv("TAVILY_API_KEY")
53
+
54
+ if not api_key:
55
+ self.logger.warning("TAVILY_API_KEY not found in environment variables")
56
+ self.enabled = False
57
+ return
58
+
59
+ # Import and initialize Tavily client
60
+ from tavily import TavilyClient
61
+
62
+ self.tavily_client = TavilyClient(api_key=api_key)
63
+
64
+ # ✅ Auto-enable if client initializes successfully and no explicit config
65
+ if self.tavily_client and not self.config.get(
66
+ "enabled_explicitly_set", False
67
+ ):
68
+ self.enabled = True
69
+ self.logger.info(
70
+ "Tavily client initialized successfully - Auto-enabled live search"
71
+ )
72
+ else:
73
+ self.logger.info("Tavily client initialized successfully")
74
+
75
+ except ImportError:
76
+ self.logger.error(
77
+ "tavily-python package not installed. Install with: pip install tavily-python"
78
+ )
79
+ self.enabled = False
80
+ except Exception as e:
81
+ self.logger.error(f"Failed to initialize Tavily client: {str(e)}")
82
+ self.enabled = False
83
+
84
+ def is_enabled(self) -> bool:
85
+ """Check if live search is enabled."""
86
+ return self.enabled and self.tavily_client is not None
87
+
88
+ def search_web(
89
+ self,
90
+ query: str,
91
+ max_results: Optional[int] = None,
92
+ search_depth: Optional[str] = None,
93
+ time_range: Optional[str] = None,
94
+ ) -> Dict[str, Any]:
95
+ """
96
+ Perform live web search using Tavily API.
97
+
98
+ Args:
99
+ query: Search query string
100
+ max_results: Maximum number of results to return
101
+ search_depth: Search depth ('basic' or 'advanced')
102
+ time_range: Time range for search results
103
+
104
+ Returns:
105
+ Dictionary containing search results and metadata
106
+ """
107
+ if not query or not query.strip():
108
+ return {
109
+ "query": query,
110
+ "results": [],
111
+ "total_results": 0,
112
+ "error": "Empty query provided",
113
+ "source": "live_search",
114
+ }
115
+
116
+ if not self.is_enabled():
117
+ self.logger.warning("Live search is disabled or client not initialized")
118
+ return {
119
+ "query": query,
120
+ "results": [],
121
+ "total_results": 0,
122
+ "error": "Live search is disabled or Tavily client not initialized",
123
+ "source": "live_search",
124
+ }
125
+
126
+ self.logger.info(f"Performing live search: {query[:100]}...")
127
+ start_time = time.time()
128
+
129
+ try:
130
+ # Use provided parameters or defaults
131
+ search_max_results = max_results or self.max_results
132
+ search_depth_param = search_depth or self.search_depth
133
+
134
+ # Check cache first
135
+ cache_key = self._generate_cache_key(
136
+ query, search_max_results, search_depth_param
137
+ )
138
+ if self.enable_caching and cache_key in self.search_cache:
139
+ cached_result = self.search_cache[cache_key]
140
+ if self._is_cache_valid(cached_result["timestamp"]):
141
+ self.logger.info("Returning cached search result")
142
+ cached_result["from_cache"] = True
143
+ return cached_result
144
+
145
+ # Prepare search parameters
146
+ search_params = {
147
+ "query": query,
148
+ "max_results": min(search_max_results, 20), # Tavily limit
149
+ "search_depth": search_depth_param,
150
+ "include_answer": self.include_answer,
151
+ "include_raw_content": self.include_raw_content,
152
+ "include_images": self.include_images,
153
+ "topic": self.topic,
154
+ }
155
+
156
+ # Add time_range if provided
157
+ if time_range:
158
+ search_params["time_range"] = time_range
159
+
160
+ # Perform the search
161
+ response = self.tavily_client.search(**search_params)
162
+
163
+ # Process and format results
164
+ processed_results = self._process_search_results(
165
+ response.get("results", []), query
166
+ )
167
+
168
+ # Prepare final result
169
+ result = {
170
+ "query": query,
171
+ "results": processed_results,
172
+ "total_results": len(processed_results),
173
+ "answer": response.get("answer"),
174
+ "images": response.get("images", []),
175
+ "follow_up_questions": response.get("follow_up_questions", []),
176
+ "search_params": {
177
+ "max_results": search_max_results,
178
+ "search_depth": search_depth_param,
179
+ "time_range": time_range,
180
+ },
181
+ "processing_time": time.time() - start_time,
182
+ "timestamp": datetime.now(),
183
+ "source": "live_search",
184
+ "from_cache": False,
185
+ "search_metadata": {
186
+ "source": "tavily",
187
+ "timestamp": datetime.now().isoformat(),
188
+ "results_count": len(processed_results),
189
+ "search_depth": search_depth_param,
190
+ "max_results": search_max_results,
191
+ "response_time": response.get("response_time"),
192
+ },
193
+ }
194
+
195
+ # Cache the result
196
+ if self.enable_caching:
197
+ self.search_cache[cache_key] = result.copy()
198
+
199
+ # Add to search history
200
+ self._add_to_history(query, len(processed_results))
201
+
202
+ self.logger.info(
203
+ f"Live search completed in {result['processing_time']:.2f}s"
204
+ )
205
+ return result
206
+
207
+ except Exception as e:
208
+ self.logger.error(f"Error in live search: {str(e)}")
209
+ return {
210
+ "query": query,
211
+ "results": [],
212
+ "total_results": 0,
213
+ "error": str(e),
214
+ "processing_time": time.time() - start_time,
215
+ "source": "live_search",
216
+ }
217
+
218
+ def search(self, query: str, **kwargs) -> Dict[str, Any]:
219
+ """
220
+ Perform a live web search using Tavily API.
221
+
222
+ Args:
223
+ query: Search query string
224
+ **kwargs: Additional search parameters
225
+
226
+ Returns:
227
+ Dictionary containing search results
228
+ """
229
+ return self.search_web(query, **kwargs)
230
+
231
+ def _process_search_results(
232
+ self, raw_results: List[Dict[str, Any]], query: str
233
+ ) -> List[Dict[str, Any]]:
234
+ """
235
+ Process and format raw search results from Tavily.
236
+
237
+ Args:
238
+ raw_results: Raw results from Tavily API
239
+ query: Original search query
240
+
241
+ Returns:
242
+ Processed and formatted results
243
+ """
244
+ processed_results = []
245
+ query_words = set(query.lower().split())
246
+
247
+ for i, result in enumerate(raw_results):
248
+ try:
249
+ # Extract key information
250
+ title = result.get("title", "")
251
+ url = result.get("url", "")
252
+ content = result.get("content", "")
253
+ raw_content = result.get("raw_content", "")
254
+ score = result.get("score", 0.0)
255
+
256
+ # Calculate relevance score
257
+ relevance_score = self._calculate_relevance_score(
258
+ title, content, query_words, score
259
+ )
260
+
261
+ # Format result
262
+ formatted_result = {
263
+ "title": title,
264
+ "url": url,
265
+ "content": content[:500] + "..." if len(content) > 500 else content,
266
+ "raw_content": raw_content if self.include_raw_content else "",
267
+ "score": score,
268
+ "relevance_score": relevance_score,
269
+ "rank": i + 1,
270
+ "source": "web_search",
271
+ "search_engine": "tavily",
272
+ "published_date": result.get("published_date"),
273
+ "metadata": {
274
+ "title": title,
275
+ "url": url,
276
+ "content_length": len(content),
277
+ "has_raw_content": bool(raw_content),
278
+ "search_rank": i + 1,
279
+ },
280
+ }
281
+
282
+ processed_results.append(formatted_result)
283
+
284
+ except Exception as e:
285
+ self.logger.warning(f"Error processing search result {i}: {str(e)}")
286
+ continue
287
+
288
+ # Sort by relevance score
289
+ processed_results.sort(key=lambda x: x["relevance_score"], reverse=True)
290
+
291
+ return processed_results
292
+
293
+ def _calculate_relevance_score(
294
+ self, title: str, content: str, query_words: set, base_score: float
295
+ ) -> float:
296
+ """
297
+ Calculate relevance score for search results.
298
+
299
+ Args:
300
+ title: Result title
301
+ content: Result content
302
+ query_words: Set of query words
303
+ base_score: Base score from search engine
304
+
305
+ Returns:
306
+ Calculated relevance score
307
+ """
308
+ try:
309
+ # Start with base score
310
+ relevance = base_score
311
+
312
+ # Title relevance (higher weight)
313
+ title_words = set(title.lower().split())
314
+ title_overlap = len(query_words.intersection(title_words))
315
+ title_boost = (title_overlap / max(len(query_words), 1)) * 0.3
316
+
317
+ # Content relevance
318
+ content_words = set(content.lower().split())
319
+ content_overlap = len(query_words.intersection(content_words))
320
+ content_boost = (content_overlap / max(len(query_words), 1)) * 0.2
321
+
322
+ # Exact phrase matching bonus (note: query_words is a set, so word order is not preserved)
323
+ query_phrase = " ".join(query_words).lower()
324
+ if query_phrase in title.lower():
325
+ relevance += 0.2
326
+ elif query_phrase in content.lower():
327
+ relevance += 0.1
328
+
329
+ # Final score calculation
330
+ final_score = min(relevance + title_boost + content_boost, 1.0)
331
+
332
+ return round(final_score, 3)
333
+
334
+ except Exception as e:
335
+ self.logger.warning(f"Error calculating relevance score: {str(e)}")
336
+ return base_score
337
+
338
+ def get_search_context(self, query: str, **kwargs) -> str:
339
+ """
340
+ Get search context suitable for RAG applications.
341
+
342
+ Args:
343
+ query: Search query string
344
+ **kwargs: Additional search parameters
345
+
346
+ Returns:
347
+ Formatted context string
348
+ """
349
+ search_results = self.search(query, **kwargs)
350
+
351
+ if not search_results.get("results"):
352
+ error_msg = search_results.get("error", "Unknown error")
353
+ return f"No live search results found for: {query}. Error: {error_msg}"
354
+
355
+ context_parts = []
356
+
357
+ # Add answer if available
358
+ if search_results.get("answer"):
359
+ context_parts.append(f"Answer: {search_results['answer']}")
360
+ context_parts.append("")
361
+
362
+ # Add search results
363
+ context_parts.append("Search Results:")
364
+ for i, result in enumerate(search_results["results"], 1):
365
+ context_parts.append(f"{i}. {result['title']}")
366
+ context_parts.append(f" URL: {result['url']}")
367
+ context_parts.append(f" Content: {result['content']}")
368
+ if result.get("published_date"):
369
+ context_parts.append(f" Published: {result['published_date']}")
370
+ context_parts.append("")
371
+
372
+ # Add metadata
373
+ metadata = search_results.get("search_metadata", {})
374
+ context_parts.append(
375
+ f"Search performed at: {metadata.get('timestamp', 'Unknown')}"
376
+ )
377
+ context_parts.append(f"Source: {metadata.get('source', 'Unknown')}")
378
+ context_parts.append(f"Results count: {metadata.get('results_count', 0)}")
379
+
380
+ return "\n".join(context_parts)
381
+
382
+ def qna_search(self, query: str, **kwargs) -> str:
383
+ """
384
+ Get a quick answer to a question using Tavily's QnA search.
385
+
386
+ Args:
387
+ query: Question to answer
388
+ **kwargs: Additional search parameters
389
+
390
+ Returns:
391
+ Answer string
392
+ """
393
+ if not self.is_enabled():
394
+ return "Live search is disabled or not properly configured."
395
+
396
+ try:
397
+ # Use Tavily's QnA search method
398
+ answer = self.tavily_client.qna_search(query=query)
399
+ return answer if answer else "No answer found for the given question."
400
+
401
+ except Exception as e:
402
+ self.logger.error(f"Error in QnA search: {str(e)}")
403
+ return f"Error getting answer: {str(e)}"
404
+
405
+ def _generate_cache_key(
406
+ self, query: str, max_results: int, search_depth: str
407
+ ) -> str:
408
+ """Generate cache key for search results."""
409
+ import hashlib
410
+
411
+ cache_string = f"{query.lower().strip()}{max_results}{search_depth}"
412
+ return hashlib.md5(cache_string.encode()).hexdigest()
413
+
414
+ def _is_cache_valid(self, timestamp: datetime) -> bool:
415
+ """Check if cached result is still valid (30 minutes for live search)."""
416
+ return datetime.now() - timestamp < timedelta(minutes=30)
417
+
418
+ def _add_to_history(self, query: str, result_count: int):
419
+ """Add search to history for analytics."""
420
+ self.search_history.append(
421
+ {
422
+ "query": query,
423
+ "timestamp": datetime.now(),
424
+ "result_count": result_count,
425
+ "search_type": "live_web",
426
+ }
427
+ )
428
+
429
+ # Keep only last 50 searches
430
+ if len(self.search_history) > 50:
431
+ self.search_history = self.search_history[-50:]
432
+
433
+ def health_check(self) -> Dict[str, Any]:
434
+ """
435
+ Perform a health check of the live search service.
436
+
437
+ Returns:
438
+ Dictionary containing health status
439
+ """
440
+ try:
441
+ if not self.enabled:
442
+ return {
443
+ "status": "disabled",
444
+ "message": "Live search is disabled in configuration",
445
+ "timestamp": datetime.now().isoformat(),
446
+ }
447
+
448
+ if not self.tavily_client:
449
+ return {
450
+ "status": "error",
451
+ "message": "Tavily client not initialized. Check TAVILY_API_KEY environment variable.",
452
+ "timestamp": datetime.now().isoformat(),
453
+ }
454
+
455
+ # Perform a simple test search
456
+ test_result = self.search("test health check", max_results=1)
457
+
458
+ if test_result.get("error"):
459
+ return {
460
+ "status": "error",
461
+ "message": f"Health check failed: {test_result['error']}",
462
+ "timestamp": datetime.now().isoformat(),
463
+ }
464
+
465
+ return {
466
+ "status": "healthy",
467
+ "message": "Live search service is operational",
468
+ "timestamp": datetime.now().isoformat(),
469
+ "config": {
470
+ "max_results": self.max_results,
471
+ "search_depth": self.search_depth,
472
+ "include_answer": self.include_answer,
473
+ "topic": self.topic,
474
+ },
475
+ }
476
+
477
+ except Exception as e:
478
+ self.logger.error(f"Health check failed: {str(e)}")
479
+ return {
480
+ "status": "error",
481
+ "message": f"Health check failed: {str(e)}",
482
+ "timestamp": datetime.now().isoformat(),
483
+ }
484
+
485
+ def get_search_analytics(self) -> Dict[str, Any]:
486
+ """
487
+ Get analytics about search patterns.
488
+
489
+ Returns:
490
+ Dictionary with search analytics
491
+ """
492
+ if not self.search_history:
493
+ return {"total_searches": 0, "cache_hit_rate": 0.0, "average_results": 0.0}
494
+
495
+ total_searches = len(self.search_history)
496
+ avg_results = (
497
+ sum(s["result_count"] for s in self.search_history) / total_searches
498
+ )
499
+
500
+ # Recent search trends
501
+ recent_searches = [s["query"] for s in self.search_history[-10:]]
502
+
503
+ return {
504
+ "total_searches": total_searches,
505
+ "average_results_per_search": round(avg_results, 2),
506
+ "recent_searches": recent_searches,
507
+ "cache_size": len(self.search_cache),
508
+ "search_type": "live_web",
509
+ }
510
+
511
+ def clear_cache(self):
512
+ """Clear the search cache."""
513
+ self.search_cache.clear()
514
+ self.logger.info("Live search cache cleared")
515
+
516
+ def clear_history(self):
517
+ """Clear the search history."""
518
+ self.search_history.clear()
519
+ self.logger.info("Live search history cleared")
520
+
521
+
522
+ # 🔄 Compatibility alias for existing imports
523
+ LiveSearchManager = LiveSearchProcessor
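A minimal, standalone sketch of the caching scheme used by LiveSearchProcessor above: the cache key is an MD5 hash of the normalized query plus the search parameters, and entries expire after 30 minutes. Standard library only; illustrative, not part of the uploaded file.

import hashlib
from datetime import datetime, timedelta

def make_cache_key(query: str, max_results: int, search_depth: str) -> str:
    # Same scheme as _generate_cache_key: MD5 over the normalized query
    # concatenated with the search parameters.
    cache_string = f"{query.lower().strip()}{max_results}{search_depth}"
    return hashlib.md5(cache_string.encode()).hexdigest()

def is_cache_valid(timestamp: datetime) -> bool:
    # Entries older than 30 minutes are treated as stale.
    return datetime.now() - timestamp < timedelta(minutes=30)

key = make_cache_key("  Latest AI news ", 5, "basic")
print(key)
print(is_cache_valid(datetime.now() - timedelta(minutes=10)))  # True  (fresh)
print(is_cache_valid(datetime.now() - timedelta(minutes=45)))  # False (stale)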
src/rag/optimized_query_processor.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ Optimized Query Processor with Rate Limiting and Better Error Handling
3
+ """
4
+
5
+ import logging
6
+ import time
7
+ from typing import Dict, List, Any, Optional
8
+ from datetime import datetime, timedelta
9
+
10
+
11
+ class OptimizedQueryProcessor:
12
+ """
13
+ Optimized QueryProcessor with rate limiting and better error handling
14
+ """
15
+
16
+ def __init__(
17
+ self, embedding_generator, vector_db, config: Optional[Dict[str, Any]] = None
18
+ ):
19
+ self.embedding_generator = embedding_generator
20
+ self.vector_db = vector_db
21
+ self.config = config or {}
22
+ self.logger = logging.getLogger(__name__)
23
+
24
+ # Optimized configuration settings
25
+ self.top_k = self.config.get("top_k", 10) # Increased from 5
26
+ self.similarity_threshold = self.config.get(
27
+ "similarity_threshold", 0.4
28
+ ) # Lowered from 0.7
29
+ self.max_context_length = self.config.get(
30
+ "max_context_length", 8000
31
+ ) # Increased
32
+ self.enable_caching = self.config.get("enable_caching", True)
33
+ self.cache_ttl = self.config.get("cache_ttl", 7200) # 2 hours
34
+
35
+ # Rate limiting settings
36
+ self.last_api_call = 0
37
+ self.min_api_interval = 1.0 # Minimum 1 second between API calls
38
+ self.max_retries = 3
39
+ self.retry_delay = 2.0
40
+
41
+ # Query cache and history
42
+ self.query_cache = {}
43
+ self.query_history = []
44
+
45
+ self.logger.info("OptimizedQueryProcessor initialized")
46
+
47
+ def process_query(
48
+ self, query: str, filter: Optional[Dict[str, Any]] = None
49
+ ) -> Dict[str, Any]:
50
+ """
51
+ Process query with optimized rate limiting and error handling
52
+ """
53
+ if not query or not query.strip():
54
+ return {
55
+ "query": query,
56
+ "context": [],
57
+ "total_results": 0,
58
+ "error": "Empty query provided",
59
+ }
60
+
61
+ self.logger.info(f"Processing query: {query[:100]}...")
62
+ start_time = time.time()
63
+
64
+ try:
65
+ # Check cache first
66
+ cache_key = self._generate_cache_key(query, filter)
67
+ if self.enable_caching and cache_key in self.query_cache:
68
+ cached_result = self.query_cache[cache_key]
69
+ if self._is_cache_valid(cached_result["timestamp"]):
70
+ self.logger.info("Returning cached result")
71
+ cached_result["from_cache"] = True
72
+ return cached_result
73
+
74
+ # Rate limiting protection
75
+ self._enforce_rate_limit()
76
+
77
+ # Generate query embedding with retry logic
78
+ query_embedding = self._generate_embedding_with_retry(query)
79
+
80
+ if not query_embedding:
81
+ return {
82
+ "query": query,
83
+ "context": [],
84
+ "total_results": 0,
85
+ "error": "Failed to generate query embedding",
86
+ }
87
+
88
+ # Search for similar vectors with increased top_k
89
+ search_results = self.vector_db.search(
90
+ query_embedding=query_embedding,
91
+ top_k=self.top_k * 2, # Get more results for better filtering
92
+ filter=filter,
93
+ include_metadata=True,
94
+ )
95
+
96
+ if not search_results:
97
+ self.logger.warning("No search results returned from vector database")
98
+ return {
99
+ "query": query,
100
+ "context": [],
101
+ "total_results": 0,
102
+ "error": "No similar documents found",
103
+ }
104
+
105
+ # Apply optimized filtering
106
+ filtered_results = self._apply_smart_filtering(search_results, query)
107
+
108
+ # Extract and format context with better error handling
109
+ context = self._extract_context_safely(filtered_results)
110
+
111
+ # Prepare result
112
+ result = {
113
+ "query": query,
114
+ "context": context,
115
+ "total_results": len(filtered_results),
116
+ "processing_time": time.time() - start_time,
117
+ "timestamp": datetime.now(),
118
+ "from_cache": False,
119
+ "similarity_scores": [r.get("score", 0) for r in filtered_results[:5]],
120
+ }
121
+
122
+ # Cache the result
123
+ if self.enable_caching:
124
+ self.query_cache[cache_key] = result.copy()
125
+
126
+ self.logger.info(
127
+ f"Query processed in {result['processing_time']:.2f}s, {len(context)} context items"
128
+ )
129
+ return result
130
+
131
+ except Exception as e:
132
+ self.logger.error(f"Error processing query: {str(e)}")
133
+ return {
134
+ "query": query,
135
+ "context": [],
136
+ "total_results": 0,
137
+ "error": str(e),
138
+ "processing_time": time.time() - start_time,
139
+ }
140
+
141
+ def _enforce_rate_limit(self):
142
+ """Enforce rate limiting between API calls"""
143
+ current_time = time.time()
144
+ time_since_last_call = current_time - self.last_api_call
145
+
146
+ if time_since_last_call < self.min_api_interval:
147
+ sleep_time = self.min_api_interval - time_since_last_call
148
+ self.logger.info(f"Rate limiting: sleeping {sleep_time:.1f}s")
149
+ time.sleep(sleep_time)
150
+
151
+ self.last_api_call = time.time()
152
+
153
+ def _generate_embedding_with_retry(self, query: str) -> List[float]:
154
+ """Generate embedding with retry logic and rate limiting"""
155
+ for attempt in range(self.max_retries):
156
+ try:
157
+ self._enforce_rate_limit()
158
+ embedding = self.embedding_generator.generate_query_embedding(query)
159
+
160
+ if embedding:
161
+ return embedding
162
+ else:
163
+ self.logger.warning(
164
+ f"Attempt {attempt + 1}: Empty embedding returned"
165
+ )
166
+
167
+ except Exception as e:
168
+ self.logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")
169
+
170
+ if "429" in str(e) or "quota" in str(e).lower():
171
+ # Rate limit hit - wait longer
172
+ wait_time = self.retry_delay * (2**attempt)
173
+ self.logger.info(f"Rate limit hit, waiting {wait_time}s...")
174
+ time.sleep(wait_time)
175
+ elif attempt < self.max_retries - 1:
176
+ time.sleep(self.retry_delay)
177
+
178
+ self.logger.error("All embedding generation attempts failed")
179
+ return []
180
+
181
+ def _apply_smart_filtering(
182
+ self, search_results: List[Dict[str, Any]], query: str
183
+ ) -> List[Dict[str, Any]]:
184
+ """Apply smart filtering with adaptive threshold"""
185
+ if not search_results:
186
+ return []
187
+
188
+ # Get score statistics
189
+ scores = [r.get("score", 0) for r in search_results]
190
+ max_score = max(scores)
191
+ avg_score = sum(scores) / len(scores)
192
+
193
+ # Adaptive threshold: use lower threshold if max score is low
194
+ adaptive_threshold = min(self.similarity_threshold, max_score * 0.8)
195
+
196
+ self.logger.info(
197
+ f"Score stats - Max: {max_score:.3f}, Avg: {avg_score:.3f}, Threshold: {adaptive_threshold:.3f}"
198
+ )
199
+
200
+ # Filter results
201
+ filtered = [
202
+ result
203
+ for result in search_results[: self.top_k]
204
+ if result.get("score", 0) >= adaptive_threshold
205
+ ]
206
+
207
+ # If no results pass threshold, return top 3 anyway
208
+ if not filtered and search_results:
209
+ self.logger.warning(
210
+ f"No results above threshold {adaptive_threshold:.3f}, returning top 3"
211
+ )
212
+ filtered = search_results[:3]
213
+
214
+ return filtered
215
+
216
+ def _extract_context_safely(
217
+ self, search_results: List[Dict[str, Any]]
218
+ ) -> List[Dict[str, Any]]:
219
+ """Extract context with better error handling"""
220
+ context = []
221
+ total_length = 0
222
+
223
+ for i, result in enumerate(search_results):
224
+ try:
225
+ # Multiple ways to extract text content
226
+ text = ""
227
+ metadata = result.get("metadata", {})
228
+
229
+ # Try different text fields
230
+ for field in ["text", "content", "content_preview", "description"]:
231
+ if field in metadata and metadata[field]:
232
+ text = str(metadata[field])
233
+ break
234
+
235
+ if not text:
236
+ self.logger.warning(f"No text content found in result {i}")
237
+ continue
238
+
239
+ # Check length limit
240
+ if total_length + len(text) > self.max_context_length and context:
241
+ break
242
+
243
+ # Create context item
244
+ context_item = {
245
+ "text": text,
246
+ "score": result.get("score", 0),
247
+ "source": metadata.get("source", f"Document {i+1}"),
248
+ "chunk_id": result.get("id", ""),
249
+ "metadata": metadata,
250
+ "relevance_rank": len(context) + 1,
251
+ }
252
+
253
+ context.append(context_item)
254
+ total_length += len(text)
255
+
256
+ except Exception as e:
257
+ self.logger.warning(f"Error extracting context from result {i}: {e}")
258
+ continue
259
+
260
+ self.logger.info(
261
+ f"Extracted {len(context)} context items (total length: {total_length})"
262
+ )
263
+ return context
264
+
265
+ def _generate_cache_key(self, query: str, filter: Optional[Dict[str, Any]]) -> str:
266
+ """Generate cache key for query"""
267
+ import hashlib
268
+
269
+ filter_str = str(sorted(filter.items())) if filter else ""
270
+ cache_string = f"{query.lower().strip()}{filter_str}"
271
+ return hashlib.md5(cache_string.encode()).hexdigest()
272
+
273
+ def _is_cache_valid(self, timestamp: datetime) -> bool:
274
+ """Check if cached result is still valid"""
275
+ return datetime.now() - timestamp < timedelta(seconds=self.cache_ttl)
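A standalone sketch of the adaptive-threshold filtering performed by _apply_smart_filtering above, using toy scores (illustrative only): the effective threshold is the lower of the configured value and 80% of the best score, with a top-3 fallback when nothing clears the bar.

def smart_filter(results, similarity_threshold=0.4, top_k=10):
    # Mirrors the logic of OptimizedQueryProcessor._apply_smart_filtering.
    if not results:
        return []
    scores = [r.get("score", 0) for r in results]
    adaptive_threshold = min(similarity_threshold, max(scores) * 0.8)
    filtered = [
        r for r in results[:top_k] if r.get("score", 0) >= adaptive_threshold
    ]
    # Never return an empty context if anything was retrieved at all.
    return filtered or results[:3]

toy = [{"id": i, "score": s} for i, s in enumerate([0.45, 0.38, 0.22, 0.10])]
print(smart_filter(toy))  # threshold becomes 0.36, so the 0.45 and 0.38 hits survive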
src/rag/query_processor.py ADDED
@@ -0,0 +1,427 @@
1
+ """
2
+ Query Processor Module
3
+
4
+ This module is responsible for processing user queries and converting
5
+ them to vector embeddings for retrieval.
6
+
7
+ Technologies: Gemini Embedding v3, LangChain, Pinecone
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ from typing import Dict, List, Any, Optional
13
+ from datetime import datetime, timedelta
14
+
15
+
16
+ class QueryProcessor:
17
+ """
18
+ Processes user queries and converts them to vector embeddings.
19
+
20
+ Features:
21
+ - Query preprocessing and normalization
22
+ - Query embedding generation
23
+ - Context retrieval from vector database
24
+ - Query expansion and caching
25
+ - Metadata filtering and ranking
26
+ """
27
+
28
+ def __init__(
29
+ self, embedding_generator, vector_db, config: Optional[Dict[str, Any]] = None
30
+ ):
31
+ """
32
+ Initialize the QueryProcessor with dependencies.
33
+
34
+ Args:
35
+ embedding_generator: Instance of EmbeddingGenerator
36
+ vector_db: Instance of VectorDB
37
+ config: Configuration dictionary with processing parameters
38
+ """
39
+ self.embedding_generator = embedding_generator
40
+ self.vector_db = vector_db
41
+ self.config = config or {}
42
+ self.logger = logging.getLogger(__name__)
43
+
44
+ # Configuration settings
45
+ self.top_k = self.config.get("top_k", 5)
46
+ self.similarity_threshold = self.config.get("similarity_threshold", 0.7)
47
+ self.max_context_length = self.config.get("max_context_length", 4000)
48
+ self.enable_caching = self.config.get("enable_caching", True)
49
+ self.cache_ttl = self.config.get("cache_ttl", 3600) # 1 hour
50
+
51
+ # Query cache and history
52
+ self.query_cache = {}
53
+ self.query_history = []
54
+
55
+ self.logger.info("QueryProcessor initialized with advanced features")
56
+
57
+ def process_query(
58
+ self, query: str, filter: Optional[Dict[str, Any]] = None
59
+ ) -> Dict[str, Any]:
60
+ """
61
+ Process a user query and retrieve relevant context.
62
+
63
+ Args:
64
+ query: User query string
65
+ filter: Optional metadata filter for search
66
+
67
+ Returns:
68
+ Dictionary containing query, retrieved context, and metadata
69
+ """
70
+ if not query or not query.strip():
71
+ return {
72
+ "query": query,
73
+ "context": [],
74
+ "total_results": 0,
75
+ "error": "Empty query provided",
76
+ }
77
+
78
+ self.logger.info(f"Processing query: {query[:100]}...")
79
+ start_time = time.time()
80
+
81
+ try:
82
+ # Check cache first
83
+ cache_key = self._generate_cache_key(query, filter)
84
+ if self.enable_caching and cache_key in self.query_cache:
85
+ cached_result = self.query_cache[cache_key]
86
+ if self._is_cache_valid(cached_result["timestamp"]):
87
+ self.logger.info(" Returning cached result")
88
+ cached_result["from_cache"] = True
89
+ return cached_result
90
+
91
+ # Preprocess the query
92
+ processed_query = self._preprocess_query(query)
93
+ expanded_queries = self._expand_query(processed_query)
94
+
95
+ # Generate embeddings for all query variations
96
+ all_results = []
97
+ for q in expanded_queries:
98
+ query_embedding = self.embedding_generator.generate_query_embedding(q)
99
+
100
+ if query_embedding:
101
+ # Search for similar vectors
102
+ search_results = self.vector_db.search(
103
+ query_embedding=query_embedding,
104
+ top_k=self.top_k * 2, # Get more results for better filtering
105
+ filter=filter,
106
+ )
107
+ all_results.extend(search_results)
108
+
109
+ # Deduplicate and rank results
110
+ unique_results = self._deduplicate_results(all_results)
111
+ ranked_results = self._rank_results(unique_results, query)
112
+
113
+ # Filter results by similarity threshold
114
+ filtered_results = [
115
+ result
116
+ for result in ranked_results[: self.top_k]
117
+ if result.get("score", 0) >= self.similarity_threshold
118
+ ]
119
+
120
+ # Extract and format context
121
+ context = self._extract_context(filtered_results)
122
+
123
+ # Prepare result
124
+ result = {
125
+ "query": query,
126
+ "processed_query": processed_query,
127
+ "expanded_queries": expanded_queries,
128
+ "context": context,
129
+ "total_results": len(filtered_results),
130
+ "processing_time": time.time() - start_time,
131
+ "timestamp": datetime.now(),
132
+ "from_cache": False,
133
+ }
134
+
135
+ # Cache the result
136
+ if self.enable_caching:
137
+ self.query_cache[cache_key] = result.copy()
138
+
139
+ # Add to query history
140
+ self._add_to_history(query, len(filtered_results))
141
+
142
+ self.logger.info(f"Query processed in {result['processing_time']:.2f}s")
143
+ return result
144
+
145
+ except Exception as e:
146
+ self.logger.error(f"❌ Error processing query: {str(e)}")
147
+ return {
148
+ "query": query,
149
+ "context": [],
150
+ "total_results": 0,
151
+ "error": str(e),
152
+ "processing_time": time.time() - start_time,
153
+ }
154
+
155
+ def _preprocess_query(self, query: str) -> str:
156
+ """
157
+ Preprocess the query for better embedding generation.
158
+
159
+ Args:
160
+ query: Raw query string
161
+
162
+ Returns:
163
+ Preprocessed query string
164
+ """
165
+ # Remove extra whitespace
166
+ query = " ".join(query.split())
167
+
168
+ # Remove special characters that might interfere
169
+ import re
170
+
171
+ query = re.sub(r"[^\w\s\-\?\!]", " ", query)
172
+
173
+ # Normalize question words
174
+ question_words = {
175
+ "whats": "what is",
176
+ "hows": "how is",
177
+ "wheres": "where is",
178
+ "whos": "who is",
179
+ "whens": "when is",
180
+ }
181
+
182
+ for abbrev, full in question_words.items():
183
+ query = query.replace(abbrev, full)
184
+
185
+ return query.strip()
186
+
187
+ def _expand_query(self, query: str) -> List[str]:
188
+ """
189
+ Expand the query with variations for better retrieval.
190
+
191
+ Args:
192
+ query: Preprocessed query
193
+
194
+ Returns:
195
+ List of query variations
196
+ """
197
+ expanded = [query]
198
+
199
+ # Add question variations
200
+ if not any(
201
+ q in query.lower() for q in ["what", "how", "why", "when", "where", "who"]
202
+ ):
203
+ expanded.append(f"what is {query}")
204
+ expanded.append(f"how does {query} work")
205
+
206
+ # Add definition variation
207
+ if "definition" not in query.lower() and "define" not in query.lower():
208
+ expanded.append(f"{query} definition")
209
+
210
+ # Add example variation
211
+ if "example" not in query.lower():
212
+ expanded.append(f"{query} examples")
213
+
214
+ return expanded[:3] # Limit to 3 variations
215
+
216
+ def _deduplicate_results(
217
+ self, results: List[Dict[str, Any]]
218
+ ) -> List[Dict[str, Any]]:
219
+ """
220
+ Remove duplicate results based on content similarity.
221
+
222
+ Args:
223
+ results: List of search results
224
+
225
+ Returns:
226
+ Deduplicated results
227
+ """
228
+ seen_ids = set()
229
+ unique_results = []
230
+
231
+ for result in results:
232
+ result_id = result.get("id")
233
+ if result_id and result_id not in seen_ids:
234
+ seen_ids.add(result_id)
235
+ unique_results.append(result)
236
+
237
+ return unique_results
238
+
239
+ def _rank_results(
240
+ self, results: List[Dict[str, Any]], query: str
241
+ ) -> List[Dict[str, Any]]:
242
+ """
243
+ Rank results based on multiple factors.
244
+
245
+ Args:
246
+ results: List of search results
247
+ query: Original query
248
+
249
+ Returns:
250
+ Ranked results
251
+ """
252
+ query_words = set(query.lower().split())
253
+
254
+ for result in results:
255
+ # Base score from similarity
256
+ base_score = result.get("score", 0.0)
257
+
258
+ # Boost score based on text relevance
259
+ text = result.get("metadata", {}).get("text", "").lower()
260
+ text_words = set(text.split())
261
+ word_overlap = len(query_words.intersection(text_words))
262
+ relevance_boost = word_overlap / max(len(query_words), 1) * 0.1
263
+
264
+ # Boost score based on source type
265
+ source = result.get("metadata", {}).get("source", "")
266
+ source_boost = 0.0
267
+ if source.endswith(".pdf"):
268
+ source_boost = 0.05 # PDFs often contain structured info
269
+ elif "http" in source:
270
+ source_boost = 0.02 # Web content
271
+
272
+ # Calculate final score
273
+ final_score = base_score + relevance_boost + source_boost
274
+ result["final_score"] = min(final_score, 1.0)
275
+
276
+ # Sort by final score
277
+ return sorted(results, key=lambda x: x.get("final_score", 0), reverse=True)
278
+
279
+ def _extract_context(
280
+ self, search_results: List[Dict[str, Any]]
281
+ ) -> List[Dict[str, Any]]:
282
+ """
283
+ Extract and format context from search results.
284
+
285
+ Args:
286
+ search_results: List of search results from vector database
287
+
288
+ Returns:
289
+ List of formatted context items
290
+ """
291
+ context = []
292
+ total_length = 0
293
+
294
+ for result in search_results:
295
+ # Extract text content from metadata
296
+ text = result.get("metadata", {}).get("text", "")
297
+
298
+ # Check if adding this context would exceed the limit
299
+ if total_length + len(text) > self.max_context_length and context:
300
+ break
301
+
302
+ # Format context item with enhanced metadata
303
+ context_item = {
304
+ "text": text,
305
+ "score": result.get("score", 0),
306
+ "final_score": result.get("final_score", result.get("score", 0)),
307
+ "source": result.get("metadata", {}).get("source", "unknown"),
308
+ "chunk_id": result.get("id", ""),
309
+ "metadata": result.get("metadata", {}),
310
+ "relevance_rank": len(context) + 1,
311
+ }
312
+
313
+ context.append(context_item)
314
+ total_length += len(text)
315
+
316
+ self.logger.info(
317
+ f"Extracted {len(context)} context items (total length: {total_length})"
318
+ )
319
+ return context
320
+
321
+ def _generate_cache_key(self, query: str, filter: Optional[Dict[str, Any]]) -> str:
322
+ """Generate a cache key for the query."""
323
+ import hashlib
324
+
325
+ filter_str = str(sorted(filter.items())) if filter else ""
326
+ cache_string = f"{query.lower().strip()}{filter_str}"
327
+ return hashlib.md5(cache_string.encode()).hexdigest()
328
+
329
+ def _is_cache_valid(self, timestamp: datetime) -> bool:
330
+ """Check if cached result is still valid."""
331
+ return datetime.now() - timestamp < timedelta(seconds=self.cache_ttl)
332
+
333
+ def _add_to_history(self, query: str, result_count: int):
334
+ """Add query to history for analytics."""
335
+ self.query_history.append(
336
+ {
337
+ "query": query,
338
+ "timestamp": datetime.now(),
339
+ "result_count": result_count,
340
+ }
341
+ )
342
+
343
+ # Keep only last 100 queries
344
+ if len(self.query_history) > 100:
345
+ self.query_history = self.query_history[-100:]
346
+
347
+ def get_query_suggestions(self, partial_query: str) -> List[str]:
348
+ """
349
+ Generate query suggestions based on partial input and history.
350
+
351
+ Args:
352
+ partial_query: Partial query string
353
+
354
+ Returns:
355
+ List of suggested queries
356
+ """
357
+ suggestions = []
358
+
359
+ # Add suggestions from query history
360
+ for hist_item in reversed(self.query_history[-20:]): # Last 20 queries
361
+ hist_query = hist_item["query"]
362
+ if (
363
+ partial_query.lower() in hist_query.lower()
364
+ and hist_query not in suggestions
365
+ ):
366
+ suggestions.append(hist_query)
367
+
368
+ # Add template-based suggestions
369
+ if len(suggestions) < 3:
370
+ templates = [
371
+ f"What is {partial_query}?",
372
+ f"How does {partial_query} work?",
373
+ f"Examples of {partial_query}",
374
+ f"{partial_query} definition",
375
+ f"{partial_query} best practices",
376
+ ]
377
+
378
+ for template in templates:
379
+ if template not in suggestions:
380
+ suggestions.append(template)
381
+ if len(suggestions) >= 5:
382
+ break
383
+
384
+ return suggestions[:5]
385
+
386
+ def get_query_analytics(self) -> Dict[str, Any]:
387
+ """
388
+ Get analytics about query patterns.
389
+
390
+ Returns:
391
+ Dictionary with query analytics
392
+ """
393
+ if not self.query_history:
394
+ return {"total_queries": 0, "cache_hit_rate": 0.0}
395
+
396
+ total_queries = len(self.query_history)
397
+ recent_queries = [q["query"] for q in self.query_history[-10:]]
398
+
399
+ # Calculate average results per query
400
+ avg_results = sum(q["result_count"] for q in self.query_history) / total_queries
401
+
402
+ # Most common query patterns
403
+ query_words = []
404
+ for q in self.query_history:
405
+ query_words.extend(q["query"].lower().split())
406
+
407
+ from collections import Counter
408
+
409
+ common_words = Counter(query_words).most_common(5)
410
+
411
+ return {
412
+ "total_queries": total_queries,
413
+ "average_results_per_query": round(avg_results, 2),
414
+ "recent_queries": recent_queries,
415
+ "common_query_words": common_words,
416
+ "cache_size": len(self.query_cache),
417
+ }
418
+
419
+ def clear_cache(self):
420
+ """Clear the query cache."""
421
+ self.query_cache.clear()
422
+ self.logger.info("Query cache cleared")
423
+
424
+ def clear_history(self):
425
+ """Clear the query history."""
426
+ self.query_history.clear()
427
+ self.logger.info("Query history cleared")
src/rag/query_router.py ADDED
@@ -0,0 +1,587 @@
1
+ """
2
+ Query Router Module
3
+
4
+ This module intelligently routes queries between local document search
5
+ and live web search based on query analysis and user preferences.
6
+
7
+ Technology: Custom routing logic with RAG + Live Search integration
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ from typing import Dict, List, Any, Optional, Tuple
13
+ from datetime import datetime
14
+ from enum import Enum
15
+
16
+
17
+ class QueryType(Enum):
18
+ """Enumeration of different query types for routing decisions."""
19
+
20
+ FACTUAL = "factual" # 📊 Current facts, news, data
21
+ CONCEPTUAL = "conceptual" # 💡 Definitions, explanations
22
+ PROCEDURAL = "procedural" # 🔧 How-to, instructions
23
+ ANALYTICAL = "analytical" # 📈 Analysis, comparisons
24
+ TEMPORAL = "temporal" # ⏰ Time-sensitive information
25
+ HYBRID = "hybrid" # 🔄 Requires both sources
26
+
27
+
28
+ class QueryRouter:
29
+ """
30
+ Intelligent query router that decides between local docs and live search.
31
+
32
+ Features:
33
+ - Query type classification
34
+ - Intelligent routing decisions
35
+ - Hybrid search coordination
36
+ - Result fusion and ranking
37
+ - Performance optimization
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ local_query_processor,
43
+ live_search_processor,
44
+ config: Optional[Dict[str, Any]] = None,
45
+ ):
46
+ """
47
+ Initialize the QueryRouter.
48
+
49
+ Args:
50
+ local_query_processor: Local document query processor
51
+ live_search_processor: Live web search processor
52
+ config: Configuration dictionary
53
+ """
54
+ self.local_processor = local_query_processor
55
+ self.live_processor = live_search_processor
56
+ self.config = config or {}
57
+ self.logger = logging.getLogger(__name__)
58
+
59
+ # 🎯 Routing configuration
60
+ self.enable_hybrid_search = self.config.get("enable_hybrid_search", True)
61
+ self.local_weight = self.config.get("local_weight", 0.6)
62
+ self.live_weight = self.config.get("live_weight", 0.4)
63
+ self.confidence_threshold = self.config.get("confidence_threshold", 0.5)
64
+ self.max_hybrid_results = self.config.get("max_hybrid_results", 10)
65
+
66
+ # 📊 Analytics and caching
67
+ self.routing_history = []
68
+ self.routing_cache = {}
69
+
70
+ # 🔍 Query classification patterns
71
+ self._init_classification_patterns()
72
+
73
+ self.logger.info("QueryRouter initialized with intelligent routing")
74
+
75
+ def _init_classification_patterns(self):
76
+ """Initialize patterns for query classification."""
77
+ self.temporal_keywords = {
78
+ "current",
79
+ "latest",
80
+ "recent",
81
+ "today",
82
+ "now",
83
+ "2025",
84
+ "breaking",
85
+ "news",
86
+ "update",
87
+ "trending",
88
+ "happening",
89
+ }
90
+
91
+ self.factual_keywords = {
92
+ "what is",
93
+ "who is",
94
+ "when did",
95
+ "where is",
96
+ "statistics",
97
+ "data",
98
+ "facts",
99
+ "numbers",
100
+ "rate",
101
+ "percentage",
102
+ }
103
+
104
+ self.procedural_keywords = {
105
+ "how to",
106
+ "steps",
107
+ "guide",
108
+ "tutorial",
109
+ "instructions",
110
+ "process",
111
+ "method",
112
+ "way to",
113
+ "procedure",
114
+ }
115
+
116
+ self.conceptual_keywords = {
117
+ "explain",
118
+ "definition",
119
+ "meaning",
120
+ "concept",
121
+ "theory",
122
+ "principle",
123
+ "idea",
124
+ "understand",
125
+ "clarify",
126
+ }
127
+
128
+ def route_query(
129
+ self,
130
+ query: str,
131
+ use_live_search: bool = False,
132
+ max_results: int = 5,
133
+ search_options: Optional[Dict[str, Any]] = None,
134
+ search_mode: str = "auto",
135
+ ) -> Dict[str, Any]:
136
+ """
137
+ Route query to appropriate search method(s) with enhanced control.
138
+
139
+ Args:
140
+ query: User query string
141
+ use_live_search: Enable live search (will use hybrid approach)
142
+ max_results: Maximum results to return
143
+ search_options: Additional search options
144
+ search_mode: Search mode - "auto", "local_only", "live_only", "hybrid"
145
+
146
+ Returns:
147
+ Dictionary with routed results and metadata
148
+ """
149
+ if not query or not query.strip():
150
+ return {
151
+ "query": query,
152
+ "results": [],
153
+ "routing_decision": "error",
154
+ "error": "Empty query provided",
155
+ }
156
+
157
+ self.logger.info(f" Routing query: {query[:100]}...")
158
+ start_time = time.time()
159
+
160
+ try:
161
+ # 🎯 Classify query type
162
+ query_type = self._classify_query(query)
163
+
164
+ # 🔄 Make routing decision with enhanced logic
165
+ routing_decision = self._make_enhanced_routing_decision(
166
+ query, query_type, use_live_search, search_mode
167
+ )
168
+
169
+ # 🚀 Execute search based on routing decision
170
+ if routing_decision == "local_only":
171
+ result = self._search_local_only(query, max_results)
172
+ elif routing_decision == "live_only":
173
+ result = self._search_live_only(query, max_results, search_options)
174
+ elif routing_decision == "hybrid":
175
+ result = self._search_hybrid(query, max_results, search_options)
176
+ else:
177
+ result = self._search_fallback(query, max_results)
178
+
179
+ # 📊 Add routing metadata
180
+ result.update(
181
+ {
182
+ "query_type": query_type.value,
183
+ "routing_decision": routing_decision,
184
+ "processing_time": time.time() - start_time,
185
+ "timestamp": datetime.now(),
186
+ }
187
+ )
188
+
189
+ # 📈 Track routing decision
190
+ self._track_routing_decision(query, query_type, routing_decision)
191
+
192
+ self.logger.info(
193
+ f" Query routed via {routing_decision} in {result['processing_time']:.2f}s"
194
+ )
195
+ return result
196
+
197
+ except Exception as e:
198
+ self.logger.error(f" Error in query routing: {str(e)}")
199
+ return {
200
+ "query": query,
201
+ "results": [],
202
+ "routing_decision": "error",
203
+ "error": str(e),
204
+ "processing_time": time.time() - start_time,
205
+ }
206
+
207
+ def _classify_query(self, query: str) -> QueryType:
208
+ """
209
+ Classify query type for routing decisions.
210
+
211
+ Args:
212
+ query: Query string to classify
213
+
214
+ Returns:
215
+ QueryType enum value
216
+ """
217
+ query_lower = query.lower()
218
+
219
+ # 🔍 Check for temporal indicators
220
+ if any(keyword in query_lower for keyword in self.temporal_keywords):
221
+ return QueryType.TEMPORAL
222
+
223
+ # 📊 Check for factual queries
224
+ if any(keyword in query_lower for keyword in self.factual_keywords):
225
+ return QueryType.FACTUAL
226
+
227
+ # 🔧 Check for procedural queries
228
+ if any(keyword in query_lower for keyword in self.procedural_keywords):
229
+ return QueryType.PROCEDURAL
230
+
231
+ # 💡 Check for conceptual queries
232
+ if any(keyword in query_lower for keyword in self.conceptual_keywords):
233
+ return QueryType.CONCEPTUAL
234
+
235
+ # 📈 Default to analytical for complex queries
236
+ if len(query.split()) > 10:
237
+ return QueryType.ANALYTICAL
238
+
239
+ # 🔄 Default to hybrid for uncertain cases
240
+ return QueryType.HYBRID
241
+
242
+ def _make_routing_decision(
243
+ self, query: str, query_type: QueryType, force_live: bool
244
+ ) -> str:
245
+ """
246
+ Make intelligent routing decision based on query analysis.
247
+
248
+ Args:
249
+ query: Query string
250
+ query_type: Classified query type
251
+ force_live: Whether live search is enabled (hybrid search is preferred over live-only)
252
+
253
+ Returns:
254
+ Routing decision string
255
+ """
256
+ # 🔄 Smart hybrid approach when live search is enabled
257
+ if force_live:
258
+ # ✨ Instead of live_only, use hybrid to combine both sources
259
+ if query_type == QueryType.TEMPORAL:
260
+ return "hybrid" # ⏰ Time-sensitive + stored context
261
+ else:
262
+ return "hybrid" # 🎯 Always combine live + stored data
263
+
264
+ # 🎯 Route based on query type (when live search is disabled)
265
+ if query_type == QueryType.TEMPORAL:
266
+ return "local_only" # ⏰ Only stored data when live disabled
267
+
268
+ elif query_type == QueryType.FACTUAL:
269
+ return "local_only" # 📊 Facts from stored documents
270
+
271
+ elif query_type == QueryType.PROCEDURAL:
272
+ return "local_only" # 🔧 Procedures likely in documents
273
+
274
+ elif query_type == QueryType.CONCEPTUAL:
275
+ return "local_only" # 💡 Concepts likely in documents
276
+
277
+ elif query_type == QueryType.ANALYTICAL:
278
+ return "local_only" # 📈 Analysis from stored data
279
+
280
+ else: # QueryType.HYBRID
281
+ return "local_only" # 🔄 Default to local when live disabled
282
+
283
+ def _make_enhanced_routing_decision(
284
+ self, query: str, query_type: QueryType, use_live_search: bool, search_mode: str
285
+ ) -> str:
286
+ """
287
+ Enhanced routing decision with explicit search mode control.
288
+
289
+ Args:
290
+ query: Query string
291
+ query_type: Classified query type
292
+ use_live_search: Whether live search is enabled
293
+ search_mode: Explicit search mode preference
294
+
295
+ Returns:
296
+ Routing decision string
297
+ """
298
+ # 🎯 Explicit mode override - the user's choice takes priority
299
+ if search_mode == "local_only":
300
+ return "local_only"
301
+ elif search_mode == "live_only":
302
+ return "live_only" if self.live_processor.is_enabled() else "local_only"
303
+ elif search_mode == "hybrid":
304
+ return "hybrid" if self.live_processor.is_enabled() else "local_only"
305
+
306
+ # 🧠 Auto mode - intelligent decision making
307
+ elif search_mode == "auto":
308
+ return self._make_routing_decision(query, query_type, use_live_search)
309
+
310
+ # 🔄 Fallback to original logic
311
+ else:
312
+ return self._make_routing_decision(query, query_type, use_live_search)
313
+
314
+ def _search_local_only(self, query: str, max_results: int) -> Dict[str, Any]:
315
+ """Search only local documents."""
316
+ self.logger.info(" Searching local documents only")
317
+
318
+ try:
319
+ local_result = self.local_processor.process_query(query)
320
+
321
+ # 🔄 Format results consistently
322
+ formatted_results = []
323
+ for item in local_result.get("context", [])[:max_results]:
324
+ formatted_results.append(
325
+ {
326
+ "title": f"Document: {item.get('source', 'Unknown')}",
327
+ "content": item.get("text", ""),
328
+ "score": item.get("score", 0.0),
329
+ "source": item.get("source", "local_document"),
330
+ "type": "local_document",
331
+ "metadata": item.get("metadata", {}),
332
+ }
333
+ )
334
+
335
+ return {
336
+ "query": query,
337
+ "results": formatted_results,
338
+ "total_results": len(formatted_results),
339
+ "sources": ["local_documents"],
340
+ "local_results": local_result.get("total_results", 0),
341
+ }
342
+
343
+ except Exception as e:
344
+ self.logger.error(f" Local search error: {str(e)}")
345
+ return {
346
+ "query": query,
347
+ "results": [],
348
+ "total_results": 0,
349
+ "error": f"Local search failed: {str(e)}",
350
+ }
351
+
352
+ def _search_live_only(
353
+ self, query: str, max_results: int, search_options: Optional[Dict[str, Any]]
354
+ ) -> Dict[str, Any]:
355
+ """Search only live web sources."""
356
+ self.logger.info(" Searching live web sources only")
357
+
358
+ try:
359
+ # 🎯 Extract search options
360
+ options = search_options or {}
361
+ search_depth = options.get("search_depth", "basic")
362
+ time_range = options.get("time_range", "month")
363
+
364
+ live_result = self.live_processor.search_web(
365
+ query,
366
+ max_results=max_results,
367
+ search_depth=search_depth,
368
+ time_range=time_range,
369
+ )
370
+
371
+ return {
372
+ "query": query,
373
+ "results": live_result.get("results", []),
374
+ "total_results": live_result.get("total_results", 0),
375
+ "sources": ["live_web"],
376
+ "live_results": live_result.get("total_results", 0),
377
+ "search_params": live_result.get("search_params", {}),
378
+ }
379
+
380
+ except Exception as e:
381
+ self.logger.error(f" Live search error: {str(e)}")
382
+ return {
383
+ "query": query,
384
+ "results": [],
385
+ "total_results": 0,
386
+ "error": f"Live search failed: {str(e)}",
387
+ }
388
+
389
+ def _search_hybrid(
390
+ self, query: str, max_results: int, search_options: Optional[Dict[str, Any]]
391
+ ) -> Dict[str, Any]:
392
+ """Perform hybrid search combining local and live sources."""
393
+ self.logger.info(" Performing hybrid search")
394
+
395
+ try:
396
+ # 📊 Calculate result distribution
397
+ local_count = int(max_results * self.local_weight)
398
+ live_count = max_results - local_count
399
+
400
+ # 🚀 Run both searches (sequentially for now; could be parallelized later)
401
+ local_result = self.local_processor.process_query(query)
402
+
403
+ options = search_options or {}
404
+ live_result = self.live_processor.search_web(
405
+ query,
406
+ max_results=live_count,
407
+ search_depth=options.get("search_depth", "basic"),
408
+ time_range=options.get("time_range", "month"),
409
+ )
410
+
411
+ # 🔄 Combine and rank results
412
+ combined_results = self._fuse_results(
413
+ local_result, live_result, local_count, live_count
414
+ )
415
+
416
+ return {
417
+ "query": query,
418
+ "results": combined_results[:max_results],
419
+ "total_results": len(combined_results),
420
+ "sources": ["local_documents", "live_web"],
421
+ "local_results": local_result.get("total_results", 0),
422
+ "live_results": live_result.get("total_results", 0),
423
+ "fusion_method": "weighted_ranking",
424
+ }
425
+
426
+ except Exception as e:
427
+ self.logger.error(f" Hybrid search error: {str(e)}")
428
+ return self._search_fallback(query, max_results)
429
+
430
+ def _fuse_results(
431
+ self,
432
+ local_result: Dict[str, Any],
433
+ live_result: Dict[str, Any],
434
+ local_count: int,
435
+ live_count: int,
436
+ ) -> List[Dict[str, Any]]:
437
+ """
438
+ Fuse results from local and live searches.
439
+
440
+ Args:
441
+ local_result: Results from local search
442
+ live_result: Results from live search
443
+ local_count: Number of local results to include
444
+ live_count: Number of live results to include
445
+
446
+ Returns:
447
+ Fused and ranked results
448
+ """
449
+ fused_results = []
450
+
451
+ # 📚 Process local results
452
+ for item in local_result.get("context", [])[:local_count]:
453
+ fused_results.append(
454
+ {
455
+ "title": f"Document: {item.get('source', 'Unknown')}",
456
+ "content": item.get("text", ""),
457
+ "score": item.get("score", 0.0) * self.local_weight,
458
+ "source": item.get("source", "local_document"),
459
+ "type": "local_document",
460
+ "metadata": item.get("metadata", {}),
461
+ "fusion_score": item.get("score", 0.0) * self.local_weight,
462
+ }
463
+ )
464
+
465
+ # 🌐 Process live results
466
+ for item in live_result.get("results", [])[:live_count]:
467
+ fused_results.append(
468
+ {
469
+ "title": item.get("title", "Web Result"),
470
+ "content": item.get("content", ""),
471
+ "score": item.get("relevance_score", 0.0) * self.live_weight,
472
+ "source": item.get("url", "web_search"),
473
+ "type": "web_result",
474
+ "metadata": item.get("metadata", {}),
475
+ "fusion_score": item.get("relevance_score", 0.0) * self.live_weight,
476
+ }
477
+ )
478
+
479
+ # 🔄 Sort by fusion score
480
+ fused_results.sort(key=lambda x: x.get("fusion_score", 0), reverse=True)
481
+
482
+ return fused_results
483
+
484
+ def _search_fallback(self, query: str, max_results: int) -> Dict[str, Any]:
485
+ """Fallback search method when other methods fail."""
486
+ self.logger.warning(" Using fallback search method")
487
+
488
+ try:
489
+ # 📚 Try local search first
490
+ local_result = self.local_processor.process_query(query)
491
+
492
+ if local_result.get("context"):
493
+ return self._search_local_only(query, max_results)
494
+ else:
495
+ return {
496
+ "query": query,
497
+ "results": [],
498
+ "total_results": 0,
499
+ "sources": [],
500
+ "error": "No results found in fallback search",
501
+ }
502
+
503
+ except Exception as e:
504
+ self.logger.error(f" Fallback search failed: {str(e)}")
505
+ return {
506
+ "query": query,
507
+ "results": [],
508
+ "total_results": 0,
509
+ "error": f"All search methods failed: {str(e)}",
510
+ }
511
+
512
+ def _track_routing_decision(
513
+ self, query: str, query_type: QueryType, routing_decision: str
514
+ ):
515
+ """Track routing decisions for analytics."""
516
+ self.routing_history.append(
517
+ {
518
+ "query": query[:100], # Truncate for privacy
519
+ "query_type": query_type.value,
520
+ "routing_decision": routing_decision,
521
+ "timestamp": datetime.now(),
522
+ }
523
+ )
524
+
525
+ # 📊 Keep only last 100 routing decisions
526
+ if len(self.routing_history) > 100:
527
+ self.routing_history = self.routing_history[-100:]
528
+
529
+ def get_routing_analytics(self) -> Dict[str, Any]:
530
+ """
531
+ Get analytics about routing patterns.
532
+
533
+ Returns:
534
+ Dictionary with routing analytics
535
+ """
536
+ if not self.routing_history:
537
+ return {
538
+ "total_queries": 0,
539
+ "routing_distribution": {},
540
+ "query_type_distribution": {},
541
+ }
542
+
543
+ total_queries = len(self.routing_history)
544
+
545
+ # 📊 Calculate routing distribution
546
+ routing_counts = {}
547
+ query_type_counts = {}
548
+
549
+ for entry in self.routing_history:
550
+ routing = entry["routing_decision"]
551
+ query_type = entry["query_type"]
552
+
553
+ routing_counts[routing] = routing_counts.get(routing, 0) + 1
554
+ query_type_counts[query_type] = query_type_counts.get(query_type, 0) + 1
555
+
556
+ # 📈 Convert to percentages
557
+ routing_distribution = {
558
+ k: round((v / total_queries) * 100, 1) for k, v in routing_counts.items()
559
+ }
560
+
561
+ query_type_distribution = {
562
+ k: round((v / total_queries) * 100, 1) for k, v in query_type_counts.items()
563
+ }
564
+
565
+ return {
566
+ "total_queries": total_queries,
567
+ "routing_distribution": routing_distribution,
568
+ "query_type_distribution": query_type_distribution,
569
+ "recent_decisions": [
570
+ {
571
+ "query": entry["query"][:50] + "...",
572
+ "type": entry["query_type"],
573
+ "routing": entry["routing_decision"],
574
+ }
575
+ for entry in self.routing_history[-5:]
576
+ ],
577
+ }
578
+
579
+ def clear_cache(self):
580
+ """Clear routing cache."""
581
+ self.routing_cache.clear()
582
+ self.logger.info(" Routing cache cleared")
583
+
584
+ def clear_history(self):
585
+ """Clear routing history."""
586
+ self.routing_history.clear()
587
+ self.logger.info(" Routing history cleared")
src/rag/response_generator.py ADDED
@@ -0,0 +1,591 @@
1
+ """
2
+ Response Generator Module
3
+
4
+ This module is responsible for generating coherent responses based on
5
+ retrieved knowledge using LangChain RAG.
6
+
7
+ Technology: LangChain RAG (Retrieval Augmented Generation)
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ import os
13
+ from typing import Dict, List, Any, Optional
14
+ from datetime import datetime
15
+
16
+
17
+ class ResponseGenerator:
18
+ """
19
+ Generates coherent responses based on retrieved knowledge.
20
+
21
+ Features:
22
+ - Context-aware response generation
23
+ - Source attribution and confidence scoring
24
+ - Multiple LLM provider support (Gemini, OpenAI)
25
+ - Response quality assessment
26
+ - Template-based fallback generation
27
+ """
28
+
29
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
30
+ """
31
+ Initialize the ResponseGenerator with configuration.
32
+
33
+ Args:
34
+ config: Configuration dictionary with generation parameters
35
+ """
36
+ self.config = config or {}
37
+ self.logger = logging.getLogger(__name__)
38
+
39
+ # Configuration settings
40
+ self.model = self.config.get("model", "gpt-3.5-turbo")
41
+ self.max_tokens = self.config.get("max_tokens", 500)
42
+ self.temperature = self.config.get("temperature", 0.7)
43
+ self.include_sources = self.config.get("include_sources", True)
44
+
45
+ # Initialize LLM providers
46
+ self.llm = None
47
+ self.gemini_client = None
48
+ self.openai_client = None
49
+
50
+ self._initialize_llm_providers()
51
+
52
+ # Response templates with markdown formatting
53
+ self.response_templates = {
54
+ "no_context": "## ℹ️ No Information Available\n\nI don't have enough information to answer your question. Please try:\n\n- **Uploading relevant documents** using the Upload tab\n- **Adding URLs** using the Add URLs tab\n- **Enabling live search** for real-time web results",
55
+ "error": "## ⚠️ Error Occurred\n\nI encountered an error while generating the response. Please try again.\n\nIf the problem persists, check your API keys in the Settings tab.",
56
+ "insufficient_confidence": "## 🤔 Limited Confidence\n\nBased on the available information, I found some relevant content, but I'm **not confident enough** to provide a definitive answer.\n\n**Suggestions:**\n- Try rephrasing your question\n- Add more specific documents\n- Enable live search for additional context",
57
+ }
58
+
59
+ self.logger.info("ResponseGenerator initialized with advanced features")
60
+
61
+ def _initialize_llm_providers(self):
62
+ """Initialize available LLM providers with optimization."""
63
+ try:
64
+ # Try to initialize Gemini
65
+ gemini_api_key = os.getenv("GEMINI_API_KEY")
66
+ if gemini_api_key:
67
+ try:
68
+ import google.generativeai as genai
69
+
70
+ # Check if settings manager has already initialized Gemini client
71
+ # This is an optimization to avoid recreating the client
72
+ from utils.settings_manager import SettingsManager
73
+
74
+ if (
75
+ hasattr(SettingsManager, "_gemini_client_cache")
76
+ and SettingsManager._gemini_client_cache is not None
77
+ and SettingsManager._gemini_client_key == gemini_api_key
78
+ ):
79
+
80
+ self.logger.info(
81
+ "Reusing existing Gemini client from settings manager"
82
+ )
83
+ genai_client = SettingsManager._gemini_client_cache
84
+ else:
85
+ # Configure new client
86
+ genai.configure(api_key=gemini_api_key)
87
+ genai_client = genai
88
+
89
+ # Create model instance
90
+ self.gemini_client = genai_client.GenerativeModel(
91
+ "gemini-2.5-flash-preview-05-20"
92
+ )
93
+ self.logger.info("Gemini client initialized")
94
+ except ImportError:
95
+ self.logger.warning("Gemini SDK not available")
96
+ except Exception as e:
97
+ self.logger.warning(f"Failed to initialize Gemini: {e}")
98
+
99
+ # Try to initialize OpenAI
100
+ openai_api_key = os.getenv("OPENAI_API_KEY")
101
+ if openai_api_key:
102
+ try:
103
+ import openai
104
+
105
+ self.openai_client = openai.OpenAI(api_key=openai_api_key)
106
+ self.logger.info("OpenAI client initialized")
107
+ except ImportError:
108
+ self.logger.warning("OpenAI SDK not available")
109
+ except Exception as e:
110
+ self.logger.warning(f"Failed to initialize OpenAI: {e}")
111
+
112
+ # Try to initialize LangChain
113
+ try:
114
+ if self.gemini_client:
115
+ from langchain_google_genai import ChatGoogleGenerativeAI
116
+
117
+ self.llm = ChatGoogleGenerativeAI(
118
+ model="gemini-2.5-flash-preview-05-20",
119
+ temperature=self.temperature,
120
+ google_api_key=gemini_api_key,
121
+ )
122
+ elif self.openai_client:
123
+ from langchain_openai import ChatOpenAI
124
+
125
+ self.llm = ChatOpenAI(
126
+ model=self.model,
127
+ temperature=self.temperature,
128
+ max_tokens=self.max_tokens,
129
+ openai_api_key=openai_api_key,
130
+ )
131
+
132
+ if self.llm:
133
+ self.logger.info("LangChain LLM initialized")
134
+
135
+ except ImportError:
136
+ self.logger.warning("LangChain not available")
137
+ except Exception as e:
138
+ self.logger.warning(f"Failed to initialize LangChain: {e}")
139
+
140
+ except Exception as e:
141
+ self.logger.error(f"❌ Error initializing LLM providers: {e}")
142
+
143
+ def generate_response(
144
+ self, query: str, context: List[Dict[str, Any]]
145
+ ) -> Dict[str, Any]:
146
+ """
147
+ Generate a response based on the query and retrieved context.
148
+
149
+ Args:
150
+ query: Original user query
151
+ context: List of retrieved context items with text and metadata
152
+
153
+ Returns:
154
+ Dictionary containing the generated response and metadata
155
+ """
156
+ if not query:
157
+ return {
158
+ "response": "I need a question to answer.",
159
+ "sources": [],
160
+ "confidence": 0.0,
161
+ "error": "No query provided",
162
+ }
163
+
164
+ if not context:
165
+ return {
166
+ "response": self.response_templates["no_context"],
167
+ "sources": [],
168
+ "confidence": 0.0,
169
+ "error": "No context available",
170
+ }
171
+
172
+ self.logger.info(f"Generating response for query: {query[:100]}...")
173
+ start_time = time.time()
174
+
175
+ try:
176
+ # Prepare context for generation
177
+ formatted_context = self._format_context(context)
178
+
179
+ # Calculate initial confidence based on context quality
180
+ base_confidence = self._calculate_confidence(context)
181
+
182
+ # Generate response using available LLM
183
+ response_result = self._generate_with_llm(query, formatted_context)
184
+
185
+ if not response_result["success"]:
186
+ # Fallback to template-based generation
187
+ response_result = self._fallback_generation(query, formatted_context)
188
+
189
+ # Extract sources from context
190
+ sources = self._extract_sources(context) if self.include_sources else []
191
+
192
+ # Assess response quality
193
+ quality_score = self._assess_response_quality(
194
+ response_result["response"], query, context
195
+ )
196
+
197
+ # Calculate final confidence
198
+ final_confidence = min(base_confidence * quality_score, 1.0)
199
+
200
+ # Check if confidence is too low
201
+ if final_confidence < 0.3:
202
+ response_text = self.response_templates["insufficient_confidence"]
203
+ final_confidence = 0.2
204
+ else:
205
+ response_text = response_result["response"]
206
+
207
+ result = {
208
+ "response": response_text,
209
+ "sources": sources,
210
+ "confidence": final_confidence,
211
+ "context_items": len(context),
212
+ "generation_time": time.time() - start_time,
213
+ "model_used": response_result.get("model", "fallback"),
214
+ "quality_score": quality_score,
215
+ }
216
+
217
+ self.logger.info(f"Response generated in {result['generation_time']:.2f}s")
218
+ return result
219
+
220
+ except Exception as e:
221
+ self.logger.error(f"❌ Error generating response: {str(e)}")
222
+ return {
223
+ "response": self.response_templates["error"],
224
+ "sources": [],
225
+ "confidence": 0.0,
226
+ "error": str(e),
227
+ "generation_time": time.time() - start_time,
228
+ }
229
+
230
+ def _generate_with_llm(self, query: str, context: str) -> Dict[str, Any]:
231
+ """
232
+ Generate response using available LLM providers.
233
+
234
+ Args:
235
+ query: User query
236
+ context: Formatted context string
237
+
238
+ Returns:
239
+ Dictionary with generation result
240
+ """
241
+ # Create RAG prompt
242
+ prompt = self._create_rag_prompt(query, context)
243
+
244
+ # Try LangChain first
245
+ if self.llm:
246
+ try:
247
+ from langchain.schema import HumanMessage
248
+
249
+ messages = [HumanMessage(content=prompt)]
250
+ response = self.llm.invoke(messages)
251
+ return {
252
+ "success": True,
253
+ "response": response.content,
254
+ "model": "langchain",
255
+ }
256
+ except Exception as e:
257
+ self.logger.warning(f"LangChain generation failed: {e}")
258
+
259
+ # Try Gemini directly
260
+ if self.gemini_client:
261
+ try:
262
+ response = self.gemini_client.generate_content(prompt)
263
+ return {
264
+ "success": True,
265
+ "response": response.text,
266
+ "model": "gemini-2.5-flash-preview-05-20",
267
+ }
268
+ except Exception as e:
269
+ self.logger.warning(f"Gemini generation failed: {e}")
270
+
271
+ # Try OpenAI directly
272
+ if self.openai_client:
273
+ try:
274
+ response = self.openai_client.chat.completions.create(
275
+ model=self.model,
276
+ messages=[{"role": "user", "content": prompt}],
277
+ max_tokens=self.max_tokens,
278
+ temperature=self.temperature,
279
+ )
280
+ return {
281
+ "success": True,
282
+ "response": response.choices[0].message.content,
283
+ "model": self.model,
284
+ }
285
+ except Exception as e:
286
+ self.logger.warning(f"OpenAI generation failed: {e}")
287
+
288
+ return {"success": False, "response": "", "model": "none"}
289
+
290
+ def _create_rag_prompt(self, query: str, context: str) -> str:
291
+ """
292
+ Create an enhanced prompt template for RAG generation with markdown formatting.
293
+
294
+ Args:
295
+ query: User query
296
+ context: Formatted context
297
+
298
+ Returns:
299
+ Formatted prompt string
300
+ """
301
+ prompt = f"""You are an AI assistant that answers questions based on provided context. Follow these guidelines:
302
+
303
+ 1. Answer the question using ONLY the information provided in the context
304
+ 2. If the context doesn't contain enough information, clearly state this
305
+ 3. Cite specific sources when making claims
306
+ 4. Be concise but comprehensive
307
+ 5. If multiple sources provide different information, acknowledge this
308
+ 6. Use a professional and helpful tone
309
+ 7. **Format your response in clean, readable Markdown**
310
+
311
+ Context Information:
312
+ {context}
313
+
314
+ Question: {query}
315
+
316
+ Instructions:
317
+ - Provide a clear, well-structured answer using **Markdown formatting**
318
+ - Use headers (##, ###) to organize sections
319
+ - Use **bold** for important points
320
+ - Use bullet points (-) or numbered lists (1.) for clarity
321
+ - Use `code blocks` for technical terms or specific data
322
+ - Include relevant details from the context
323
+ - If uncertain, express the level of confidence
324
+ - Do not make up information not present in the context
325
+
326
+ Format your response in Markdown with proper structure and formatting.
327
+
328
+ Answer:"""
329
+
330
+ return prompt
331
+
332
+ def _fallback_generation(self, query: str, context: str) -> Dict[str, Any]:
333
+ """
334
+ Fallback response generation when LLM is not available.
335
+
336
+ Args:
337
+ query: User query
338
+ context: Formatted context
339
+
340
+ Returns:
341
+ Dictionary with generation result
342
+ """
343
+ self.logger.info("Using fallback generation")
344
+
345
+ # Extract key information from context
346
+ context_lines = context.split("\n")
347
+ relevant_lines = [
348
+ line.strip()
349
+ for line in context_lines
350
+ if line.strip() and not line.startswith("[Source:")
351
+ ]
352
+
353
+ if not relevant_lines:
354
+ return {
355
+ "success": True,
356
+ "response": self.response_templates["no_context"],
357
+ "model": "fallback",
358
+ }
359
+
360
+ # Create a structured markdown response
361
+ response_parts = [
362
+ f"## Answer to: {query}",
363
+ "",
364
+ "Based on the available information:",
365
+ "",
366
+ ]
367
+
368
+ # Add key information as markdown list
369
+ for i, line in enumerate(relevant_lines[:3]): # Limit to 3 most relevant
370
+ if len(line) > 50: # Only include substantial content
371
+ response_parts.append(f"- {line}")
372
+
373
+ response_parts.extend(
374
+ [
375
+ "",
376
+ "---",
377
+ "",
378
+ "**Note:** This response is generated using available context. For more detailed analysis, please ensure proper language model integration.",
379
+ ]
380
+ )
381
+
382
+ response = "\n".join(response_parts)
383
+
384
+ return {
385
+ "success": True,
386
+ "response": response,
387
+ "model": "fallback",
388
+ }
389
+
390
+ def _format_context(self, context: List[Dict[str, Any]]) -> str:
391
+ """
392
+ Format the retrieved context for use in response generation.
393
+
394
+ Args:
395
+ context: List of context items
396
+
397
+ Returns:
398
+ Formatted context string
399
+ """
400
+ formatted_parts = []
401
+
402
+ for i, item in enumerate(context):
403
+ text = item.get("text", "")
404
+ source = item.get("source", f"Source {i+1}")
405
+ score = item.get("score", 0.0)
406
+
407
+ # Format each context item with metadata
408
+ formatted_part = f"""[Source {i+1}: {source} (Relevance: {score:.2f})]
409
+ {text}
410
+ ---"""
411
+ formatted_parts.append(formatted_part)
412
+
413
+ return "\n\n".join(formatted_parts)
414
+
415
+ def _extract_sources(self, context: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
416
+ """
417
+ Extract source information from context items.
418
+
419
+ Args:
420
+ context: List of context items
421
+
422
+ Returns:
423
+ List of source dictionaries
424
+ """
425
+ sources = []
426
+ seen_sources = set()
427
+
428
+ for item in context:
429
+ source = item.get("source", "Unknown")
430
+ score = item.get("score", 0.0)
431
+ final_score = item.get("final_score", score)
432
+
433
+ if source not in seen_sources:
434
+ source_info = {
435
+ "source": source,
436
+ "relevance_score": round(score, 3),
437
+ "final_score": round(final_score, 3),
438
+ "metadata": item.get("metadata", {}),
439
+ }
440
+
441
+ # Add source type
442
+ if source.endswith(".pdf"):
443
+ source_info["type"] = "PDF Document"
444
+ elif source.startswith("http"):
445
+ source_info["type"] = "Web Page"
446
+ elif source.endswith((".docx", ".doc")):
447
+ source_info["type"] = "Word Document"
448
+ else:
449
+ source_info["type"] = "Document"
450
+
451
+ sources.append(source_info)
452
+ seen_sources.add(source)
453
+
454
+ # Sort by relevance score
455
+ sources.sort(key=lambda x: x["final_score"], reverse=True)
456
+ return sources
457
+
458
+ def _calculate_confidence(self, context: List[Dict[str, Any]]) -> float:
459
+ """
460
+ Calculate confidence score based on context quality.
461
+
462
+ Args:
463
+ context: List of context items
464
+
465
+ Returns:
466
+ Confidence score between 0.0 and 1.0
467
+ """
468
+ if not context:
469
+ return 0.0
470
+
471
+ # Calculate average similarity score
472
+ scores = [item.get("final_score", item.get("score", 0.0)) for item in context]
473
+ avg_score = sum(scores) / len(scores)
474
+
475
+ # Factor in the number of context items
476
+ context_factor = min(len(context) / 3.0, 1.0) # Normalize to max of 3 items
477
+
478
+ # Factor in score distribution (prefer consistent scores)
479
+ if len(scores) > 1:
480
+ score_variance = sum((s - avg_score) ** 2 for s in scores) / len(scores)
481
+ consistency_factor = max(0.5, 1.0 - score_variance)
482
+ else:
483
+ consistency_factor = 1.0
484
+
485
+ # Combine factors
486
+ confidence = (
487
+ (avg_score * 0.6) + (context_factor * 0.2) + (consistency_factor * 0.2)
488
+ )
489
+
490
+ return min(confidence, 1.0)
491
+
492
+ def _assess_response_quality(
493
+ self, response: str, query: str, context: List[Dict[str, Any]]
494
+ ) -> float:
495
+ """
496
+ Assess the quality of the generated response.
497
+
498
+ Args:
499
+ response: Generated response
500
+ query: Original query
501
+ context: Context used for generation
502
+
503
+ Returns:
504
+ Quality score between 0.0 and 1.0
505
+ """
506
+ if not response or len(response.strip()) < 10:
507
+ return 0.1
508
+
509
+ quality_score = 0.5 # Base score
510
+
511
+ # Check response length (not too short, not too long)
512
+ response_length = len(response)
513
+ if 50 <= response_length <= 1000:
514
+ quality_score += 0.2
515
+ elif response_length > 1000:
516
+ quality_score += 0.1
517
+
518
+ # Check if response addresses the query
519
+ query_words = set(query.lower().split())
520
+ response_words = set(response.lower().split())
521
+ word_overlap = len(query_words.intersection(response_words))
522
+ if word_overlap > 0:
523
+ quality_score += min(word_overlap / len(query_words), 0.2)
524
+
525
+ # Check if response uses context information
526
+ context_texts = [item.get("text", "") for item in context]
527
+ context_words = set()
528
+ for text in context_texts:
529
+ context_words.update(text.lower().split())
530
+
531
+ context_usage = len(response_words.intersection(context_words))
532
+ if context_usage > 5: # Uses substantial context
533
+ quality_score += 0.1
534
+
535
+ return min(quality_score, 1.0)
536
+
537
+ def get_supported_models(self) -> List[str]:
538
+ """
539
+ Get list of supported models.
540
+
541
+ Returns:
542
+ List of available model names
543
+ """
544
+ models = ["fallback"]
545
+
546
+ if self.gemini_client:
547
+ models.extend(["gemini-2.5-flash-preview-05-20", "gemini-1.5-pro"])
548
+
549
+ if self.openai_client:
550
+ models.extend(["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"])
551
+
552
+ return models
553
+
554
+ def update_model(self, model_name: str) -> bool:
555
+ """
556
+ Update the model used for generation.
557
+
558
+ Args:
559
+ model_name: Name of the model to use
560
+
561
+ Returns:
562
+ True if model was updated successfully
563
+ """
564
+ try:
565
+ if model_name in self.get_supported_models():
566
+ self.model = model_name
567
+ self.logger.info(f"Model updated to: {model_name}")
568
+ return True
569
+ else:
570
+ self.logger.warning(f"Model {model_name} not supported")
571
+ return False
572
+ except Exception as e:
573
+ self.logger.error(f"❌ Error updating model: {e}")
574
+ return False
575
+
576
+ def get_generation_stats(self) -> Dict[str, Any]:
577
+ """
578
+ Get statistics about response generation.
579
+
580
+ Returns:
581
+ Dictionary with generation statistics
582
+ """
583
+ return {
584
+ "supported_models": self.get_supported_models(),
585
+ "current_model": self.model,
586
+ "gemini_available": self.gemini_client is not None,
587
+ "openai_available": self.openai_client is not None,
588
+ "langchain_available": self.llm is not None,
589
+ "max_tokens": self.max_tokens,
590
+ "temperature": self.temperature,
591
+ }
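A minimal usage sketch of the ResponseGenerator above, assuming it can be imported from the package (the `generation.response_generator` path is an assumption, not confirmed by this diff) and that GEMINI_API_KEY or OPENAI_API_KEY is set in the environment; the context item shape mirrors what _format_context and _extract_sources read:

    # Hedged sketch: the import path below is assumed, not confirmed by this diff.
    from generation.response_generator import ResponseGenerator

    generator = ResponseGenerator(config={"max_tokens": 400, "temperature": 0.3})

    # Each context item needs "text", "source", and a similarity "score".
    context = [
        {"text": "Pinecone is a managed vector database.", "source": "notes.pdf", "score": 0.91},
        {"text": "Embeddings are stored with metadata for filtering.", "source": "https://example.com/docs", "score": 0.84},
    ]

    result = generator.generate_response("What does the storage layer use?", context)
    print(result["response"])
    print(result["confidence"], [s["source"] for s in result["sources"]])

If no LLM provider is configured, the call still succeeds via the template-based fallback path, so the sketch is usable for smoke testing without API keys.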
src/storage/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Storage module for vector database operations.
3
+
4
+ This module contains components for storing and retrieving
5
+ vector embeddings using Pinecone.
6
+ """
src/storage/vector_db.py ADDED
@@ -0,0 +1,729 @@
1
+ """
2
+ Vector Database Module
3
+
4
+ This module is responsible for storing and indexing vector embeddings
5
+ for efficient retrieval using Pinecone with complete functionality.
6
+
7
+ Technology: Pinecone
8
+ """
9
+
10
+ import logging
11
+ import os
12
+ import time
13
+ import uuid
14
+ import hashlib
15
+ from datetime import datetime
16
+ from typing import Dict, List, Any, Optional, Union
17
+
18
+ # Import Pinecone and related libraries
19
+ try:
20
+ import pinecone
21
+ from pinecone import Pinecone, ServerlessSpec
22
+ except ImportError as e:
23
+ logging.warning(f"Pinecone library not installed: {e}")
24
+
25
+ from utils.error_handler import VectorStorageError, error_handler, ErrorType
26
+
27
+
28
+ class VectorDB:
29
+ """
30
+ Stores and indexes vector embeddings for efficient retrieval using Pinecone with full functionality.
31
+
32
+ Features:
33
+ - Complete Pinecone integration
34
+ - Index management (create, update, delete)
35
+ - Batch upsert operations with optimization
36
+ - Advanced similarity search with metadata filtering
37
+ - Statistics and monitoring
38
+ """
39
+
40
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
41
+ """
42
+ Initialize the VectorDB with configuration.
43
+
44
+ Args:
45
+ config: Configuration dictionary with Pinecone parameters
46
+ """
47
+ self.config = config or {}
48
+ self.logger = logging.getLogger(__name__)
49
+
50
+ # Configuration settings
51
+ self.api_key = self.config.get("api_key", os.environ.get("PINECONE_API_KEY"))
52
+ self.environment = self.config.get("environment", "us-west1-gcp")
53
+ self.index_name = self.config.get("index_name", "rag-ai-index")
54
+ self.dimension = self.config.get(
55
+ "dimension", 3072
56
+ ) # ✅ Fixed: Match Gemini embedding dimension
57
+ self.metric = self.config.get("metric", "cosine")
58
+ self.batch_size = self.config.get("batch_size", 100)
59
+
60
+ # Performance settings
61
+ self.max_metadata_size = self.config.get(
62
+ "max_metadata_size", 40960
63
+ ) # 40KB limit
64
+ self.upsert_timeout = self.config.get("upsert_timeout", 60)
65
+ self.query_timeout = self.config.get("query_timeout", 30)
66
+
67
+ # Statistics tracking
68
+ self.stats = {
69
+ "vectors_stored": 0,
70
+ "vectors_queried": 0,
71
+ "vectors_deleted": 0,
72
+ "batch_operations": 0,
73
+ "failed_operations": 0,
74
+ "start_time": datetime.now(),
75
+ }
76
+
77
+ # Initialize Pinecone client
78
+ self.pc = None
79
+ self.index = None
80
+ self._initialize_client()
81
+
82
+ def _initialize_client(self):
83
+ """Initialize Pinecone client and index with validation."""
84
+ if not self.api_key:
85
+ self.logger.warning(
86
+ "No Pinecone API key provided. Vector storage will not be available."
87
+ )
88
+ return
89
+
90
+ try:
91
+ # Initialize Pinecone client
92
+ self.pc = Pinecone(api_key=self.api_key)
93
+
94
+ # Check if index exists, create if not
95
+ self._ensure_index_exists()
96
+
97
+ # Connect to index
98
+ self.index = self.pc.Index(self.index_name)
99
+
100
+ # Test connection
101
+ self._test_connection()
102
+
103
+ self.logger.info(
104
+ f"Pinecone client initialized successfully with index: {self.index_name}"
105
+ )
106
+
107
+ except Exception as e:
108
+ self.logger.error(f" Failed to initialize Pinecone client: {str(e)}")
109
+ self.pc = None
110
+ self.index = None
111
+
112
+ def _ensure_index_exists(self):
113
+ """Ensure the Pinecone index exists, create if necessary."""
114
+ try:
115
+ # List existing indexes
116
+ existing_indexes = [index.name for index in self.pc.list_indexes()]
117
+
118
+ if self.index_name not in existing_indexes:
119
+ self.logger.info(f"Creating new Pinecone index: {self.index_name}")
120
+
121
+ # Create index with serverless spec
122
+ self.pc.create_index(
123
+ name=self.index_name,
124
+ dimension=self.dimension,
125
+ metric=self.metric,
126
+ spec=ServerlessSpec(cloud="aws", region=self.environment),
127
+ )
128
+
129
+ # Wait for index to be ready
130
+ self._wait_for_index_ready()
131
+
132
+ self.logger.info(f"Index {self.index_name} created successfully")
133
+ else:
134
+ self.logger.info(f"Index {self.index_name} already exists")
135
+
136
+ except Exception as e:
137
+ raise VectorStorageError(f"Failed to ensure index exists: {str(e)}")
138
+
139
+ def _wait_for_index_ready(self, max_wait_time: int = 300):
140
+ """Wait for index to be ready for operations."""
141
+ start_time = time.time()
142
+
143
+ while time.time() - start_time < max_wait_time:
144
+ try:
145
+ index_stats = self.pc.describe_index(self.index_name)
146
+ if index_stats.status.ready:
147
+ self.logger.info(f"Index {self.index_name} is ready")
148
+ return
149
+
150
+ self.logger.info(f"Waiting for index to be ready...")
151
+ time.sleep(10)
152
+
153
+ except Exception as e:
154
+ self.logger.warning(f"Error checking index status: {str(e)}")
155
+ time.sleep(5)
156
+
157
+ raise VectorStorageError(
158
+ f"Index {self.index_name} not ready after {max_wait_time}s"
159
+ )
160
+
161
+ def _test_connection(self):
162
+ """Test connection to Pinecone index."""
163
+ try:
164
+ # Get index stats
165
+ stats = self.index.describe_index_stats()
166
+ self.logger.info(f"Connection test successful. Index stats: {stats}")
167
+
168
+ except Exception as e:
169
+ raise VectorStorageError(f"Connection test failed: {str(e)}")
170
+
171
+ @error_handler(ErrorType.VECTOR_STORAGE)
172
+ def store_embeddings(self, items: List[Dict[str, Any]]) -> bool:
173
+ """
174
+ Store embeddings in the vector database with full functionality.
175
+
176
+ Args:
177
+ items: List of dictionaries containing content, metadata, and embeddings
178
+
179
+ Returns:
180
+ True if successful, False otherwise
181
+ """
182
+ if not self.index or not items:
183
+ self.logger.warning("No index available or empty items list")
184
+ return False
185
+
186
+ # Filter and validate items
187
+ valid_items = self._validate_items(items)
188
+ if not valid_items:
189
+ self.logger.warning("No valid embeddings to store")
190
+ return False
191
+
192
+ self.logger.info(f"Storing {len(valid_items)} embeddings in Pinecone")
193
+ start_time = time.time()
194
+
195
+ try:
196
+ # Process in batches
197
+ total_batches = (len(valid_items) + self.batch_size - 1) // self.batch_size
198
+ successful_batches = 0
199
+
200
+ for i in range(0, len(valid_items), self.batch_size):
201
+ batch_num = (i // self.batch_size) + 1
202
+ batch = valid_items[i : i + self.batch_size]
203
+
204
+ self.logger.info(
205
+ f"Processing batch {batch_num}/{total_batches} ({len(batch)} vectors)"
206
+ )
207
+
208
+ success = self._store_batch(batch)
209
+ if success:
210
+ successful_batches += 1
211
+ self.stats["vectors_stored"] += len(batch)
212
+ else:
213
+ self.stats["failed_operations"] += 1
214
+ self.logger.error(f" Batch {batch_num} failed")
215
+
216
+ # Rate limiting between batches
217
+ if i + self.batch_size < len(valid_items):
218
+ time.sleep(0.1)
219
+
220
+ self.stats["batch_operations"] += total_batches
221
+ processing_time = time.time() - start_time
222
+
223
+ success_rate = successful_batches / total_batches * 100
224
+ self.logger.info(
225
+ f"Storage completed: {successful_batches}/{total_batches} batches successful ({success_rate:.1f}%) in {processing_time:.2f}s"
226
+ )
227
+
228
+ return successful_batches > 0
229
+
230
+ except Exception as e:
231
+ self.stats["failed_operations"] += 1
232
+ raise VectorStorageError(f"Failed to store embeddings: {str(e)}")
233
+
234
+ def _validate_items(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
235
+ """
236
+ Validate and prepare items for storage.
237
+
238
+ Args:
239
+ items: List of items to validate
240
+
241
+ Returns:
242
+ List of valid items
243
+ """
244
+ valid_items = []
245
+
246
+ for i, item in enumerate(items):
247
+ try:
248
+ # Check required fields
249
+ if not isinstance(item, dict):
250
+ self.logger.warning(f"Item {i} is not a dictionary")
251
+ continue
252
+
253
+ if "embedding" not in item or not item["embedding"]:
254
+ self.logger.warning(f"Item {i} missing embedding")
255
+ continue
256
+
257
+ embedding = item["embedding"]
258
+ if not isinstance(embedding, list) or len(embedding) != self.dimension:
259
+ self.logger.warning(
260
+ f"Item {i} has invalid embedding dimension: {len(embedding)} != {self.dimension}"
261
+ )
262
+ continue
263
+
264
+ # Prepare item
265
+ processed_item = self._prepare_item_for_storage(item, i)
266
+ valid_items.append(processed_item)
267
+
268
+ except Exception as e:
269
+ self.logger.warning(f"Error validating item {i}: {str(e)}")
270
+ continue
271
+
272
+ return valid_items
273
+
274
+ def _prepare_item_for_storage(
275
+ self, item: Dict[str, Any], index: int
276
+ ) -> Dict[str, Any]:
277
+ """
278
+ Prepare item for Pinecone storage.
279
+
280
+ Args:
281
+ item: Item to prepare
282
+ index: Item index for ID generation
283
+
284
+ Returns:
285
+ Prepared item
286
+ """
287
+ # 🆔 Generate unique ID
288
+ item_id = item.get("id")
289
+ if not item_id:
290
+ # Create ID from content hash + timestamp
291
+ content = item.get("content", "")
292
+ timestamp = str(int(time.time() * 1000))
293
+ content_hash = hashlib.md5(content.encode()).hexdigest()[:8]
294
+ item_id = f"doc_{content_hash}_{timestamp}_{index}"
295
+
296
+ # Prepare metadata
297
+ metadata = item.get("metadata", {}).copy()
298
+
299
+ # Add essential fields to metadata
300
+ metadata.update(
301
+ {
302
+ "content_preview": item.get("content", "")[:500], # First 500 chars
303
+ "content_length": len(item.get("content", "")),
304
+ "stored_at": datetime.now().isoformat(),
305
+ "source": item.get("source", "unknown"),
306
+ "document_type": item.get("document_type", "text"),
307
+ }
308
+ )
309
+
310
+ # Ensure metadata size limit
311
+ metadata = self._truncate_metadata(metadata)
312
+
313
+ return {"id": item_id, "values": item["embedding"], "metadata": metadata}
314
+
315
+ def _truncate_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
316
+ """
317
+ Truncate metadata to fit Pinecone size limits.
318
+
319
+ Args:
320
+ metadata: Original metadata
321
+
322
+ Returns:
323
+ Truncated metadata
324
+ """
325
+ import json
326
+
327
+ # 📏 Check current size
328
+ metadata_str = json.dumps(metadata, default=str)
329
+ if len(metadata_str.encode()) <= self.max_metadata_size:
330
+ return metadata
331
+
332
+ # Truncate large fields
333
+ truncated = metadata.copy()
334
+
335
+ # Truncate text fields progressively
336
+ text_fields = ["content_preview", "text", "description", "summary"]
337
+ for field in text_fields:
338
+ if field in truncated:
339
+ while (
340
+ len(json.dumps(truncated, default=str).encode())
341
+ > self.max_metadata_size
342
+ ):
343
+ current_length = len(str(truncated[field]))
344
+ if current_length <= 50:
345
+ break
346
+ truncated[field] = (
347
+ str(truncated[field])[: current_length // 2] + "..."
348
+ )
349
+
350
+ return truncated
351
+
352
+ def _store_batch(self, batch: List[Dict[str, Any]]) -> bool:
353
+ """
354
+ Store a batch of embeddings in Pinecone.
355
+
356
+ Args:
357
+ batch: List of prepared items
358
+
359
+ Returns:
360
+ True if successful
361
+ """
362
+ try:
363
+ # Upsert vectors to Pinecone
364
+ upsert_response = self.index.upsert(
365
+ vectors=batch, timeout=self.upsert_timeout
366
+ )
367
+
368
+ # Verify upsert success
369
+ if hasattr(upsert_response, "upserted_count"):
370
+ expected_count = len(batch)
371
+ actual_count = upsert_response.upserted_count
372
+
373
+ if actual_count != expected_count:
374
+ self.logger.warning(
375
+ f"Upsert count mismatch: {actual_count}/{expected_count}"
376
+ )
377
+ return False
378
+
379
+ self.logger.info(f"Successfully stored batch of {len(batch)} vectors")
380
+ return True
381
+
382
+ except Exception as e:
383
+ self.logger.error(f" Error storing batch: {str(e)}")
384
+ return False
385
+
386
+ @error_handler(ErrorType.VECTOR_STORAGE)
387
+ def search(
388
+ self,
389
+ query_embedding: List[float],
390
+ top_k: int = 5,
391
+ filter: Optional[Dict[str, Any]] = None,
392
+ include_metadata: bool = True,
393
+ include_values: bool = False,
394
+ ) -> List[Dict[str, Any]]:
395
+ """
396
+ Search for similar vectors with advanced filtering.
397
+
398
+ Args:
399
+ query_embedding: Query vector to search for
400
+ top_k: Number of results to return
401
+ filter: Optional metadata filter
402
+ include_metadata: Whether to include metadata in results
403
+ include_values: Whether to include vector values in results
404
+
405
+ Returns:
406
+ List of search results with scores and metadata
407
+ """
408
+ if not self.index or not query_embedding:
409
+ self.logger.warning("No index available or empty query embedding")
410
+ return []
411
+
412
+ # Validate query embedding
413
+ if len(query_embedding) != self.dimension:
414
+ raise VectorStorageError(
415
+ f"Query embedding dimension {len(query_embedding)} != {self.dimension}"
416
+ )
417
+
418
+ self.logger.info(f"Searching for similar vectors (top_k={top_k})")
419
+ start_time = time.time()
420
+
421
+ try:
422
+ # Perform similarity search
423
+ search_response = self.index.query(
424
+ vector=query_embedding,
425
+ top_k=top_k,
426
+ filter=filter,
427
+ include_metadata=include_metadata,
428
+ include_values=include_values,
429
+ timeout=self.query_timeout,
430
+ )
431
+
432
+ # Process results
433
+ results = []
434
+ if hasattr(search_response, "matches"):
435
+ for match in search_response.matches:
436
+ result = {
437
+ "id": match.id,
438
+ "score": float(match.score),
439
+ }
440
+
441
+ if include_metadata and hasattr(match, "metadata"):
442
+ result["metadata"] = (
443
+ dict(match.metadata) if match.metadata else {}
444
+ )
445
+
446
+ if include_values and hasattr(match, "values"):
447
+ result["values"] = match.values
448
+
449
+ results.append(result)
450
+
451
+ self.stats["vectors_queried"] += len(results)
452
+ search_time = time.time() - start_time
453
+
454
+ self.logger.info(
455
+ f"Search completed: {len(results)} results in {search_time:.3f}s"
456
+ )
457
+ return results
458
+
459
+ except Exception as e:
460
+ self.stats["failed_operations"] += 1
461
+ raise VectorStorageError(f"Search failed: {str(e)}")
462
+
463
+ @error_handler(ErrorType.VECTOR_STORAGE)
464
+ def delete(
465
+ self,
466
+ ids: Optional[List[str]] = None,
467
+ filter: Optional[Dict[str, Any]] = None,
468
+ delete_all: bool = False,
469
+ ) -> bool:
470
+ """
471
+ Delete vectors from the database.
472
+
473
+ Args:
474
+ ids: Optional list of vector IDs to delete
475
+ filter: Optional metadata filter for vectors to delete
476
+ delete_all: Whether to delete all vectors
477
+
478
+ Returns:
479
+ True if successful
480
+ """
481
+ if not self.index:
482
+ self.logger.warning("No index available")
483
+ return False
484
+
485
+ try:
486
+ if delete_all:
487
+ # Delete all vectors
488
+ self.index.delete(delete_all=True)
489
+ self.logger.info("Deleted all vectors from index")
490
+ self.stats["vectors_deleted"] += 1 # Approximate
491
+
492
+ elif ids:
493
+ # Delete by IDs
494
+ self.index.delete(ids=ids)
495
+ self.logger.info(f"Deleted {len(ids)} vectors by ID")
496
+ self.stats["vectors_deleted"] += len(ids)
497
+
498
+ elif filter:
499
+ # Delete by filter
500
+ self.index.delete(filter=filter)
501
+ self.logger.info(f"Deleted vectors by filter: {filter}")
502
+ self.stats["vectors_deleted"] += 1 # Approximate
503
+
504
+ else:
505
+ self.logger.warning("No deletion criteria provided")
506
+ return False
507
+
508
+ return True
509
+
510
+ except Exception as e:
511
+ self.stats["failed_operations"] += 1
512
+ raise VectorStorageError(f"Delete operation failed: {str(e)}")
513
+
514
+ def get_index_stats(self) -> Dict[str, Any]:
515
+ """
516
+ Get comprehensive index statistics.
517
+
518
+ Returns:
519
+ Dictionary with index statistics
520
+ """
521
+ if not self.index:
522
+ return {}
523
+
524
+ try:
525
+ # Get Pinecone index stats
526
+ pinecone_stats = self.index.describe_index_stats()
527
+
528
+ # Combine with internal stats
529
+ runtime = datetime.now() - self.stats["start_time"]
530
+
531
+ return {
532
+ "pinecone_stats": {
533
+ "total_vector_count": pinecone_stats.total_vector_count,
534
+ "dimension": pinecone_stats.dimension,
535
+ "index_fullness": pinecone_stats.index_fullness,
536
+ "namespaces": (
537
+ dict(pinecone_stats.namespaces)
538
+ if pinecone_stats.namespaces
539
+ else {}
540
+ ),
541
+ },
542
+ "internal_stats": {
543
+ **self.stats,
544
+ "runtime_seconds": runtime.total_seconds(),
545
+ "avg_vectors_per_batch": (
546
+ self.stats["vectors_stored"]
547
+ / max(1, self.stats["batch_operations"])
548
+ ),
549
+ "success_rate": (
550
+ (
551
+ self.stats["batch_operations"]
552
+ - self.stats["failed_operations"]
553
+ )
554
+ / max(1, self.stats["batch_operations"])
555
+ * 100
556
+ ),
557
+ },
558
+ "configuration": {
559
+ "index_name": self.index_name,
560
+ "dimension": self.dimension,
561
+ "metric": self.metric,
562
+ "batch_size": self.batch_size,
563
+ },
564
+ }
565
+
566
+ except Exception as e:
567
+ self.logger.error(f" Error getting index stats: {str(e)}")
568
+ return {"error": str(e)}
569
+
570
+ def health_check(self) -> Dict[str, Any]:
571
+ """
572
+ Perform health check on the vector database.
573
+
574
+ Returns:
575
+ Health check results
576
+ """
577
+ health = {
578
+ "status": "unknown",
579
+ "timestamp": datetime.now().isoformat(),
580
+ "checks": {},
581
+ }
582
+
583
+ try:
584
+ # Check API connection
585
+ if self.pc:
586
+ health["checks"]["api_connection"] = "Connected"
587
+ else:
588
+ health["checks"]["api_connection"] = " Not connected"
589
+ health["status"] = "unhealthy"
590
+ return health
591
+
592
+ # Check index availability
593
+ if self.index:
594
+ health["checks"]["index_available"] = "Available"
595
+ else:
596
+ health["checks"]["index_available"] = " Not available"
597
+ health["status"] = "unhealthy"
598
+ return health
599
+
600
+ # Test query operation
601
+ try:
602
+ test_vector = [0.1] * self.dimension
603
+ self.index.query(vector=test_vector, top_k=1, timeout=5)
604
+ health["checks"]["query_operation"] = "Working"
605
+ except Exception as e:
606
+ health["checks"]["query_operation"] = f" Failed: {str(e)}"
607
+ health["status"] = "degraded"
608
+
609
+ # Check index stats
610
+ try:
611
+ stats = self.index.describe_index_stats()
612
+ health["checks"]["index_stats"] = f"{stats.total_vector_count} vectors"
613
+ except Exception as e:
614
+ health["checks"]["index_stats"] = f" Failed: {str(e)}"
615
+
616
+ # 🎯 Overall status
617
+ if health["status"] == "unknown":
618
+ health["status"] = "healthy"
619
+
620
+ except Exception as e:
621
+ health["status"] = "unhealthy"
622
+ health["error"] = str(e)
623
+
624
+ return health
625
+
626
+ def reset_stats(self):
627
+ """Reset internal statistics."""
628
+ self.stats = {
629
+ "vectors_stored": 0,
630
+ "vectors_queried": 0,
631
+ "vectors_deleted": 0,
632
+ "batch_operations": 0,
633
+ "failed_operations": 0,
634
+ "start_time": datetime.now(),
635
+ }
636
+ self.logger.info("Statistics reset")
637
+
638
+ def get_stats(self) -> Dict[str, Any]:
639
+ """
640
+ Get simplified stats for UI display.
641
+
642
+ Returns:
643
+ Dictionary with basic statistics
644
+ """
645
+ try:
646
+ if not self.index:
647
+ return {"total_vectors": 0, "status": "disconnected"}
648
+
649
+ # Get Pinecone stats
650
+ pinecone_stats = self.index.describe_index_stats()
651
+
652
+ return {
653
+ "total_vectors": pinecone_stats.total_vector_count,
654
+ "dimension": pinecone_stats.dimension,
655
+ "index_fullness": pinecone_stats.index_fullness,
656
+ "status": "connected",
657
+ }
658
+ except Exception as e:
659
+ self.logger.warning(f"Could not get stats: {e}")
660
+ return {"total_vectors": 0, "status": "error", "error": str(e)}
661
+
662
+ def get_unique_sources(self) -> List[Dict[str, Any]]:
663
+ """
664
+ Get unique sources from stored vectors.
665
+
666
+ Returns:
667
+ List of unique sources with metadata
668
+ """
669
+ try:
670
+ if not self.index:
671
+ return []
672
+
673
+ # This is a simplified approach - in a real implementation,
674
+ # you might want to maintain a separate metadata index
675
+ # For now, we sample stored vectors via a query and derive unique sources from their metadata
676
+
677
+ # Try to get some sample vectors to extract sources
678
+ test_vector = [0.1] * self.dimension
679
+ results = self.index.query(
680
+ vector=test_vector,
681
+ top_k=100, # Get more results to find unique sources
682
+ include_metadata=True,
683
+ )
684
+
685
+ sources = {}
686
+ for match in results.matches:
687
+ if hasattr(match, "metadata") and match.metadata:
688
+ source = match.metadata.get("source", "Unknown")
689
+ if source not in sources:
690
+ sources[source] = {
691
+ "source": source,
692
+ "chunk_count": 1,
693
+ "added_date": match.metadata.get("stored_at", "Unknown"),
694
+ }
695
+ else:
696
+ sources[source]["chunk_count"] += 1
697
+
698
+ return list(sources.values())
699
+
700
+ except Exception as e:
701
+ self.logger.warning(f"Could not get unique sources: {e}")
702
+ return []
703
+
704
+ def list_documents(self) -> List[Dict[str, Any]]:
705
+ """
706
+ List all documents in the vector database.
707
+
708
+ Returns:
709
+ List of document information
710
+ """
711
+ try:
712
+ # Get unique sources and format as documents
713
+ sources = self.get_unique_sources()
714
+ documents = []
715
+
716
+ for source_info in sources:
717
+ documents.append(
718
+ {
719
+ "name": source_info["source"],
720
+ "chunks": source_info["chunk_count"],
721
+ "date": source_info["added_date"],
722
+ }
723
+ )
724
+
725
+ return documents
726
+
727
+ except Exception as e:
728
+ self.logger.warning(f"Could not list documents: {e}")
729
+ return []
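A short, hedged usage sketch of the VectorDB class above, assuming PINECONE_API_KEY is set and that the module is importable as `storage.vector_db` (the import path and the placeholder embedding values are assumptions; the store_embeddings and search signatures come from the code above). The embedding length must match the configured dimension, 3072 by default:

    # Hedged sketch: import path assumed; placeholder vectors stand in for real embeddings.
    from storage.vector_db import VectorDB

    db = VectorDB(config={"index_name": "rag-ai-index", "dimension": 3072})

    # Each item needs an "embedding" of exactly `dimension` floats plus content and metadata.
    items = [{
        "content": "Example chunk of text about vector storage.",
        "source": "notes.pdf",
        "embedding": [0.01] * 3072,  # placeholder values; real vectors come from the embedding module
        "metadata": {"page": 1},
    }]
    db.store_embeddings(items)

    # Similarity search returns ids, scores, and optionally metadata for each match.
    matches = db.search(query_embedding=[0.01] * 3072, top_k=3, filter={"source": "notes.pdf"})
    for match in matches:
        print(match["id"], match["score"], match.get("metadata", {}))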
src/ui/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ UI module for the Gradio user interface.
3
+
4
+ This module contains components for providing an intuitive
5
+ interface for document upload, URL input, and querying.
6
+ """
src/ui/gradio_app.py ADDED
The diff for this file is too large to render. See raw diff
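Since the gradio_app.py diff is not rendered here, the snippet below is only a generic, hedged illustration of how a Gradio front end can wire a query handler to the UI described by the defaults further down (title and port come from config_manager.py); none of the function or variable names are taken from the actual file:

    # Generic illustration only; not the contents of src/ui/gradio_app.py.
    import gradio as gr

    def answer_question(question: str) -> str:
        # Placeholder handler; the real app presumably routes the question through the RAG pipeline.
        return f"(demo) You asked: {question}"

    with gr.Blocks(title="AI Embedded Knowledge Agent") as demo:
        query = gr.Textbox(label="Ask a question")
        output = gr.Markdown()
        query.submit(answer_question, inputs=query, outputs=output)

    if __name__ == "__main__":
        demo.launch(server_port=7860, share=False)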
 
src/utils/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Utils module for configuration management and error handling.
3
+
4
+ This module contains utility components for managing configuration
5
+ and handling errors throughout the application.
6
+ """
src/utils/config_manager.py ADDED
@@ -0,0 +1,279 @@
1
+ """
2
+ Configuration Manager Module
3
+
4
+ This module handles loading and managing configuration settings
5
+ for the RAG AI system.
6
+ """
7
+
8
+ import os
9
+ import yaml
10
+ import logging
11
+ from typing import Dict, Any, Optional
12
+ from pathlib import Path
13
+
14
+
15
+ class ConfigManager:
16
+ """
17
+ Manages configuration settings for the RAG AI system.
18
+
19
+ Features:
20
+ - YAML configuration file loading
21
+ - Environment variable override
22
+ - Default configuration values
23
+ - Configuration validation
24
+ """
25
+
26
+ def __init__(self, config_path: Optional[str] = None):
27
+ """
28
+ Initialize the ConfigManager.
29
+
30
+ Args:
31
+ config_path: Path to the configuration file (defaults to config/config.yaml)
32
+ """
33
+ self.logger = logging.getLogger(__name__)
34
+
35
+ # Set default config path
36
+ if config_path is None:
37
+ config_path = os.path.join(
38
+ os.path.dirname(__file__), "..", "..", "config", "config.yaml"
39
+ )
40
+
41
+ self.config_path = Path(config_path)
42
+ self.config = self._load_config()
43
+
44
+ def _load_config(self) -> Dict[str, Any]:
45
+ """
46
+ Load configuration from file and environment variables.
47
+
48
+ Returns:
49
+ Configuration dictionary
50
+ """
51
+ # Start with default configuration
52
+ config = self._get_default_config()
53
+
54
+ # Load from YAML file if it exists
55
+ if self.config_path.exists():
56
+ try:
57
+ with open(self.config_path, "r", encoding="utf-8") as f:
58
+ file_config = yaml.safe_load(f) or {}
59
+ config = self._merge_configs(config, file_config)
60
+ self.logger.info(f"Loaded configuration from {self.config_path}")
61
+ except Exception as e:
62
+ self.logger.warning(
63
+ f"Failed to load config file {self.config_path}: {str(e)}"
64
+ )
65
+ else:
66
+ self.logger.warning(f"Config file not found: {self.config_path}")
67
+
68
+ # Override with environment variables
69
+ config = self._apply_env_overrides(config)
70
+
71
+ # Validate configuration
72
+ self._validate_config(config)
73
+
74
+ return config
75
+
76
+ def _get_default_config(self) -> Dict[str, Any]:
77
+ """
78
+ Get default configuration values.
79
+
80
+ Returns:
81
+ Default configuration dictionary
82
+ """
83
+ return {
84
+ "api_keys": {
85
+ "gemini_api_key": "",
86
+ "pinecone_api_key": "",
87
+ "openai_api_key": "",
88
+ },
89
+ "vector_db": {
90
+ "provider": "pinecone",
91
+ "index_name": "rag-ai-index",
92
+ "dimension": 3072, # ✅ Fixed: Match Gemini embedding dimension
93
+ "metric": "cosine",
94
+ "environment": "us-west1-gcp",
95
+ },
96
+ "embedding": {
97
+ "model": "gemini-embedding-exp-03-07",
98
+ "batch_size": 5,
99
+ "max_retries": 3,
100
+ "retry_delay": 1,
101
+ },
102
+ "document_processing": {
103
+ "chunk_size": 1000,
104
+ "chunk_overlap": 200,
105
+ "min_chunk_size": 100,
106
+ "max_file_size_mb": 50,
107
+ },
108
+ "url_processing": {
109
+ "max_depth": 1,
110
+ "follow_links": True,
111
+ "max_pages": 10,
112
+ "timeout": 10,
113
+ },
114
+ "rag": {
115
+ "top_k": 5,
116
+ "similarity_threshold": 0.7,
117
+ "max_context_length": 4000,
118
+ "model": "gpt-3.5-turbo",
119
+ "max_tokens": 500,
120
+ "temperature": 0.7,
121
+ },
122
+ "ui": {
123
+ "title": "AI Embedded Knowledge Agent",
124
+ "description": "Upload documents or provide URLs to build your knowledge base, then ask questions!",
125
+ "theme": "default",
126
+ "share": False,
127
+ "port": 7860,
128
+ },
129
+ "logging": {
130
+ "level": "INFO",
131
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
132
+ },
133
+ }
134
+
135
+ def _merge_configs(
136
+ self, base: Dict[str, Any], override: Dict[str, Any]
137
+ ) -> Dict[str, Any]:
138
+ """
139
+ Recursively merge two configuration dictionaries.
140
+
141
+ Args:
142
+ base: Base configuration dictionary
143
+ override: Override configuration dictionary
144
+
145
+ Returns:
146
+ Merged configuration dictionary
147
+ """
148
+ result = base.copy()
149
+
150
+ for key, value in override.items():
151
+ if (
152
+ key in result
153
+ and isinstance(result[key], dict)
154
+ and isinstance(value, dict)
155
+ ):
156
+ result[key] = self._merge_configs(result[key], value)
157
+ else:
158
+ result[key] = value
159
+
160
+ return result
161
+
162
+ def _apply_env_overrides(self, config: Dict[str, Any]) -> Dict[str, Any]:
163
+ """
164
+ Apply environment variable overrides to configuration.
165
+
166
+ Args:
167
+ config: Configuration dictionary
168
+
169
+ Returns:
170
+ Configuration with environment overrides applied
171
+ """
172
+ # API Keys
173
+ if os.environ.get("GEMINI_API_KEY"):
174
+ config["api_keys"]["gemini_api_key"] = os.environ["GEMINI_API_KEY"]
175
+
176
+ if os.environ.get("PINECONE_API_KEY"):
177
+ config["api_keys"]["pinecone_api_key"] = os.environ["PINECONE_API_KEY"]
178
+
179
+ if os.environ.get("OPENAI_API_KEY"):
180
+ config["api_keys"]["openai_api_key"] = os.environ["OPENAI_API_KEY"]
181
+
182
+ # Pinecone settings
183
+ if os.environ.get("PINECONE_ENVIRONMENT"):
184
+ config["vector_db"]["environment"] = os.environ["PINECONE_ENVIRONMENT"]
185
+
186
+ if os.environ.get("PINECONE_INDEX_NAME"):
187
+ config["vector_db"]["index_name"] = os.environ["PINECONE_INDEX_NAME"]
188
+
189
+ # UI settings
190
+ if os.environ.get("GRADIO_SHARE"):
191
+ config["ui"]["share"] = os.environ["GRADIO_SHARE"].lower() == "true"
192
+
193
+ if os.environ.get("PORT"):
194
+ try:
195
+ config["ui"]["port"] = int(os.environ["PORT"])
196
+ except ValueError:
197
+ self.logger.warning(f"Invalid PORT value: {os.environ['PORT']}")
198
+
199
+ return config
200
+
201
+ def _validate_config(self, config: Dict[str, Any]) -> None:
202
+ """
203
+ Validate configuration values.
204
+
205
+ Args:
206
+ config: Configuration dictionary to validate
207
+ """
208
+ # Check required API keys
209
+ if not config["api_keys"]["gemini_api_key"]:
210
+ self.logger.warning("Gemini API key not configured")
211
+
212
+ if not config["api_keys"]["pinecone_api_key"]:
213
+ self.logger.warning("Pinecone API key not configured")
214
+
215
+ # Validate numeric values
216
+ if config["document_processing"]["chunk_size"] <= 0:
217
+ raise ValueError("chunk_size must be positive")
218
+
219
+ if config["rag"]["top_k"] <= 0:
220
+ raise ValueError("top_k must be positive")
221
+
222
+ if not 0 <= config["rag"]["similarity_threshold"] <= 1:
223
+ raise ValueError("similarity_threshold must be between 0 and 1")
224
+
225
+ def get(self, key: str, default: Any = None) -> Any:
226
+ """
227
+ Get a configuration value using dot notation.
228
+
229
+ Args:
230
+ key: Configuration key (e.g., 'vector_db.index_name')
231
+ default: Default value if key not found
232
+
233
+ Returns:
234
+ Configuration value
235
+ """
236
+ keys = key.split(".")
237
+ value = self.config
238
+
239
+ try:
240
+ for k in keys:
241
+ value = value[k]
242
+ return value
243
+ except (KeyError, TypeError):
244
+ return default
245
+
246
+ def set(self, key: str, value: Any) -> None:
247
+ """
248
+ Set a configuration value using dot notation.
249
+
250
+ Args:
251
+ key: Configuration key (e.g., 'vector_db.index_name')
252
+ value: Value to set
253
+ """
254
+ keys = key.split(".")
255
+ config = self.config
256
+
257
+ for k in keys[:-1]:
258
+ if k not in config:
259
+ config[k] = {}
260
+ config = config[k]
261
+
262
+ config[keys[-1]] = value
263
+
264
+ def get_section(self, section: str) -> Dict[str, Any]:
265
+ """
266
+ Get an entire configuration section.
267
+
268
+ Args:
269
+ section: Section name
270
+
271
+ Returns:
272
+ Configuration section dictionary
273
+ """
274
+ return self.config.get(section, {})
275
+
276
+ def reload(self) -> None:
277
+ """Reload configuration from file."""
278
+ self.config = self._load_config()
279
+ self.logger.info("Configuration reloaded")
src/utils/error_handler.py ADDED
@@ -0,0 +1,383 @@
1
+ """
2
+ Error Handler Module
3
+
4
+ This module provides centralized error handling and logging
5
+ for the RAG AI system.
6
+ """
7
+
8
+ import logging
9
+ import traceback
10
+ import functools
11
+ from typing import Any, Callable, Dict, Optional, Type, Union
12
+ from enum import Enum
13
+
14
+
15
+ class ErrorType(Enum):
16
+ """Enumeration of error types in the system."""
17
+
18
+ DOCUMENT_PROCESSING = "document_processing"
19
+ URL_PROCESSING = "url_processing"
20
+ EMBEDDING_GENERATION = "embedding_generation"
21
+ VECTOR_STORAGE = "vector_storage"
22
+ QUERY_PROCESSING = "query_processing"
23
+ RESPONSE_GENERATION = "response_generation"
24
+ API_ERROR = "api_error"
25
+ CONFIGURATION = "configuration"
26
+ UI_ERROR = "ui_error"
27
+ UNKNOWN = "unknown"
28
+
29
+
30
+ class RAGError(Exception):
31
+ """Base exception class for RAG AI system errors."""
32
+
33
+ def __init__(
34
+ self,
35
+ message: str,
36
+ error_type: ErrorType = ErrorType.UNKNOWN,
37
+ details: Optional[Dict[str, Any]] = None,
38
+ ):
39
+ """
40
+ Initialize RAGError.
41
+
42
+ Args:
43
+ message: Error message
44
+ error_type: Type of error
45
+ details: Additional error details
46
+ """
47
+ super().__init__(message)
48
+ self.error_type = error_type
49
+ self.details = details or {}
50
+ self.message = message
51
+
52
+
53
+ class DocumentProcessingError(RAGError):
54
+ """Exception for document processing errors."""
55
+
56
+ def __init__(
57
+ self,
58
+ message: str,
59
+ file_path: Optional[str] = None,
60
+ details: Optional[Dict[str, Any]] = None,
61
+ ):
62
+ details = details or {}
63
+ if file_path:
64
+ details["file_path"] = file_path
65
+ super().__init__(message, ErrorType.DOCUMENT_PROCESSING, details)
66
+
67
+
68
+ class URLProcessingError(RAGError):
69
+ """Exception for URL processing errors."""
70
+
71
+ def __init__(
72
+ self,
73
+ message: str,
74
+ url: Optional[str] = None,
75
+ details: Optional[Dict[str, Any]] = None,
76
+ ):
77
+ details = details or {}
78
+ if url:
79
+ details["url"] = url
80
+ super().__init__(message, ErrorType.URL_PROCESSING, details)
81
+
82
+
83
+ class EmbeddingError(RAGError):
84
+ """Exception for embedding generation errors."""
85
+
86
+ def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
87
+ super().__init__(message, ErrorType.EMBEDDING_GENERATION, details)
88
+
89
+
90
+ class VectorStorageError(RAGError):
91
+ """Exception for vector storage errors."""
92
+
93
+ def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
94
+ super().__init__(message, ErrorType.VECTOR_STORAGE, details)
95
+
96
+
97
+ class QueryProcessingError(RAGError):
98
+ """Exception for query processing errors."""
99
+
100
+ def __init__(
101
+ self,
102
+ message: str,
103
+ query: Optional[str] = None,
104
+ details: Optional[Dict[str, Any]] = None,
105
+ ):
106
+ details = details or {}
107
+ if query:
108
+ details["query"] = query
109
+ super().__init__(message, ErrorType.QUERY_PROCESSING, details)
110
+
111
+
112
+ class ResponseGenerationError(RAGError):
113
+ """Exception for response generation errors."""
114
+
115
+ def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
116
+ super().__init__(message, ErrorType.RESPONSE_GENERATION, details)
117
+
118
+
119
+ class APIError(RAGError):
120
+ """Exception for API-related errors."""
121
+
122
+ def __init__(
123
+ self,
124
+ message: str,
125
+ api_name: Optional[str] = None,
126
+ status_code: Optional[int] = None,
127
+ details: Optional[Dict[str, Any]] = None,
128
+ ):
129
+ details = details or {}
130
+ if api_name:
131
+ details["api_name"] = api_name
132
+ if status_code:
133
+ details["status_code"] = status_code
134
+ super().__init__(message, ErrorType.API_ERROR, details)
135
+
136
+
137
+ class ConfigurationError(RAGError):
138
+ """Exception for configuration errors."""
139
+
140
+ def __init__(
141
+ self,
142
+ message: str,
143
+ config_key: Optional[str] = None,
144
+ details: Optional[Dict[str, Any]] = None,
145
+ ):
146
+ details = details or {}
147
+ if config_key:
148
+ details["config_key"] = config_key
149
+ super().__init__(message, ErrorType.CONFIGURATION, details)
150
+
151
+
152
+ class ErrorHandler:
153
+ """
154
+ Centralized error handler for the RAG AI system.
155
+
156
+ Features:
157
+ - Error logging with context
158
+ - Error categorization
159
+ - Error recovery suggestions
160
+ - Performance monitoring
161
+ """
162
+
163
+ def __init__(self, logger_name: str = __name__):
164
+ """
165
+ Initialize the ErrorHandler.
166
+
167
+ Args:
168
+ logger_name: Name for the logger instance
169
+ """
170
+ self.logger = logging.getLogger(logger_name)
171
+ self.error_counts = {}
172
+
173
+ def handle_error(
174
+ self, error: Exception, context: Optional[Dict[str, Any]] = None
175
+ ) -> Dict[str, Any]:
176
+ """
177
+ Handle an error with logging and context.
178
+
179
+ Args:
180
+ error: The exception that occurred
181
+ context: Additional context information
182
+
183
+ Returns:
184
+ Dictionary containing error information
185
+ """
186
+ context = context or {}
187
+
188
+ # Determine error type
189
+ if isinstance(error, RAGError):
190
+ error_type = error.error_type
191
+ error_details = error.details
192
+ else:
193
+ error_type = ErrorType.UNKNOWN
194
+ error_details = {}
195
+
196
+ # Create error info
197
+ error_info = {
198
+ "type": error_type.value,
199
+ "message": str(error),
200
+ "details": error_details,
201
+ "context": context,
202
+ "traceback": traceback.format_exc(),
203
+ }
204
+
205
+ # Log the error
206
+ self._log_error(error_info)
207
+
208
+ # Update error counts
209
+ self._update_error_counts(error_type)
210
+
211
+ # Add recovery suggestions
212
+ error_info["recovery_suggestions"] = self._get_recovery_suggestions(error_type)
213
+
214
+ return error_info
215
+
216
+ def _log_error(self, error_info: Dict[str, Any]) -> None:
217
+ """
218
+ Log error information.
219
+
220
+ Args:
221
+ error_info: Error information dictionary
222
+ """
223
+ error_type = error_info["type"]
224
+ message = error_info["message"]
225
+ context = error_info.get("context", {})
226
+
227
+ log_message = f"[{error_type.upper()}] {message}"
228
+
229
+ if context:
230
+ context_str = ", ".join([f"{k}={v}" for k, v in context.items()])
231
+ log_message += f" | Context: {context_str}"
232
+
233
+ self.logger.error(log_message)
234
+
235
+ # Log traceback at debug level
236
+ if error_info.get("traceback"):
237
+ self.logger.debug(f"Traceback: {error_info['traceback']}")
238
+
239
+ def _update_error_counts(self, error_type: ErrorType) -> None:
240
+ """
241
+ Update error count statistics.
242
+
243
+ Args:
244
+ error_type: Type of error that occurred
245
+ """
246
+ if error_type not in self.error_counts:
247
+ self.error_counts[error_type] = 0
248
+ self.error_counts[error_type] += 1
249
+
250
+ def _get_recovery_suggestions(self, error_type: ErrorType) -> list:
251
+ """
252
+ Get recovery suggestions for an error type.
253
+
254
+ Args:
255
+ error_type: Type of error
256
+
257
+ Returns:
258
+ List of recovery suggestions
259
+ """
260
+ suggestions = {
261
+ ErrorType.DOCUMENT_PROCESSING: [
262
+ "Check if the document format is supported",
263
+ "Verify the document is not corrupted",
264
+ "Ensure sufficient disk space for processing",
265
+ ],
266
+ ErrorType.URL_PROCESSING: [
267
+ "Verify the URL is accessible",
268
+ "Check internet connectivity",
269
+ "Ensure the website allows scraping",
270
+ ],
271
+ ErrorType.EMBEDDING_GENERATION: [
272
+ "Check Gemini API key configuration",
273
+ "Verify API quota and rate limits",
274
+ "Ensure text content is not empty",
275
+ ],
276
+ ErrorType.VECTOR_STORAGE: [
277
+ "Check Pinecone API key configuration",
278
+ "Verify Pinecone index exists",
279
+ "Check vector dimensions match index configuration",
280
+ ],
281
+ ErrorType.QUERY_PROCESSING: [
282
+ "Ensure query is not empty",
283
+ "Check if knowledge base has content",
284
+ "Verify embedding generation is working",
285
+ ],
286
+ ErrorType.RESPONSE_GENERATION: [
287
+ "Check language model configuration",
288
+ "Verify retrieved context is valid",
289
+ "Ensure API keys are configured",
290
+ ],
291
+ ErrorType.API_ERROR: [
292
+ "Check API key validity",
293
+ "Verify network connectivity",
294
+ "Check API rate limits and quotas",
295
+ ],
296
+ ErrorType.CONFIGURATION: [
297
+ "Check configuration file syntax",
298
+ "Verify all required settings are present",
299
+ "Ensure environment variables are set",
300
+ ],
301
+ ErrorType.UI_ERROR: [
302
+ "Refresh the page",
303
+ "Check browser compatibility",
304
+ "Verify Gradio is properly installed",
305
+ ],
306
+ }
307
+
308
+ return suggestions.get(error_type, ["Contact support for assistance"])
309
+
310
+ def get_error_statistics(self) -> Dict[str, Any]:
311
+ """
312
+ Get error statistics.
313
+
314
+ Returns:
315
+ Dictionary containing error statistics
316
+ """
317
+ total_errors = sum(self.error_counts.values())
318
+
319
+ return {
320
+ "total_errors": total_errors,
321
+ "error_counts": {
322
+ error_type.value: count
323
+ for error_type, count in self.error_counts.items()
324
+ },
325
+ "most_common_error": (
326
+ max(self.error_counts.items(), key=lambda x: x[1])[0].value
327
+ if self.error_counts
328
+ else None
329
+ ),
330
+ }
331
+
332
+
333
+ def error_handler(
334
+ error_type: ErrorType = ErrorType.UNKNOWN, context: Optional[Dict[str, Any]] = None
335
+ ):
336
+ """
337
+ Decorator for automatic error handling.
338
+
339
+ Args:
340
+ error_type: Type of error to handle
341
+ context: Additional context information
342
+
343
+ Returns:
344
+ Decorated function
345
+ """
346
+
347
+ def decorator(func: Callable) -> Callable:
348
+ @functools.wraps(func)
349
+ def wrapper(*args, **kwargs):
350
+ handler = ErrorHandler()
351
+ try:
352
+ return func(*args, **kwargs)
353
+ except Exception as e:
354
+ error_context = dict(context) if context else {}  # copy so the shared context dict is not mutated across calls
355
+ error_context.update(
356
+ {
357
+ "function": func.__name__,
358
+ "args": str(args)[:100], # Truncate for logging
359
+ "kwargs": str(kwargs)[:100],
360
+ }
361
+ )
362
+
363
+ error_info = handler.handle_error(e, error_context)
364
+
365
+ # Re-raise as appropriate RAG error type
366
+ if error_type == ErrorType.DOCUMENT_PROCESSING:
367
+ raise DocumentProcessingError(str(e), details=error_info)
368
+ elif error_type == ErrorType.URL_PROCESSING:
369
+ raise URLProcessingError(str(e), details=error_info)
370
+ elif error_type == ErrorType.EMBEDDING_GENERATION:
371
+ raise EmbeddingError(str(e), details=error_info)
372
+ elif error_type == ErrorType.VECTOR_STORAGE:
373
+ raise VectorStorageError(str(e), details=error_info)
374
+ elif error_type == ErrorType.QUERY_PROCESSING:
375
+ raise QueryProcessingError(str(e), details=error_info)
376
+ elif error_type == ErrorType.RESPONSE_GENERATION:
377
+ raise ResponseGenerationError(str(e), details=error_info)
378
+ else:
379
+ raise RAGError(str(e), error_type, error_info)
380
+
381
+ return wrapper
382
+
383
+ return decorator
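
A minimal usage sketch for the error handling utilities above. This is hypothetical: it assumes the file is importable as src.utils.error_handler and that DocumentProcessingError and the other RAG exception classes are defined earlier in the same module, as the decorator's re-raise logic implies; parse_document is an illustrative function, not part of the repository.

```python
# Hypothetical usage sketch; import path and parse_document are illustrative.
from src.utils.error_handler import (
    DocumentProcessingError,
    ErrorHandler,
    ErrorType,
    error_handler,
)


@error_handler(error_type=ErrorType.DOCUMENT_PROCESSING, context={"stage": "ingest"})
def parse_document(path: str) -> str:
    # Any exception raised here is logged, counted, and re-raised
    # as DocumentProcessingError by the wrapper above.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


try:
    parse_document("missing.pdf")
except DocumentProcessingError as exc:
    print(f"Ingestion failed: {exc}")

# Statistics are tracked per ErrorHandler instance.
handler = ErrorHandler()
handler.handle_error(ValueError("bad input"), {"stage": "demo"})
print(handler.get_error_statistics())  # {'total_errors': 1, ...}
```

Note that the wrapper creates a fresh ErrorHandler on every call, so counts from decorated functions are not aggregated across calls; the statistics shown above come from a standalone instance.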
src/utils/settings_manager.py ADDED
@@ -0,0 +1,676 @@
1
+ """
2
+ Settings Manager Module
3
+
4
+ This module provides secure environment variable management with UI integration,
5
+ supporting both cache and .env file storage options.
6
+
7
+ Features:
8
+ - 🔐 Secure API key handling with masking
9
+ - ⚡ Real-time validation and testing
10
+ - 💾 Dual storage backends (cache + .env file)
11
+ - 🛡️ Input sanitization and validation
12
+ - 🔄 Live system updates
13
+ """
14
+
15
+ import os
16
+ import re
17
+ import logging
18
+ import json
19
+ import time
20
+ from typing import Dict, Any, Optional, Tuple, List
21
+ from pathlib import Path
22
+ from datetime import datetime
23
+ import tempfile
24
+
25
+
26
+ class SettingsManager:
27
+ """
28
+ Manages environment variables with secure storage and validation.
29
+
30
+ Features:
31
+ - Secure API key masking and validation
32
+ - Real-time connection testing
33
+ - Cache and .env file storage options
34
+ - Integration with existing ConfigManager
35
+ """
36
+
37
+ def __init__(self, config_manager=None):
38
+ """
39
+ Initialize the SettingsManager.
40
+
41
+ Args:
42
+ config_manager: Optional ConfigManager instance for integration
43
+ """
44
+ self.logger = logging.getLogger(__name__)
45
+ self.config_manager = config_manager
46
+
47
+ # 🔧 Cache storage for temporary settings
48
+ self._cache_storage = {}
49
+
50
+ # 📁 Project root for .env file
51
+ self.project_root = Path(__file__).parent.parent.parent
52
+ self.env_file_path = self.project_root / ".env"
53
+
54
+ # 🛡️ Supported environment variables with validation rules
55
+ self.supported_env_vars = {
56
+ "GEMINI_API_KEY": {
57
+ "required": True,
58
+ "description": "Google Gemini API Key for embeddings and LLM",
59
+ "format": r"^AIzaSy[A-Za-z0-9_-]{33}$",
60
+ "mask": True,
61
+ "test_function": self._test_gemini_connection,
62
+ "placeholder": "AIzaSy...",
63
+ "help_url": "https://aistudio.google.com/",
64
+ },
65
+ "PINECONE_API_KEY": {
66
+ "required": False,
67
+ "description": "Pinecone API Key for vector database",
68
+ "format": r"^pc-[A-Za-z0-9]{32}$",
69
+ "mask": True,
70
+ "test_function": self._test_pinecone_connection,
71
+ "placeholder": "pc-...",
72
+ "help_url": "https://www.pinecone.io/",
73
+ },
74
+ "OPENAI_API_KEY": {
75
+ "required": False,
76
+ "description": "OpenAI API Key for alternative LLM",
77
+ "format": r"^sk-[A-Za-z0-9]{48}$",
78
+ "mask": True,
79
+ "test_function": self._test_openai_connection,
80
+ "placeholder": "sk-...",
81
+ "help_url": "https://platform.openai.com/api-keys",
82
+ },
83
+ "TAVILY_API_KEY": {
84
+ "required": False,
85
+ "description": "Tavily API Key for live web search",
86
+ "format": r"^tvly-[A-Za-z0-9-]{20,50}$",
87
+ "mask": True,
88
+ "test_function": self._test_tavily_connection,
89
+ "placeholder": "tvly-dev-...",
90
+ "help_url": "https://app.tavily.com/sign-in",
91
+ },
92
+ "PINECONE_ENVIRONMENT": {
93
+ "required": False,
94
+ "description": "Pinecone environment region",
95
+ "format": r"^[a-z0-9-]+$",
96
+ "mask": False,
97
+ "default": "us-east-1",
98
+ "placeholder": "us-east-1",
99
+ "options": [
100
+ "us-east-1",
101
+ "us-west1-gcp",
102
+ "eu-west1-gcp",
103
+ "asia-southeast1-gcp",
104
+ ],
105
+ },
106
+ "PINECONE_INDEX_NAME": {
107
+ "required": False,
108
+ "description": "Pinecone index name",
109
+ "format": r"^[a-z0-9-]+$",
110
+ "mask": False,
111
+ "default": "rag-ai-index",
112
+ "placeholder": "rag-ai-index",
113
+ },
114
+ "GRADIO_SHARE": {
115
+ "required": False,
116
+ "description": "Enable Gradio public sharing",
117
+ "format": r"^(true|false)$",
118
+ "mask": False,
119
+ "default": "false",
120
+ "options": ["true", "false"],
121
+ },
122
+ "PORT": {
123
+ "required": False,
124
+ "description": "Server port number",
125
+ "format": r"^[1-9][0-9]{3,4}$",
126
+ "mask": False,
127
+ "default": "7860",
128
+ "placeholder": "7860",
129
+ },
130
+ }
131
+
132
+ self.logger.info("SettingsManager initialized successfully")
133
+
134
+ def get_current_settings(self) -> Dict[str, Any]:
135
+ """
136
+ Get current environment variable settings with status.
137
+
138
+ Returns:
139
+ Dictionary with current settings and their status
140
+ """
141
+ settings = {}
142
+
143
+ for var_name, config in self.supported_env_vars.items():
144
+ # 🔍 Get value from cache, environment, or default
145
+ value = self._get_env_value(var_name)
146
+
147
+ settings[var_name] = {
148
+ "value": (
149
+ self._mask_value(value, config.get("mask", False)) if value else ""
150
+ ),
151
+ "raw_value": value or "",
152
+ "is_set": bool(value),
153
+ "is_valid": (
154
+ self._validate_format(value, config.get("format"))
155
+ if value
156
+ else False
157
+ ),
158
+ "is_required": config.get("required", False),
159
+ "description": config.get("description", ""),
160
+ "placeholder": config.get("placeholder", ""),
161
+ "help_url": config.get("help_url", ""),
162
+ "options": config.get("options", []),
163
+ "source": self._get_value_source(var_name),
164
+ "last_tested": self._cache_storage.get(f"{var_name}_last_tested"),
165
+ "test_status": self._cache_storage.get(
166
+ f"{var_name}_test_status", "untested"
167
+ ),
168
+ }
169
+
170
+ return settings
171
+
172
+ def update_setting(
173
+ self, var_name: str, value: str, storage_type: str = "cache"
174
+ ) -> Dict[str, Any]:
175
+ """
176
+ Update an environment variable setting.
177
+
178
+ Args:
179
+ var_name: Environment variable name
180
+ value: New value
181
+ storage_type: "cache" or "env_file"
182
+
183
+ Returns:
184
+ Dictionary with operation result
185
+ """
186
+ try:
187
+ if var_name not in self.supported_env_vars:
188
+ return {
189
+ "success": False,
190
+ "error": f"Unsupported environment variable: {var_name}",
191
+ "status": "❌ Invalid variable",
192
+ }
193
+
194
+ config = self.supported_env_vars[var_name]
195
+
196
+ # 🛡️ Validate format
197
+ if value and not self._validate_format(value, config.get("format")):
198
+ return {
199
+ "success": False,
200
+ "error": f"Invalid format for {var_name}",
201
+ "status": "❌ Invalid format",
202
+ "expected_format": config.get("placeholder", ""),
203
+ }
204
+
205
+ # 💾 Store based on storage type
206
+ if storage_type == "cache":
207
+ self._cache_storage[var_name] = value
208
+ os.environ[var_name] = value # ⚡ Update current session
209
+ status_msg = "💾 Saved to cache"
210
+ elif storage_type == "env_file":
211
+ self._save_to_env_file(var_name, value)
212
+ os.environ[var_name] = value # ⚡ Update current session
213
+ status_msg = "📁 Saved to .env file"
214
+ else:
215
+ return {
216
+ "success": False,
217
+ "error": f"Invalid storage type: {storage_type}",
218
+ "status": "❌ Invalid storage type",
219
+ }
220
+
221
+ # 🔄 Update config manager if available
222
+ if self.config_manager:
223
+ try:
224
+ self.config_manager.reload()
225
+ except Exception as e:
226
+ self.logger.warning(f"Could not reload config manager: {e}")
227
+
228
+ self.logger.info(f"Updated {var_name} via {storage_type}")
229
+
230
+ return {
231
+ "success": True,
232
+ "status": f" {status_msg}",
233
+ "value": self._mask_value(value, config.get("mask", False)),
234
+ "storage_type": storage_type,
235
+ "timestamp": datetime.now().isoformat(),
236
+ }
237
+
238
+ except Exception as e:
239
+ self.logger.error(f"Error updating {var_name}: {e}")
240
+ return {"success": False, "error": str(e), "status": " Update failed"}
241
+
242
+ def test_connection(self, var_name: str) -> Dict[str, Any]:
243
+ """
244
+ Test API connection for a given environment variable.
245
+
246
+ Args:
247
+ var_name: Environment variable name
248
+
249
+ Returns:
250
+ Dictionary with test results
251
+ """
252
+ try:
253
+ if var_name not in self.supported_env_vars:
254
+ return {
255
+ "success": False,
256
+ "error": f"Cannot test {var_name}: not supported",
257
+ "status": "❌ Not testable",
258
+ }
259
+
260
+ config = self.supported_env_vars[var_name]
261
+ test_function = config.get("test_function")
262
+
263
+ if not test_function:
264
+ return {
265
+ "success": False,
266
+ "error": f"No test function available for {var_name}",
267
+ "status": "⚠️ No test available",
268
+ }
269
+
270
+ value = self._get_env_value(var_name)
271
+ if not value:
272
+ return {
273
+ "success": False,
274
+ "error": f"{var_name} is not set",
275
+ "status": "❌ Not configured",
276
+ }
277
+
278
+ # 🧪 Run the test
279
+ self.logger.info(f"Testing connection for {var_name}")
280
+ test_result = test_function(value)
281
+
282
+ # 📊 Cache test results
283
+ timestamp = datetime.now().isoformat()
284
+ self._cache_storage[f"{var_name}_last_tested"] = timestamp
285
+ self._cache_storage[f"{var_name}_test_status"] = (
286
+ "success" if test_result["success"] else "failed"
287
+ )
288
+
289
+ return {**test_result, "timestamp": timestamp, "variable": var_name}
290
+
291
+ except Exception as e:
292
+ self.logger.error(f"Error testing {var_name}: {e}")
293
+ error_result = {
294
+ "success": False,
295
+ "error": str(e),
296
+ "status": "❌ Test failed",
297
+ "timestamp": datetime.now().isoformat(),
298
+ }
299
+
300
+ # 📊 Cache failed test
301
+ self._cache_storage[f"{var_name}_last_tested"] = error_result["timestamp"]
302
+ self._cache_storage[f"{var_name}_test_status"] = "failed"
303
+
304
+ return error_result
305
+
306
+ def load_from_env_file(self) -> Dict[str, Any]:
307
+ """
308
+ Load settings from .env file.
309
+
310
+ Returns:
311
+ Dictionary with load results
312
+ """
313
+ try:
314
+ if not self.env_file_path.exists():
315
+ return {
316
+ "success": False,
317
+ "error": ".env file not found",
318
+ "status": "📁 No .env file found",
319
+ "loaded_count": 0,
320
+ }
321
+
322
+ loaded_vars = []
323
+
324
+ with open(self.env_file_path, "r", encoding="utf-8") as f:
325
+ for line_num, line in enumerate(f, 1):
326
+ line = line.strip()
327
+ if line and not line.startswith("#") and "=" in line:
328
+ try:
329
+ key, value = line.split("=", 1)
330
+ key = key.strip()
331
+ value = value.strip().strip("\"'") # Remove quotes
332
+
333
+ if key in self.supported_env_vars:
334
+ os.environ[key] = value
335
+ loaded_vars.append(key)
336
+ except Exception as e:
337
+ self.logger.warning(
338
+ f"Error parsing line {line_num} in .env: {e}"
339
+ )
340
+
341
+ # 🔄 Reload config manager
342
+ if self.config_manager:
343
+ try:
344
+ self.config_manager.reload()
345
+ except Exception as e:
346
+ self.logger.warning(f"Could not reload config manager: {e}")
347
+
348
+ return {
349
+ "success": True,
350
+ "status": f" Loaded {len(loaded_vars)} variables from .env",
351
+ "loaded_count": len(loaded_vars),
352
+ "loaded_variables": loaded_vars,
353
+ }
354
+
355
+ except Exception as e:
356
+ self.logger.error(f"Error loading from .env file: {e}")
357
+ return {
358
+ "success": False,
359
+ "error": str(e),
360
+ "status": " Failed to load .env file",
361
+ "loaded_count": 0,
362
+ }
363
+
364
+ def clear_cache(self) -> Dict[str, Any]:
365
+ """
366
+ Clear cached settings.
367
+
368
+ Returns:
369
+ Dictionary with operation result
370
+ """
371
+ try:
372
+ # 🗑️ Clear cache but preserve test results
373
+ cached_vars = [
374
+ key
375
+ for key in self._cache_storage.keys()
376
+ if key in self.supported_env_vars
377
+ ]
378
+
379
+ for var in cached_vars:
380
+ if var in self._cache_storage:
381
+ del self._cache_storage[var]
382
+ # Remove from current environment if it was cached
383
+ if var in os.environ:
384
+ del os.environ[var]
385
+
386
+ return {
387
+ "success": True,
388
+ "status": f"🗑️ Cleared {len(cached_vars)} cached variables",
389
+ "cleared_count": len(cached_vars),
390
+ }
391
+
392
+ except Exception as e:
393
+ self.logger.error(f"Error clearing cache: {e}")
394
+ return {
395
+ "success": False,
396
+ "error": str(e),
397
+ "status": " Failed to clear cache",
398
+ }
399
+
400
+ def export_settings(self, include_sensitive: bool = False) -> Dict[str, Any]:
401
+ """
402
+ Export current settings for backup/sharing.
403
+
404
+ Args:
405
+ include_sensitive: Whether to include API keys (masked)
406
+
407
+ Returns:
408
+ Dictionary with exported settings
409
+ """
410
+ try:
411
+ settings = self.get_current_settings()
412
+ exported = {}
413
+
414
+ for var_name, config in settings.items():
415
+ var_config = self.supported_env_vars[var_name]
416
+
417
+ # 🔐 Skip sensitive data if not requested
418
+ if var_config.get("mask", False) and not include_sensitive:
419
+ continue
420
+
421
+ exported[var_name] = {
422
+ "value": (
423
+ config["value"] if include_sensitive else config["raw_value"]
424
+ ),
425
+ "is_set": config["is_set"],
426
+ "source": config["source"],
427
+ "description": config["description"],
428
+ }
429
+
430
+ return {
431
+ "success": True,
432
+ "settings": exported,
433
+ "export_timestamp": datetime.now().isoformat(),
434
+ "include_sensitive": include_sensitive,
435
+ }
436
+
437
+ except Exception as e:
438
+ self.logger.error(f"Error exporting settings: {e}")
439
+ return {"success": False, "error": str(e)}
440
+
441
+ # 🔧 Private helper methods
442
+
443
+ def _get_env_value(self, var_name: str) -> Optional[str]:
444
+ """Get environment variable value from cache or environment."""
445
+ # Priority: cache > environment > default
446
+ if var_name in self._cache_storage:
447
+ return self._cache_storage[var_name]
448
+
449
+ env_value = os.environ.get(var_name)
450
+ if env_value:
451
+ return env_value
452
+
453
+ return self.supported_env_vars[var_name].get("default")
454
+
455
+ def _get_value_source(self, var_name: str) -> str:
456
+ """Determine the source of an environment variable value."""
457
+ if var_name in self._cache_storage:
458
+ return "cache"
459
+ elif os.environ.get(var_name):
460
+ return "environment"
461
+ elif self.supported_env_vars[var_name].get("default"):
462
+ return "default"
463
+ else:
464
+ return "unset"
465
+
466
+ def _mask_value(self, value: str, should_mask: bool) -> str:
467
+ """Mask sensitive values for display."""
468
+ if not value or not should_mask:
469
+ return value
470
+
471
+ if len(value) <= 8:
472
+ return "*" * len(value)
473
+
474
+ return value[:4] + "*" * (len(value) - 8) + value[-4:]
475
+
476
+ def _validate_format(self, value: str, format_pattern: Optional[str]) -> bool:
477
+ """Validate value against format pattern."""
478
+ if not format_pattern or not value:
479
+ return True
480
+
481
+ try:
482
+ return bool(re.match(format_pattern, value))
483
+ except Exception:
484
+ return False
485
+
486
+ def _save_to_env_file(self, var_name: str, value: str):
487
+ """Save environment variable to .env file."""
488
+ env_vars = {}
489
+
490
+ # 📖 Read existing .env file
491
+ if self.env_file_path.exists():
492
+ with open(self.env_file_path, "r", encoding="utf-8") as f:
493
+ for line in f:
494
+ line = line.strip()
495
+ if line and not line.startswith("#") and "=" in line:
496
+ try:
497
+ key, val = line.split("=", 1)
498
+ env_vars[key.strip()] = val.strip().strip("\"'")
499
+ except Exception as e:
500
+ self.logger.warning(f"Error parsing line in .env: {e}")
501
+
502
+ # ✏️ Update the variable
503
+ env_vars[var_name] = value
504
+
505
+ # 💾 Write back to file
506
+ with open(self.env_file_path, "w", encoding="utf-8") as f:
507
+ f.write("# Environment Variables for RAG AI System\n")
508
+ f.write(f"# Generated on {datetime.now().isoformat()}\n\n")
509
+
510
+ for key, val in env_vars.items():
511
+ # 🔐 Quote values that contain spaces or special characters
512
+ if " " in val or any(char in val for char in ["$", '"', "'"]):
513
+ f.write(f'{key}="{val}"\n')
514
+ else:
515
+ f.write(f"{key}={val}\n")
516
+
517
+ # 🧪 API Testing Functions
518
+
519
+ # Cache for Gemini client to avoid recreating it
520
+ _gemini_client_cache = None
521
+ _gemini_client_key = None
522
+ _gemini_last_test_time = None
523
+ _gemini_test_cooldown = 10 # seconds between tests
524
+
525
+ def _test_gemini_connection(self, api_key: str) -> Dict[str, Any]:
526
+ """Test Gemini API connection with caching and optimization."""
527
+ try:
528
+ # Check if we've tested this key recently
529
+ current_time = time.time()
530
+ if (
531
+ self._gemini_last_test_time
532
+ and api_key == self._gemini_client_key
533
+ and current_time - self._gemini_last_test_time
534
+ < self._gemini_test_cooldown
535
+ ):
536
+
537
+ self.logger.info(
538
+ "Using cached Gemini test result (within cooldown period)"
539
+ )
540
+ return {
541
+ "success": True,
542
+ "status": "✅ Gemini API connected (cached)",
543
+ "details": "Using cached test result",
544
+ }
545
+
546
+ import google.generativeai as genai
547
+
548
+ # Use cached client if the API key is the same
549
+ if api_key == self._gemini_client_key and self._gemini_client_cache:
550
+ self.logger.info("Using cached Gemini client")
551
+ client = self._gemini_client_cache
552
+ else:
553
+ # Configure new client
554
+ genai.configure(api_key=api_key)
555
+ self._gemini_client_cache = genai
556
+ self._gemini_client_key = api_key
557
+ client = genai
558
+
559
+ # 🧪 Simple test call - use embedding API instead of GenerativeModel
560
+ # This is faster and more efficient for testing connection
561
+ test_result = client.embed_content(
562
+ model="gemini-embedding-exp-03-07",
563
+ content="test connection",
564
+ task_type="retrieval_document",
565
+ )
566
+
567
+ # Update last test time
568
+ self._gemini_last_test_time = current_time
569
+
570
+ if test_result and "embedding" in test_result:
571
+ return {
572
+ "success": True,
573
+ "status": "✅ Gemini API connected",
574
+ "details": "API key is valid and working",
575
+ }
576
+ else:
577
+ return {
578
+ "success": False,
579
+ "status": "❌ Gemini API failed",
580
+ "error": "No embedding in response",
581
+ }
582
+
583
+ except Exception as e:
584
+ return {
585
+ "success": False,
586
+ "status": "❌ Gemini connection failed",
587
+ "error": str(e),
588
+ }
589
+
590
+ def _test_pinecone_connection(self, api_key: str) -> Dict[str, Any]:
591
+ """Test Pinecone API connection."""
592
+ try:
593
+ from pinecone import Pinecone
594
+
595
+ pc = Pinecone(api_key=api_key)
596
+
597
+ # 🧪 Test by listing indexes
598
+ indexes = pc.list_indexes()
599
+
600
+ return {
601
+ "success": True,
602
+ "status": "✅ Pinecone API connected",
603
+ "details": f"Found {len(indexes)} indexes",
604
+ }
605
+
606
+ except Exception as e:
607
+ return {
608
+ "success": False,
609
+ "status": "❌ Pinecone connection failed",
610
+ "error": str(e),
611
+ }
612
+
613
+ def _test_openai_connection(self, api_key: str) -> Dict[str, Any]:
614
+ """Test OpenAI API connection."""
615
+ try:
616
+ import openai
617
+
618
+ client = openai.OpenAI(api_key=api_key)
619
+
620
+ # 🧪 Test with a simple completion
621
+ response = client.chat.completions.create(
622
+ model="gpt-3.5-turbo",
623
+ messages=[{"role": "user", "content": "Hello"}],
624
+ max_tokens=5,
625
+ )
626
+
627
+ if response and response.choices:
628
+ return {
629
+ "success": True,
630
+ "status": "✅ OpenAI API connected",
631
+ "details": "API key is valid and working",
632
+ }
633
+ else:
634
+ return {
635
+ "success": False,
636
+ "status": "❌ OpenAI API failed",
637
+ "error": "No response from API",
638
+ }
639
+
640
+ except Exception as e:
641
+ return {
642
+ "success": False,
643
+ "status": " OpenAI connection failed",
644
+ "error": str(e),
645
+ }
646
+
647
+ def _test_tavily_connection(self, api_key: str) -> Dict[str, Any]:
648
+ """Test Tavily API connection."""
649
+ try:
650
+ from tavily import TavilyClient
651
+
652
+ # 🧪 Initialize client and test with a simple search
653
+ client = TavilyClient(api_key=api_key)
654
+
655
+ # Test with a minimal search query
656
+ response = client.search(query="test", max_results=1, search_depth="basic")
657
+
658
+ if response and isinstance(response, dict):
659
+ return {
660
+ "success": True,
661
+ "status": "✅ Tavily API connected",
662
+ "details": "API key is valid and working",
663
+ }
664
+ else:
665
+ return {
666
+ "success": False,
667
+ "status": "❌ Tavily API failed",
668
+ "error": "No valid response from API",
669
+ }
670
+
671
+ except Exception as e:
672
+ return {
673
+ "success": False,
674
+ "status": "❌ Tavily connection failed",
675
+ "error": str(e),
676
+ }
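
A usage sketch for SettingsManager, again hypothetical: it assumes the module is importable as src.utils.settings_manager, the values shown are placeholders rather than real keys, and the final connection test only succeeds if a valid GEMINI_API_KEY is actually configured.

```python
# Hypothetical usage sketch; import path and values are illustrative placeholders.
from src.utils.settings_manager import SettingsManager

manager = SettingsManager()

# Load any supported variables already present in the project's .env file.
print(manager.load_from_env_file()["status"])

# Store a non-sensitive setting in the in-memory cache; update_setting also
# mirrors the value into os.environ for the current session.
result = manager.update_setting(
    "PINECONE_INDEX_NAME", "rag-ai-index", storage_type="cache"
)
print(result["status"])  # reports where the value was stored

# Inspect current state; entries with mask=True come back masked by
# _mask_value() as the first four characters, asterisks, then the last four.
entry = manager.get_current_settings()["PINECONE_INDEX_NAME"]
print(entry["value"], entry["source"], entry["is_valid"])  # rag-ai-index cache True

# Run a live connection test (performs a real API call when the key is set).
print(manager.test_connection("GEMINI_API_KEY")["status"])
```

Passing storage_type="env_file" instead routes the value through _save_to_env_file(), which rewrites the project's .env with a generated header and quotes values containing spaces or special characters.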