Upload 31 files

- .gitattributes +35 -35
- .gitignore +10 -0
- API_KEYS_SETUP.md +146 -0
- README.md +49 -12
- app.py +884 -0
- config/config.yaml +269 -0
- docs/architecture.md +291 -0
- requirements.txt +103 -0
- src/embedding/__init__.py +6 -0
- src/embedding/embedding_generator.py +462 -0
- src/ingestion/__init__.py +6 -0
- src/ingestion/document_processor.py +668 -0
- src/ingestion/pipeline.py +495 -0
- src/ingestion/text_extractor.py +526 -0
- src/ingestion/url_processor.py +603 -0
- src/integrations/__init__.py +13 -0
- src/integrations/mcp_tavily_integration.py +308 -0
- src/rag/__init__.py +6 -0
- src/rag/live_search.py +523 -0
- src/rag/optimized_query_processor.py +275 -0
- src/rag/query_processor.py +427 -0
- src/rag/query_router.py +587 -0
- src/rag/response_generator.py +591 -0
- src/storage/__init__.py +6 -0
- src/storage/vector_db.py +729 -0
- src/ui/__init__.py +6 -0
- src/ui/gradio_app.py +0 -0
- src/utils/__init__.py +6 -0
- src/utils/config_manager.py +279 -0
- src/utils/error_handler.py +383 -0
- src/utils/settings_manager.py +676 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,10 @@
.env*
!.env.example

__pycache__/
*.py[cod]

logs/

**__pycache__/
API_KEYS_SETUP.md
ADDED
@@ -0,0 +1,146 @@
# 🔑 API Keys Setup Guide

## How to Get Pinecone API Key

### Step 1: Create Pinecone Account

1. Go to [https://www.pinecone.io/](https://www.pinecone.io/)
2. Click **"Sign Up"** or **"Get Started Free"**
3. Create an account with your email, or sign up with Google/GitHub
4. Verify your email address if required

### Step 2: Access Dashboard

1. Log into your Pinecone account
2. You'll be taken to the Pinecone Console/Dashboard
3. Look for **"API Keys"** in the left sidebar or navigation menu

### Step 3: Get Your API Key

1. Click on **"API Keys"** in the dashboard
2. You'll see your default API key listed
3. Click **"Copy"** or the copy icon next to the API key
4. Save this key securely - you'll need it for the application

## How to Get Gemini API Key

### Step 1: Go to Google AI Studio

1. Visit [https://aistudio.google.com/](https://aistudio.google.com/)
2. Sign in with your Google account

### Step 2: Get API Key

1. Click **"Get API Key"** in the top navigation
2. Click **"Create API Key"**
3. Select your Google Cloud project (or create a new one)
4. Copy the generated API key

## How to Get Tavily API Key

### Step 1: Create Tavily Account

1. Go to [https://app.tavily.com/](https://app.tavily.com/)
2. Click **"Sign Up"** and register with your email or use a social login
3. Verify your email address if prompted

### Step 2: Access API Keys

1. Log into your Tavily account
2. Navigate to the **"API Keys"** section in your dashboard
3. Click **"Create API Key"** if you don't have one yet
4. Copy the generated API key and store it securely

## 🚀 Quick Start Guide

### Option 1: Set Environment Variables Temporarily

**Windows Command Prompt:**

```cmd
set PINECONE_API_KEY=pc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
set GEMINI_API_KEY=AIzaSyxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
set TAVILY_API_KEY=tvly_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
python app.py
```

**Windows PowerShell:**

```powershell
$env:PINECONE_API_KEY="pc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
$env:GEMINI_API_KEY="AIzaSyxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
$env:TAVILY_API_KEY="tvly_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
python app.py
```

### Option 2: Create .env File (Recommended)

1. Create a file named `.env` in your project root:

```
PINECONE_API_KEY=pc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
GEMINI_API_KEY=AIzaSyxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
TAVILY_API_KEY=tvly_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Optional
```

2. Run the application:

```cmd
python app.py
```
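To confirm that the `.env` file is actually being picked up before launching the full app, a minimal check (a sketch assuming `python-dotenv` is installed; the variable names match those above) is:

```python
# Sketch: confirm .env is loaded, without printing any secrets.
# Assumes python-dotenv is installed (pip install python-dotenv).
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

for key in ("PINECONE_API_KEY", "GEMINI_API_KEY", "TAVILY_API_KEY"):
    print(f"{key}: {'set' if os.getenv(key) else 'missing'}")
```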

## 📋 Free Tier Information

### Pinecone Free Tier:

- ✅ 1 project
- ✅ 1 index
- ✅ 100K vectors
- ✅ Perfect for hackathons and testing

### Gemini Free Tier:

- ✅ 15 requests per minute
- ✅ 1 million tokens per day
- ✅ Sufficient for development and demos

### Tavily Free Tier:

- ✅ Generous free tier for testing and development
- ✅ Check [Tavily Pricing](https://www.tavily.com/pricing) for current limits

## 🔧 Troubleshooting

### If you get "Invalid API Key" errors:

1. Double-check that the API key is copied correctly
2. Make sure there are no extra spaces
3. Verify the environment variable is set: `echo %PINECONE_API_KEY%` (Command Prompt) or `$env:PINECONE_API_KEY` (PowerShell)

### If Pinecone connection fails:

1. Check your internet connection
2. Verify your Pinecone account is active
3. Make sure you're using the correct region (the default is usually fine)

## 🎯 Ready to Launch

Once you have all API keys:

1. **Set the environment variables**
2. **Run the application:**
   ```cmd
   python app.py
   ```
3. **Open your browser to:** `http://localhost:7860`
4. **Start uploading documents and asking questions!**

The application will now have full functionality with:

- ✅ Document processing and embedding
- ✅ Vector storage in Pinecone
- ✅ AI-powered question answering
- ✅ Beautiful Gradio interface

**🎉 Your AI Embedded Knowledge Agent will be fully operational!**
README.md
CHANGED
@@ -1,12 +1,49 @@
# 🧠 AI Embedded Knowledge Agent

A comprehensive Retrieval-Augmented Generation (RAG) system that allows you to upload documents, process URLs, and ask intelligent questions about your knowledge base. Built with modern AI technologies and optimized for deployment on Hugging Face Spaces.

## 🎥 Demo Video

Watch our comprehensive demo showcasing the live search capabilities and RAG system in action:

[Watch the demo on Loom](https://www.loom.com/share/f1a3c79b75ad4b65b528b2973612cdd9?sid=a0932972-2926-42ab-a031-3149e40b1b97)

_See how the system processes documents, performs live web searches, and generates intelligent responses with real-time data integration._

## ✨ Features

### 🔥 Core Capabilities

- **📄 Document Processing**: Support for PDF, DOCX, CSV, XLSX, PPTX, TXT, MD, and more
- **🌐 URL Processing**: Extract content from web pages with intelligent crawling
- **🔍 Live Web Search**: Real-time web search using the Tavily API for up-to-date information
- **🧠 Smart Q&A**: Ask questions and get contextual answers with source attribution
- **🎯 High Accuracy**: Advanced embedding and similarity search for precise results
- **⚡ Real-time Processing**: Fast document ingestion and query processing

### 🚀 Advanced Features

- **🤖 Multiple LLM Support**: Gemini 2.5 Flash and OpenAI GPT models with automatic fallback
- **📊 Analytics Dashboard**: Query analytics, system metrics, and performance monitoring
- **🔍 Smart Query Processing**: Query expansion, caching, and a suggestion system
- **📚 Knowledge Base Management**: View, manage, and export your knowledge base
- **🛡️ Robust Error Handling**: Graceful degradation and comprehensive error recovery
- **🎨 Beautiful UI**: Modern Gradio interface optimized for user experience

### 🏗️ Technical Excellence

- **🔧 Modular Architecture**: Clean, maintainable, and extensible codebase
- **⚙️ Configurable**: Comprehensive YAML configuration for all components
- **🔒 Secure**: Input sanitization, rate limiting, and security best practices
- **📈 Scalable**: Designed for production deployment with monitoring and health checks
- **🧪 Well-tested**: Comprehensive test suite and example usage

## 🎯 Project Origin & Agentic Vision

This project was conceived and developed for the **`agents-mcp-hackathon`** as an **`agent-demo-track`** submission, demonstrating intelligent knowledge retrieval and generation. Our vision was to build an autonomous AI agent capable of:

- **🧠 Intelligent Information Retrieval**: Acting as a smart agent to fetch, process, and synthesize information from diverse sources (documents, URLs, live web).
- **🚀 Dynamic Query Routing**: Intelligently deciding between local knowledge-base retrieval and real-time web search based on query intent (see the sketch below).
- **💡 Autonomous Knowledge Management**: Providing a self-contained system for building and querying a dynamic knowledge base.

This system embodies the principles of agentic AI, offering a powerful, self-sufficient solution for complex information needs.
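The query-routing behavior described above is easiest to see in miniature. The sketch below is hypothetical and simplified; it is not the actual `QueryRouter` from `src/rag/query_router.py`, which is not shown in this view:

```python
# Hypothetical, simplified illustration of the routing idea: honor an explicit
# search mode, otherwise fall back to a crude freshness heuristic.
def route(query: str, use_live_search: bool, search_mode: str = "auto") -> str:
    if search_mode in ("local_only", "live_only", "hybrid"):
        return search_mode  # explicit mode wins
    fresh_terms = ("today", "latest", "current", "news")
    if use_live_search and any(t in query.lower() for t in fresh_terms):
        return "hybrid"  # blend local retrieval with live web results
    return "local_only"
```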
app.py
ADDED
@@ -0,0 +1,884 @@
"""
AI Embedded Knowledge Agent - Main Application Entry Point

This is the main entry point for the RAG AI system that integrates all components
and launches the Gradio interface for deployment on Hugging Face.
"""

import nltk

nltk.download("punkt_tab")


import spacy.cli

spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")


import os
import sys
import logging
from pathlib import Path
from typing import Optional

# Load environment variables from .env file
try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    print(
        "python-dotenv not installed. Please install it with: pip install python-dotenv"
    )

# Add src directory to Python path
src_path = Path(__file__).parent / "src"
sys.path.insert(0, str(src_path))

# Import all components
from utils.config_manager import ConfigManager
from utils.error_handler import ErrorHandler, ErrorType
from ingestion.document_processor import DocumentProcessor
from ingestion.url_processor import URLProcessor
from ingestion.text_extractor import TextExtractor
from embedding.embedding_generator import EmbeddingGenerator
from storage.vector_db import VectorDB
from rag.optimized_query_processor import OptimizedQueryProcessor
from rag.response_generator import ResponseGenerator
from rag.live_search import LiveSearchProcessor
from rag.query_router import QueryRouter
from ui.gradio_app import GradioApp


class RAGSystem:
    """
    Main RAG AI system that orchestrates all components.

    This class integrates document processing, embedding generation,
    vector storage, and query processing into a unified system.
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        Initialize the RAG system with all components.

        Args:
            config_path: Path to configuration file
        """
        # Initialize configuration
        self.config_manager = ConfigManager(config_path)
        self.config = self.config_manager.config

        # Setup logging
        self._setup_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Initializing RAG AI System...")

        # Initialize error handler
        self.error_handler = ErrorHandler()

        # Validate environment and configuration
        self._validate_environment()

        # Initialize components
        self._initialize_components()

        # Run health checks
        self._run_startup_health_checks()

        self.logger.info("RAG AI System initialized successfully!")

    def _setup_logging(self):
        """Setup comprehensive logging configuration."""
        log_config = self.config.get("logging", {})
        log_level = getattr(logging, log_config.get("level", "INFO").upper())
        log_format = log_config.get(
            "format", "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )

        # Configure root logger with UTF-8 encoding
        import io

        utf8_stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
        logging.basicConfig(
            level=log_level,
            format=log_format,
            handlers=[logging.StreamHandler(utf8_stdout)],
        )

        # Create logs directory if specified
        log_file = log_config.get("file")
        if log_file:
            log_dir = Path(log_file).parent
            log_dir.mkdir(parents=True, exist_ok=True)

            # Add file handler with rotation
            try:
                from logging.handlers import RotatingFileHandler

                file_handler = RotatingFileHandler(
                    log_file,
                    maxBytes=log_config.get("max_file_size_mb", 10) * 1024 * 1024,
                    backupCount=log_config.get("backup_count", 5),
                )
                file_handler.setFormatter(logging.Formatter(log_format))
                logging.getLogger().addHandler(file_handler)
            except Exception as e:
                # self.logger is not assigned until after _setup_logging returns,
                # so use a module-level logger here.
                logging.getLogger(__name__).warning(
                    f"Could not setup file logging: {e}"
                )

    def _validate_environment(self):
        """Validate environment variables and configuration."""
        self.logger.info("Validating environment...")

        # Check required API keys
        required_keys = ["GEMINI_API_KEY"]
        optional_keys = ["PINECONE_API_KEY", "OPENAI_API_KEY"]

        missing_required = []
        for key in required_keys:
            if not os.getenv(key):
                missing_required.append(key)

        if missing_required:
            self.logger.error(
                f"Missing required environment variables: {missing_required}"
            )
            self.logger.error(
                "Please set the required API keys as environment variables"
            )
            # Don't raise an error in demo mode, just warn
            self.logger.warning("Running in demo mode with limited functionality")

        # Check optional keys
        missing_optional = []
        for key in optional_keys:
            if not os.getenv(key):
                missing_optional.append(key)

        if missing_optional:
            self.logger.warning(
                f"Missing optional environment variables: {missing_optional}"
            )
            self.logger.warning("Some features may be limited without these keys")

        # Validate configuration
        self._validate_configuration()

        self.logger.info("Environment validation completed")

    def _validate_configuration(self):
        """Validate configuration settings."""
        try:
            # Check embedding configuration
            embedding_config = self.config.get("embedding", {})
            if not embedding_config.get("model"):
                self.logger.warning("Embedding model not specified, using default")

            # Check vector database configuration
            vector_db_config = self.config.get("vector_db", {})
            if not vector_db_config.get("provider"):
                self.logger.warning(
                    "Vector database provider not specified, using default"
                )

            # Check RAG configuration
            rag_config = self.config.get("rag", {})
            if rag_config.get("top_k", 5) <= 0:
                self.logger.warning("Invalid top_k value, using default")

            self.logger.info("Configuration validation completed")

        except Exception as e:
            self.logger.warning(f"Configuration validation warning: {e}")

    def _initialize_components(self):
        """Initialize all system components with error handling."""
        try:
            self.logger.info("Initializing system components...")

            # Document processing components
            self.logger.info("Initializing document processing components...")
            self.document_processor = DocumentProcessor(
                self.config_manager.get_section("document_processing")
            )

            self.url_processor = URLProcessor(
                self.config_manager.get_section("url_processing")
            )

            self.text_extractor = TextExtractor(
                self.config_manager.get_section("document_processing")
            )

            # Embedding and storage components
            self.logger.info("Initializing embedding and storage components...")
            embedding_config = self.config_manager.get_section("embedding")
            embedding_config["api_key"] = os.getenv("GEMINI_API_KEY")

            self.embedding_generator = EmbeddingGenerator(embedding_config)

            vector_db_config = self.config_manager.get_section("vector_db")
            vector_db_config["api_key"] = os.getenv("PINECONE_API_KEY")

            self.vector_db = VectorDB(vector_db_config)

            # RAG components
            self.logger.info("Initializing RAG components...")
            self.query_processor = OptimizedQueryProcessor(
                self.embedding_generator,
                self.vector_db,
                self.config_manager.get_section("rag"),
            )

            rag_config = self.config_manager.get_section("rag")
            # Add API keys to RAG config for LLM initialization
            rag_config["gemini_api_key"] = os.getenv("GEMINI_API_KEY")
            rag_config["openai_api_key"] = os.getenv("OPENAI_API_KEY")

            self.response_generator = ResponseGenerator(rag_config)

            # Live Search components
            self.logger.info("Initializing Live Search components...")
            live_search_config = self.config_manager.get_section("live_search") or {}
            self.live_search_processor = LiveSearchProcessor(live_search_config)

            # Query Router for intelligent routing
            router_config = self.config_manager.get_section("query_router") or {}
            self.query_router = QueryRouter(
                self.query_processor, self.live_search_processor, router_config
            )

            self.logger.info("All components initialized successfully")

        except Exception as e:
            self.logger.error(f"Failed to initialize components: {str(e)}")
            # Don't raise in demo mode, continue with limited functionality
            self.logger.warning("Some components may not be fully functional")

    def _run_startup_health_checks(self):
        """Run health checks on all components."""
        self.logger.info("Running startup health checks...")

        component_labels = {
            "document_processor": "Document processor",
            "url_processor": "URL processor",
            "text_extractor": "Text extractor",
            "embedding_generator": "Embedding generator",
            "vector_db": "Vector database",
            "query_processor": "Query processor",
            "response_generator": "Response generator",
        }

        # A component counts as healthy if it was successfully constructed.
        health_status = {}
        for name, label in component_labels.items():
            health_status[name] = hasattr(self, name)
            if health_status[name]:
                self.logger.info(f"{label}: Healthy")
            else:
                self.logger.warning(f"{label}: Not available")

        # Overall health
        healthy_components = sum(health_status.values())
        total_components = len(health_status)

        self.logger.info(
            f"Health check complete: {healthy_components}/{total_components} components healthy"
        )

        if healthy_components < total_components:
            self.logger.warning("Some components are not fully functional")
            self.logger.warning("The system will run with limited capabilities")

    def process_document(self, file_path: str) -> dict:
        """
        Process a document through the complete pipeline.

        Args:
            file_path: Path to the document file

        Returns:
            Dictionary with processing results
        """
        try:
            self.logger.info(f"Processing document: {file_path}")

            # Check if components are available
            if not all(
                hasattr(self, attr)
                for attr in [
                    "document_processor",
                    "text_extractor",
                    "embedding_generator",
                    "vector_db",
                ]
            ):
                return {
                    "status": "error",
                    "error": "Required components not available",
                    "chunks_processed": 0,
                }

            # Step 1: Extract content from document
            doc_result = self.document_processor.process_document(file_path)

            if not doc_result or "content" not in doc_result:
                return {
                    "status": "error",
                    "error": "Failed to extract content from document",
                    "chunks_processed": 0,
                }

            # Step 2: Extract and chunk text
            text_chunks = self.text_extractor.process_text(
                doc_result["content"], doc_result.get("metadata", {})
            )

            if not text_chunks:
                return {
                    "status": "error",
                    "error": "No text chunks generated",
                    "chunks_processed": 0,
                }

            # Step 3: Generate embeddings
            embedded_chunks = self.embedding_generator.generate_embeddings(text_chunks)

            if not embedded_chunks:
                return {
                    "status": "error",
                    "error": "Failed to generate embeddings",
                    "chunks_processed": len(text_chunks),
                }

            # Step 4: Store in vector database
            storage_success = self.vector_db.store_embeddings(embedded_chunks)

            return {
                "status": "success" if storage_success else "partial_success",
                "chunks_processed": len(text_chunks),
                "chunks_stored": len(embedded_chunks) if storage_success else 0,
                "source": file_path,
            }

        except Exception as e:
            self.logger.error(f"Error processing document: {str(e)}")
            error_info = self.error_handler.handle_error(e, {"file_path": file_path})
            return {
                "status": "error",
                "error": str(e),
                "error_info": error_info,
                "chunks_processed": 0,
            }

    def process_url(
        self, url: str, max_depth: int = 1, follow_links: bool = True
    ) -> dict:
        """
        Process a URL through the complete pipeline with advanced options.

        Args:
            url: URL to process
            max_depth: Maximum crawling depth
            follow_links: Whether to follow links

        Returns:
            Dictionary with processing results
        """
        try:
            self.logger.info(f"Processing URL: {url}")

            # Check if components are available
            if not all(
                hasattr(self, attr)
                for attr in [
                    "url_processor",
                    "text_extractor",
                    "embedding_generator",
                    "vector_db",
                ]
            ):
                return {
                    "status": "error",
                    "error": "Required components not available",
                    "chunks_processed": 0,
                }

            # Step 1: Configure URL processor with advanced options
            # Update URL processor configuration dynamically
            self.url_processor.max_depth = max_depth
            self.url_processor.follow_links = follow_links

            # Reset processor state for a fresh crawl
            self.url_processor.reset()

            # Extract content from URL
            url_result = self.url_processor.process_url(url)

            if not url_result or "content" not in url_result:
                return {
                    "status": "error",
                    "error": "Failed to extract content from URL",
                    "chunks_processed": 0,
                }

            # Step 2: Extract and chunk text
            text_chunks = self.text_extractor.process_text(
                url_result["content"], url_result.get("metadata", {})
            )

            if not text_chunks:
                return {
                    "status": "error",
                    "error": "No text chunks generated",
                    "chunks_processed": 0,
                }

            # Step 3: Generate embeddings
            embedded_chunks = self.embedding_generator.generate_embeddings(text_chunks)

            if not embedded_chunks:
                return {
                    "status": "error",
                    "error": "Failed to generate embeddings",
                    "chunks_processed": len(text_chunks),
                }

            # Step 4: Store in vector database
            storage_success = self.vector_db.store_embeddings(embedded_chunks)

            # Process linked documents if any
            linked_processed = 0
            for linked_doc in url_result.get("linked_documents", []):
                if linked_doc and "content" in linked_doc:
                    try:
                        linked_chunks = self.text_extractor.process_text(
                            linked_doc["content"], linked_doc.get("metadata", {})
                        )
                        if linked_chunks:
                            linked_embedded = (
                                self.embedding_generator.generate_embeddings(
                                    linked_chunks
                                )
                            )
                            if linked_embedded and self.vector_db.store_embeddings(
                                linked_embedded
                            ):
                                linked_processed += 1
                    except Exception as e:
                        self.logger.warning(f"Failed to process linked document: {e}")

            return {
                "status": "success" if storage_success else "partial_success",
                "chunks_processed": len(text_chunks),
                "chunks_stored": len(embedded_chunks) if storage_success else 0,
                "linked_documents_processed": linked_processed,
                "source": url,
            }

        except Exception as e:
            self.logger.error(f"Error processing URL: {str(e)}")
            error_info = self.error_handler.handle_error(e, {"url": url})
            return {
                "status": "error",
                "error": str(e),
                "error_info": error_info,
                "chunks_processed": 0,
            }

    def query(
        self,
        question: str,
        max_results: int = 5,
        use_live_search: bool = False,
        search_mode: str = "auto",
    ) -> dict:
        """
        Process a query and generate a response with enhanced search control.

        Args:
            question: User question
            max_results: Maximum number of results to retrieve
            use_live_search: Whether to enable live web search (uses hybrid approach)
            search_mode: Search mode - "auto", "local_only", "live_only", "hybrid"

        Returns:
            Dictionary with response and metadata
        """
        try:
            self.logger.info(
                f"Processing query: {question[:100]}... (live_search: {use_live_search})"
            )

            # Check if components are available
            if not all(
                hasattr(self, attr)
                for attr in ["query_processor", "response_generator"]
            ):
                return {
                    "query": question,
                    "response": "Query processing components not available. Please check system configuration.",
                    "sources": [],
                    "confidence": 0.0,
                    "error": "Components not available",
                }

            # Use Query Router for intelligent routing if available
            if hasattr(self, "query_router") and (
                use_live_search or search_mode != "auto"
            ):
                self.logger.info(f"Using Query Router with mode: {search_mode}")

                search_options = {"search_depth": "basic", "time_range": "month"}

                router_result = self.query_router.route_query(
                    question,
                    use_live_search=use_live_search,
                    max_results=max_results,
                    search_options=search_options,
                    search_mode=search_mode,
                )

                # Convert router result to standard format
                if router_result.get("results"):
                    # Format sources from router results
                    sources = []
                    for result in router_result["results"]:
                        sources.append(
                            {
                                "title": result.get("title", ""),
                                "source": result.get("source", ""),
                                "content": result.get("content", ""),
                                "score": result.get("score", 0.0),
                                "type": result.get("type", "unknown"),
                            }
                        )

                    # Generate response using response generator
                    context_items = []
                    for result in router_result["results"]:
                        context_items.append(
                            {
                                "text": result.get("content", ""),
                                "source": result.get("source", ""),
                                "score": result.get("score", 0.0),
                                "metadata": result.get("metadata", {}),
                            }
                        )

                    response_result = self.response_generator.generate_response(
                        question, context_items
                    )

                    return {
                        "query": question,
                        "response": response_result.get(
                            "response", "No response generated"
                        ),
                        "sources": sources,
                        "confidence": response_result.get("confidence", 0.0),
                        "context_items": len(context_items),
                        "processing_time": router_result.get("processing_time", 0),
                        "generation_time": response_result.get("generation_time", 0),
                        "model_used": response_result.get("model_used", "unknown"),
                        "routing_decision": router_result.get(
                            "routing_decision", "unknown"
                        ),
                        "search_type": "routed_search",
                    }
                else:
                    # Fall through to local search if the router returns nothing
                    self.logger.warning(
                        "Router returned no results, falling back to local search"
                    )

            # Traditional local search path
            # Step 1: Process query and retrieve context with max_results
            # Update query processor config temporarily
            original_top_k = self.query_processor.top_k
            self.query_processor.top_k = max_results

            query_result = self.query_processor.process_query(question)

            # Restore original top_k
            self.query_processor.top_k = original_top_k

            if query_result.get("error"):
                return {
                    "query": question,
                    "response": f"Query processing failed: {query_result['error']}",
                    "sources": [],
                    "confidence": 0.0,
                    "error": query_result["error"],
                }

            # Step 2: Generate response
            response_result = self.response_generator.generate_response(
                question, query_result.get("context", [])
            )

            # Combine results
            return {
                "query": question,
                "response": response_result.get("response", "No response generated"),
                "sources": response_result.get("sources", []),
                "confidence": response_result.get("confidence", 0.0),
                "context_items": query_result.get("total_results", 0),
                "processing_time": query_result.get("processing_time", 0),
                "generation_time": response_result.get("generation_time", 0),
                "model_used": response_result.get("model_used", "unknown"),
                "search_type": "local_search",
            }

        except Exception as e:
            self.logger.error(f"Error processing query: {str(e)}")
            error_info = self.error_handler.handle_error(e, {"query": question})
            return {
                "query": question,
                "response": "I encountered an error while processing your question. Please try again.",
                "sources": [],
                "confidence": 0.0,
                "error": str(e),
                "error_info": error_info,
            }

    def get_system_status(self) -> dict:
        """
        Get comprehensive system status.

        Returns:
            Dictionary with system status information
        """
        try:
            status = {
                "overall_status": "healthy",
                "components": {},
                "configuration": {},
                "environment": {},
            }

            # Check component status
            components = [
                "document_processor",
                "url_processor",
                "text_extractor",
                "embedding_generator",
                "vector_db",
                "query_processor",
                "response_generator",
            ]

            for component in components:
                status["components"][component] = hasattr(self, component)

            # Configuration info
            status["configuration"] = {
                "embedding_model": self.config.get("embedding", {}).get(
                    "model", "unknown"
                ),
                "vector_db_provider": self.config.get("vector_db", {}).get(
                    "provider", "unknown"
                ),
                "rag_top_k": self.config.get("rag", {}).get("top_k", 5),
            }

            # Environment info
            status["environment"] = {
                "gemini_api_available": bool(os.getenv("GEMINI_API_KEY")),
                "pinecone_api_available": bool(os.getenv("PINECONE_API_KEY")),
                "openai_api_available": bool(os.getenv("OPENAI_API_KEY")),
            }

            # Overall status: check the stricter threshold first so that
            # "unhealthy" is reachable (checking < 0.8 first made it dead code).
            healthy_components = sum(status["components"].values())
            total_components = len(status["components"])

            if healthy_components < total_components * 0.5:
                status["overall_status"] = "unhealthy"
            elif healthy_components < total_components * 0.8:
                status["overall_status"] = "degraded"

            return status

        except Exception as e:
            self.logger.error(f"Error getting system status: {e}")
            return {"overall_status": "error", "error": str(e)}


def create_app():
    """
    Create and configure the RAG application.

    Returns:
        Tuple of (RAG system instance, Gradio app instance)
    """
    try:
        # Initialize the RAG system
        rag_system = RAGSystem()

        # Create Gradio interface
        ui_config = rag_system.config_manager.get_section("ui")
        gradio_app = GradioApp(rag_system, ui_config)

        return rag_system, gradio_app

    except Exception as e:
        print(f"Failed to create application: {str(e)}")
        # Create a minimal system for demo purposes
        print("Creating minimal demo system...")

        # Create minimal config
        minimal_config = {
            "ui": {
                "title": "AI Embedded Knowledge Agent (Demo Mode)",
                "description": "Demo mode - some features may be limited. Please configure API keys for full functionality.",
            }
        }

        # Create minimal RAG system
        class MinimalRAGSystem:
            def __init__(self):
                self.config_manager = type(
                    "ConfigManager",
                    (),
                    {
                        "get_section": lambda self, section: minimal_config.get(
                            section, {}
                        )
                    },
                )()

            def process_document(self, file_path):
                return {
                    "status": "error",
                    "error": "Demo mode - document processing not available",
                }

            def process_url(self, url):
                return {
                    "status": "error",
                    "error": "Demo mode - URL processing not available",
                }

            def query(self, question):
                return {
                    "query": question,
                    "response": "Demo mode: Please configure your API keys (GEMINI_API_KEY, PINECONE_API_KEY) to enable full functionality.",
                    "sources": [],
                    "confidence": 0.0,
                }

        rag_system = MinimalRAGSystem()
        gradio_app = GradioApp(rag_system, minimal_config.get("ui", {}))

        return rag_system, gradio_app


def main():
    """Main function to run the application."""
    try:
        print("Starting AI Embedded Knowledge Agent...")
        print("=" * 50)

        # Create the application
        rag_system, gradio_app = create_app()

        # Get launch configuration
        try:
            ui_config = rag_system.config_manager.get_section("ui")
        except Exception:
            ui_config = {}

        # Launch the Gradio interface
        base_port = ui_config.get("port", 7860)
        launch_config = {
            "server_name": ui_config.get("server_name", "0.0.0.0"),
            "server_port": base_port,
            "share": ui_config.get("share", False),
            "show_error": True,
            "quiet": False,
        }

        # Try different ports if the default is in use
        for port_offset in range(10):  # Try ports 7860-7869
            try:
                current_port = base_port + port_offset
                launch_config["server_port"] = current_port

                print(
                    f"Launching interface on {launch_config['server_name']}:{current_port}"
                )
                print("=" * 50)

                gradio_app.launch(**launch_config)
                break  # If successful, break out of the loop

            except Exception as e:
                if (
                    "bind" in str(e).lower()
                    or "address already in use" in str(e).lower()
                ):
                    print(f"Port {current_port} is in use, trying next port...")
                    continue
                else:
                    # If it's a different error, re-raise it
                    raise
        else:
            # If we've tried all ports without success
            print(
                "Could not find an available port. Please close other applications using ports 7860-7869."
            )
            raise Exception("No available ports found")

    except KeyboardInterrupt:
        print("\n👋 Shutting down gracefully...")
    except Exception as e:
        print(f"Failed to start application: {str(e)}")
        print("Please check your configuration and API keys.")
        sys.exit(1)


if __name__ == "__main__":
    main()
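For orientation, here is a hedged usage sketch of the `RAGSystem` API defined in app.py above. It assumes the required API keys are set; the document path is illustrative only:

```python
# Usage sketch for RAGSystem as defined above. Assumes GEMINI_API_KEY and
# PINECONE_API_KEY are exported; "docs/example.pdf" is a hypothetical path.
from app import RAGSystem

rag = RAGSystem()
print(rag.get_system_status()["overall_status"])

ingest = rag.process_document("docs/example.pdf")
print(ingest["status"], ingest["chunks_processed"])

answer = rag.query("What is this document about?", max_results=5)
print(answer["response"])
```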
config/config.yaml
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
api_keys:
|
2 |
+
gemini_api_key: ""
|
3 |
+
openai_api_key: ""
|
4 |
+
pinecone_api_key: ""
|
5 |
+
backup:
|
6 |
+
enabled: false
|
7 |
+
include_configuration: true
|
8 |
+
include_documents: true
|
9 |
+
include_logs: false
|
10 |
+
include_vector_db: true
|
11 |
+
interval_hours: 24
|
12 |
+
retention_days: 30
|
13 |
+
storage_path: backups/
|
14 |
+
customization:
|
15 |
+
custom_css: ""
|
16 |
+
default_query_examples:
|
17 |
+
- What is the main topic of the uploaded documents?
|
18 |
+
- Can you summarize the key points?
|
19 |
+
- What are the important findings mentioned?
|
20 |
+
favicon_url: ""
|
21 |
+
footer_text: ""
|
22 |
+
help_text: ""
|
23 |
+
logo_url: ""
|
24 |
+
welcome_message: ""
|
25 |
+
deployment:
|
26 |
+
auto_scale: true
|
27 |
+
development:
|
28 |
+
debug_mode: true
|
29 |
+
enable_profiling: true
|
30 |
+
log_level: DEBUG
|
31 |
+
enable_metrics: true
|
32 |
+
graceful_shutdown_timeout: 30
|
33 |
+
health_check_interval: 60
|
34 |
+
health_endpoint: /health
|
35 |
+
max_cpu_percent: 80
|
36 |
+
max_disk_usage_mb: 5120
|
37 |
+
max_memory_mb: 2048
|
38 |
+
metrics_endpoint: /metrics
|
39 |
+
platform: huggingface
|
40 |
+
production:
|
41 |
+
debug_mode: false
|
42 |
+
enable_profiling: false
|
43 |
+
log_level: WARNING
|
44 |
+
staging:
|
45 |
+
debug_mode: true
|
46 |
+
enable_profiling: true
|
47 |
+
log_level: INFO
|
48 |
+
development:
|
49 |
+
debug_mode: false
|
50 |
+
enable_test_endpoints: false
|
51 |
+
mock_apis: false
|
52 |
+
profiling_enabled: false
|
53 |
+
save_intermediate_results: false
|
54 |
+
test_data_path: data/test_data
|
55 |
+
test_mode: false
|
56 |
+
document_processing:
|
57 |
+
chunk_overlap: 200
|
58 |
+
chunk_size: 1000
|
59 |
+
detect_language: true
|
60 |
+
extract_images: false
|
61 |
+
extract_metadata: true
|
62 |
+
max_file_size_mb: 50
|
63 |
+
min_chunk_size: 100
|
64 |
+
preserve_formatting: true
|
65 |
+
supported_formats:
|
66 |
+
- .pdf
|
67 |
+
- .docx
|
68 |
+
- .doc
|
69 |
+
- .csv
|
70 |
+
- .xlsx
|
71 |
+
- .xls
|
72 |
+
- .pptx
|
73 |
+
- .txt
|
74 |
+
- .md
|
75 |
+
supported_languages:
|
76 |
+
- en
|
77 |
+
- es
|
78 |
+
- fr
|
79 |
+
- de
|
80 |
+
- it
|
81 |
+
- pt
|
82 |
+
- ru
|
83 |
+
- zh
|
84 |
+
- ja
|
85 |
+
- ko
|
86 |
+
embedding:
|
87 |
+
batch_size: 1
|
88 |
+
cache_embeddings: true
|
89 |
+
fallback_model: sentence-transformers
|
90 |
+
max_retries: 3
|
91 |
+
max_tokens: 8192
|
92 |
+
model: gemini-embedding-exp-03-07
|
93 |
+
output_dimensionality: 3072
|
94 |
+
rate_limit_delay: 1.0
|
95 |
+
retry_delay: 2
|
96 |
+
task_type: RETRIEVAL_DOCUMENT
|
97 |
+
title: ""
|
98 |
+
features:
|
99 |
+
async_processing: false
|
100 |
+
audio_processing: false
|
101 |
  auto_summarization: false
  batch_processing: true
  content_recommendation: false
  document_upload: true
  image_processing: false
  live_search: true
  multi_language_support: false
  query_processing: true
  question_generation: false
  real_time_updates: false
  url_processing: true
  video_processing: false
integrations:
  aws_s3:
    access_key: ""
    bucket_name: ""
    enabled: false
    secret_key: ""
  google_analytics:
    enabled: false
    tracking_id: ""
  huggingface:
    api_key: ""
    enabled: false
    models: []
  postgresql:
    connection_string: ""
    enabled: false
  sentry:
    dsn: ""
    enabled: false
logging:
  backup_count: 5
  component_levels:
    document_processing: INFO
    embedding: INFO
    rag: INFO
    ui: INFO
    url_processing: INFO
    vector_db: INFO
  file: logs/rag_ai.log
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  level: INFO
  max_file_size_mb: 10
notifications:
  email:
    enabled: false
    from_address: ""
    password: ""
    smtp_port: 587
    smtp_server: ""
    to_addresses: []
    username: ""
  enabled: false
  webhook:
    enabled: false
    events:
      - error
      - system_health
      - processing_complete
    url: ""
performance:
  batch_processing_size: 10
  cache_ttl: 3600
  enable_caching: true
  enable_parallel_processing: true
  garbage_collection_interval: 300
  max_concurrent_requests: 5
  max_memory_usage_mb: 1024
  max_worker_threads: 4
  request_timeout: 30
rag:
  confidence_threshold: 0.3
  context_window_overlap: 0.1
  deduplicate_results: true
  enable_query_caching: true
  enable_query_expansion: true
  fallback_model: gpt-3.5-turbo
  include_sources: true
  max_context_length: 8000
  max_response_length: 2000
  max_tokens: 500
  model: gemini-2.5-flash-preview-05-20
  query_cache_ttl: 7200
  rerank_results: true
  similarity_threshold: 0.4
  temperature: 0.7
  top_k: 10
  top_p: 0.9
  live_search:
    enabled: true
    enable_caching: true
    include_raw_content: true
    max_results: 10
    search_depth: basic
    time_range: month
  query_router:
    confidence_threshold: 0.5
    enable_hybrid_search: true
    live_weight: 0.4
    local_weight: 0.6
    max_hybrid_results: 10
security:
  allowed_domains: []
  blocked_content_types:
    - executable
    - script
  blocked_domains:
    - localhost
    - 127.0.0.1
    - 0.0.0.0
  enable_content_filtering: true
  enable_rate_limiting: true
  max_text_length: 1000000
  max_upload_size_mb: 100
  requests_per_hour: 1000
  requests_per_minute: 60
  sanitize_input: true
ui:
  demo_mode: false
  description:
    Upload documents or provide URLs to build your knowledge base, then
    ask questions!
  features:
    analytics_dashboard: true
    confidence_display: true
    file_upload: true
    knowledge_base_management: true
    query_interface: true
    source_display: true
    system_health_monitoring: true
    url_input: true
  max_file_uploads: 10
  max_query_length: 1000
  port: 7860
  sample_documents: []
  server_name: 0.0.0.0
  share: false
  show_advanced_options: true
  theme: default
  title: "🧠 AI Embedded Knowledge Agent"
url_processing:
  allowed_domains: []
  blocked_domains:
    - localhost
    - 127.0.0.1
    - 0.0.0.0
  delay_between_requests: 0.5
  extract_main_content: true
  follow_links: true
  max_depth: 1
  max_pages: 10
  remove_ads: true
  remove_navigation: true
  requests_per_second: 2
  respect_robots_txt: true
  timeout: 10
  user_agent: RAG-AI-Bot/1.0
vector_db:
  batch_size: 100
  create_index_if_not_exists: true
  dimension: 3072
  environment: us-east-1
  fallback_provider: memory
  index_name: rag-ai-index
  max_retries: 3
  metric: cosine
  provider: pinecone
  retry_delay: 1
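Since this is plain YAML, any component can read it with a standard parser. A minimal loading sketch (the path and the exact nesting of keys follow the layout shown above; adjust if your copy differs):

```python
import yaml

# Load the application configuration (path assumed from the repo layout)
with open("config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Read nested settings with defaults, mirroring the sections above
top_k = config.get("rag", {}).get("top_k", 10)
index_name = config.get("vector_db", {}).get("index_name", "rag-ai-index")
print(f"Retrieving top {top_k} chunks from index '{index_name}'")
```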
docs/architecture.md
ADDED
@@ -0,0 +1,291 @@
# AI Embedded Knowledge Agent - Architecture Document

## 1. System Overview

The AI Embedded Knowledge Agent is a versatile knowledge management system designed to ingest, process, and retrieve information from various document types and web sources. Built for a hackathon and deployable on Hugging Face, this system enables users to upload documents or provide URLs, which are then processed, embedded, and stored for intelligent retrieval.

```mermaid
graph TD
    A[User Interface - Gradio] --> B[Document Processor]
    A --> C[URL Processor]
    B --> D[Text Extractor]
    C --> D
    D --> E[Embedding Generator - Gemini]
    E --> F[Vector Database - Pinecone]
    A --> G[Query Processor]
    G --> E
    G --> F
    G --> H[Response Generator - LangChain RAG]
    H --> A
```

## 2. Core Components

### 2.1 Document Ingestion System

This component handles the intake of various document formats and web content.

#### Document Processor

- **Responsibility**: Process uploaded documents (PDF, DOCX, CSV, PPTX, Excel)
- **Technologies**: PyMuPDF, python-docx, pandas, python-pptx, pdfplumber
- **Input**: Raw document files
- **Output**: Extracted text content

#### URL Processor

- **Responsibility**: Crawl and extract content from provided URLs, including nested documents and links
- **Technologies**: BeautifulSoup, requests, trafilatura
- **Input**: URLs
- **Output**: Extracted text content from web pages and linked documents

### 2.2 Knowledge Processing System

This component transforms raw text into queryable knowledge.

#### Text Extractor

- **Responsibility**: Clean, normalize, and chunk text from various sources
- **Technologies**: NLTK, spaCy, regex
- **Input**: Raw text from documents and web pages
- **Output**: Cleaned, normalized text chunks ready for embedding (see the chunking sketch below)
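To make the chunking step concrete, here is a minimal sliding-window chunker; the function name, chunk size, and overlap are illustrative assumptions, not the Text Extractor's actual implementation:

```python
from typing import List

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split cleaned text into overlapping chunks (illustrative sketch only)."""
    chunks: List[str] = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            break
        start = end - overlap  # overlap preserves context across chunk boundaries
    return chunks
```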
#### Embedding Generator

- **Responsibility**: Generate vector embeddings for text chunks
- **Technology**: Gemini Embedding v3 (gemini-embedding-exp-03-07)
- **Input**: Processed text chunks
- **Output**: Vector embeddings

### 2.3 Knowledge Storage System

This component manages the storage and retrieval of vector embeddings.

#### Vector Database

- **Responsibility**: Store and index vector embeddings for efficient retrieval
- **Technology**: Pinecone
- **Input**: Vector embeddings with metadata
- **Output**: Retrieved relevant vectors based on similarity (see the sketch below)
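As a rough sketch of this interface, the snippet below upserts and queries vectors with the Pinecone client; the index name and 3072-dimension vectors mirror the `vector_db` config, while the key handling and IDs are placeholders:

```python
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_PINECONE_API_KEY")  # placeholder; the real key comes from config/env
index = pc.Index("rag-ai-index")

# Store one embedding with its metadata
index.upsert(vectors=[("chunk-001", [0.1] * 3072, {"source": "doc.pdf", "page": 1})])

# Retrieve the most similar chunks for a query vector
matches = index.query(vector=[0.1] * 3072, top_k=10, include_metadata=True)
```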
### 2.4 Query Processing System

This component handles user queries and generates responses.

#### Query Processor

- **Responsibility**: Process user queries and convert them to vector embeddings
- **Technologies**: Gemini Embedding v3, LangChain
- **Input**: User queries
- **Output**: Query vector embeddings

#### Response Generator

- **Responsibility**: Generate coherent responses based on retrieved knowledge
- **Technology**: LangChain RAG (Retrieval Augmented Generation)
- **Input**: Retrieved relevant text chunks
- **Output**: Natural language responses

### 2.5 User Interface System

This component provides the user-facing interface.

#### Gradio UI

- **Responsibility**: Provide an intuitive interface for document upload, URL input, and querying
- **Technology**: Gradio
- **Features**:
  - Document upload area
  - URL input field
  - Query input and response display
  - System status indicators

## 3. Data Flow

```mermaid
sequenceDiagram
    participant User
    participant UI as Gradio UI
    participant DP as Document Processor
    participant UP as URL Processor
    participant TE as Text Extractor
    participant EG as Embedding Generator
    participant VDB as Vector Database
    participant QP as Query Processor
    participant RG as Response Generator

    %% Document Upload Flow
    User->>UI: Upload Document
    UI->>DP: Process Document
    DP->>TE: Extract Text
    TE->>EG: Generate Embeddings
    EG->>VDB: Store Embeddings

    %% URL Processing Flow
    User->>UI: Input URL
    UI->>UP: Process URL
    UP->>TE: Extract Text
    TE->>EG: Generate Embeddings
    EG->>VDB: Store Embeddings

    %% Query Flow
    User->>UI: Submit Query
    UI->>QP: Process Query
    QP->>EG: Generate Query Embedding
    QP->>VDB: Retrieve Relevant Embeddings
    VDB->>QP: Return Relevant Chunks
    QP->>RG: Generate Response
    RG->>UI: Display Response
    UI->>User: Show Answer
```
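In code, the query flow above reduces to three calls. The sketch below assumes interfaces for the vector database and response generator; only `generate_query_embedding` is confirmed by the embedding module later in this upload:

```python
def answer_query(query, embedding_generator, vector_db, response_generator, top_k=10):
    """Sketch of the query path; vector_db.query and response_generator.generate
    are assumed method names, not verified signatures."""
    query_vector = embedding_generator.generate_query_embedding(query)  # query -> vector
    chunks = vector_db.query(query_vector, top_k=top_k)                 # vector -> relevant chunks
    return response_generator.generate(query, chunks)                   # chunks -> answer
```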
## 4. Technical Architecture

### 4.1 Technology Stack

| Component          | Technology                                            | Purpose                                    |
| ------------------ | ----------------------------------------------------- | ------------------------------------------ |
| Document Parsing   | PyMuPDF, python-docx, pandas, python-pptx, pdfplumber | Extract text from various document formats |
| Web Scraping       | BeautifulSoup, requests, trafilatura                  | Extract content from web pages             |
| Text Processing    | NLTK, spaCy, regex                                    | Clean and chunk text                       |
| Embedding          | Gemini Embedding v3 (gemini-embedding-exp-03-07)      | Generate vector embeddings                 |
| Vector Storage     | Pinecone                                              | Store and retrieve vector embeddings       |
| RAG Implementation | LangChain                                             | Implement retrieval augmented generation   |
| User Interface     | Gradio                                                | Provide user-friendly interface            |

### 4.2 Integration Points

- **Document Processing → Text Extraction**: Raw text extraction from documents
- **URL Processing → Text Extraction**: Raw text extraction from web pages
- **Text Extraction → Embedding Generation**: Processed text chunks for embedding
- **Embedding Generation → Vector Database**: Storage of embeddings
- **Query Processing → Embedding Generation**: Query embedding generation
- **Query Processing → Vector Database**: Retrieval of relevant embeddings
- **Query Processing → Response Generation**: Generation of coherent responses
- **Response Generation → UI**: Display of responses to the user

## 5. Deployment Architecture

The system is designed to be deployed on Hugging Face using their Spaces feature, which supports Gradio applications.

```mermaid
graph TD
    A[User] --> B[Hugging Face Space]
    B --> C[Gradio Application]
    C --> D[Document Processing]
    C --> E[URL Processing]
    C --> F[Query Processing]
    D --> G[Gemini API]
    E --> G
    F --> G
    D --> H[Pinecone API]
    E --> H
    F --> H
```

### 5.1 Deployment Considerations

- **API Keys**: Secure storage of Gemini and Pinecone API keys (see the sketch below)
- **Rate Limiting**: Handling API rate limits for both Gemini and Pinecone
- **Memory Management**: Efficient memory usage within Hugging Face constraints
- **Statelessness**: Designing components to be stateless where possible
- **Error Handling**: Robust error handling for API failures and timeouts
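On Hugging Face Spaces, secrets are exposed to the app as environment variables, so key handling can stay minimal. A sketch (`GEMINI_API_KEY` matches the embedding generator in this upload; `PINECONE_API_KEY` is an assumed secret name):

```python
import os

gemini_key = os.environ.get("GEMINI_API_KEY")      # name confirmed in embedding_generator.py
pinecone_key = os.environ.get("PINECONE_API_KEY")  # assumed secret name for Pinecone

if not gemini_key or not pinecone_key:
    raise RuntimeError("Missing API keys; configure them as Space secrets.")
```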
## 6. Scalability and Performance

For the hackathon version, the focus is on functionality rather than scalability. However, the architecture is designed with the following considerations:

- **Document Size Limits**: Implement reasonable limits on document sizes
- **Chunking Strategy**: Optimize text chunking for better retrieval performance
- **Caching**: Implement basic caching for frequently accessed embeddings
- **Asynchronous Processing**: Use asynchronous processing where appropriate

## 7. Future Enhancements

While not implemented in the hackathon version, the architecture supports future enhancements:

- **Authentication**: User authentication and document access control
- **Document Versioning**: Track changes to documents over time
- **Advanced RAG Techniques**: Implement more sophisticated RAG approaches
- **Multi-Modal Support**: Add support for images and other non-text content
- **Collaborative Features**: Allow multiple users to collaborate on knowledge bases
- **Custom Training**: Fine-tune models for specific domains

## 8. Folder Structure

```
rag-ai/
├── src/
│   ├── ingestion/
│   │   ├── document_processor.py
│   │   ├── url_processor.py
│   │   └── text_extractor.py
│   ├── embedding/
│   │   └── embedding_generator.py
│   ├── storage/
│   │   └── vector_db.py
│   ├── rag/
│   │   ├── query_processor.py
│   │   └── response_generator.py
│   ├── ui/
│   │   └── gradio_app.py
│   └── utils/
│       ├── config_manager.py
│       └── error_handler.py
├── config/
│   └── config.yaml
├── docs/
│   ├── architecture.md
│   └── api_documentation.md
├── tests/
│   ├── test_document_processor.py
│   ├── test_url_processor.py
│   └── ...
├── scripts/
│   ├── setup.py
│   └── deploy_to_huggingface.py
├── data/
│   ├── sample_documents/
│   └── test_data/
├── .gitignore
├── requirements.txt
├── README.md
└── app.py
```

## 9. Implementation Roadmap

1. **Phase 1: Core Infrastructure**

   - Set up project structure
   - Implement basic document processing
   - Set up Pinecone integration

2. **Phase 2: Knowledge Processing**

   - Implement text extraction and chunking
   - Integrate Gemini embedding API
   - Develop vector storage and retrieval

3. **Phase 3: Query System**

   - Implement query processing
   - Develop RAG response generation
   - Integrate components

4. **Phase 4: User Interface**

   - Develop Gradio UI
   - Integrate UI with backend components
   - Add error handling and user feedback

5. **Phase 5: URL Processing**

   - Implement URL crawling
   - Add nested document extraction
   - Integrate with existing components

6. **Phase 6: Testing and Deployment**
   - Comprehensive testing
   - Optimization for Hugging Face deployment
   - Documentation and demo preparation
requirements.txt
ADDED
@@ -0,0 +1,103 @@
# Core Dependencies
gradio
pyyaml
python-dotenv
cachetools  # TTLCache used by the embedding generator

# Document Processing
PyPDF2
PyMuPDF
pdfplumber
python-docx
pandas
openpyxl
python-pptx

# Web Scraping and URL Processing
requests
beautifulsoup4
lxml
html2text
trafilatura

# Text Processing
nltk
spacy
textstat
langdetect

# Embedding and Vector Database
google-generativeai
pinecone
sentence-transformers

# LangChain and LLM Integration
langchain
langchain-google-genai
langchain-openai
langchain-community
openai

# Live Search Integration
tavily-python

# Vector Operations and ML
numpy
scikit-learn
faiss-cpu

# Async and Performance
aiohttp
# asyncio is part of the Python 3 standard library; the PyPI shim should not be installed

# Logging and Monitoring
structlog
prometheus-client

# Development and Testing
pytest
pytest-asyncio
black
flake8
mypy

# Optional Dependencies for Enhanced Features
# Uncomment if needed:

# Advanced NLP
# transformers
# torch

# Image Processing (if document images need processing)
# Pillow
# pytesseract

# Audio Processing (for future features)
# librosa
# soundfile

# Database Support
# psycopg2-binary
# sqlalchemy

# Cloud Storage
# boto3
# google-cloud-storage

# Monitoring and Analytics
# sentry-sdk

# Additional Text Processing
# langdetect
# polyglot

# Web Framework (if API endpoints needed)
# fastapi
# uvicorn

# Caching
# redis
# diskcache

# Configuration Management
# hydra-core
# omegaconf
src/embedding/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
Embedding module for generating vector embeddings.

This module contains components for generating vector embeddings
from text chunks using Gemini Embedding v3.
"""
src/embedding/embedding_generator.py
ADDED
@@ -0,0 +1,462 @@
"""
Embedding Generator Module

This module is responsible for generating vector embeddings for text chunks
using Gemini Embedding v3 with complete API integration.

Technology: Gemini Embedding v3 (gemini-embedding-exp-03-07)
"""

import logging
import os
import time
import hashlib
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Union
import json

# Import Gemini API and caching libraries
try:
    import google.generativeai as genai
    from cachetools import TTLCache
except ImportError as e:
    logging.warning(f"Some embedding libraries are not installed: {e}")

from utils.error_handler import EmbeddingError, error_handler, ErrorType


class EmbeddingGenerator:
    """
    Generates vector embeddings for text chunks using Gemini Embedding v3 with full functionality.

    Features:
    - Gemini Embedding v3 API integration
    - Batch processing with rate limiting
    - Intelligent retry logic with exponential backoff
    - Embedding caching mechanism
    - Cost optimization
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the EmbeddingGenerator with configuration.

        Args:
            config: Configuration dictionary with API parameters
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # API Configuration
        self.api_key = self.config.get("api_key", os.environ.get("GEMINI_API_KEY"))
        self.model = self.config.get("model", "gemini-embedding-exp-03-07")
        self.batch_size = self.config.get("batch_size", 5)
        self.max_retries = self.config.get("max_retries", 3)
        self.retry_delay = self.config.get("retry_delay", 1)

        # Performance settings
        self.rate_limit_delay = self.config.get("rate_limit_delay", 0.1)
        self.max_text_length = self.config.get(
            "max_text_length", 8192
        )  # ✨ 8K token limit for latest model
        self.enable_caching = self.config.get("enable_caching", True)
        self.cache_ttl = self.config.get("cache_ttl", 3600)  # 1 hour

        # Statistics tracking
        self.stats = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "cache_hits": 0,
            "total_tokens_processed": 0,
            "start_time": datetime.now(),
        }

        # Initialize cache
        if self.enable_caching:
            self.cache = TTLCache(maxsize=1000, ttl=self.cache_ttl)
        else:
            self.cache = None

        # Validate and initialize API client
        self._initialize_client()

    def _initialize_client(self):
        """Initialize Gemini API client with validation."""
        if not self.api_key:
            self.logger.warning(
                "No Gemini API key provided. Embeddings will not be generated."
            )
            self.client = None
            return

        try:
            # Configure Gemini API
            genai.configure(api_key=self.api_key)

            # Test API connection
            self._test_api_connection()

            self.client = genai
            self.logger.info("Gemini API client initialized successfully")

        except Exception as e:
            self.logger.error(f"Failed to initialize Gemini API client: {str(e)}")
            self.client = None

    def _test_api_connection(self):
        """Test API connection with a simple request."""
        try:
            # Test with a simple embedding request
            test_result = genai.embed_content(
                model=self.model,
                content="test connection",
                task_type="retrieval_document",
            )

            if not test_result.get("embedding"):
                raise Exception("No embedding returned from test request")

            self.logger.info("API connection test successful")

        except Exception as e:
            raise EmbeddingError(f"API connection test failed: {str(e)}")

    @error_handler(ErrorType.EMBEDDING_GENERATION)
    def generate_embeddings(self, texts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Generate embeddings for a list of text chunks with full functionality.

        Args:
            texts: List of dictionaries containing text chunks and metadata
                Each dict should have 'content' and 'metadata' keys

        Returns:
            List of dictionaries with original content, metadata, and embeddings
        """
        if not self.client or not texts:
            self.logger.warning("No API client or empty text list")
            return texts

        self.logger.info(f"Generating embeddings for {len(texts)} text chunks")
        start_time = time.time()

        # Filter and validate texts
        valid_texts = self._validate_texts(texts)
        if not valid_texts:
            self.logger.warning("No valid texts to process")
            return texts

        # Process in batches to respect API limits
        results = []
        total_batches = (len(valid_texts) + self.batch_size - 1) // self.batch_size

        for i in range(0, len(valid_texts), self.batch_size):
            batch_num = (i // self.batch_size) + 1
            batch = valid_texts[i : i + self.batch_size]

            self.logger.info(
                f"Processing batch {batch_num}/{total_batches} ({len(batch)} items)"
            )

            try:
                batch_results = self._process_batch(batch)
                results.extend(batch_results)

                # Rate limiting between batches
                if i + self.batch_size < len(valid_texts):
                    time.sleep(self.rate_limit_delay)

            except Exception as e:
                self.logger.error(f"Batch {batch_num} failed: {str(e)}")
                # Add original items without embeddings
                for item in batch:
                    item_copy = item.copy()
                    item_copy["embedding"] = []
                    item_copy["embedding_error"] = str(e)
                    results.append(item_copy)

        # Update statistics
        processing_time = time.time() - start_time
        self.logger.info(f"Embedding generation completed in {processing_time:.2f}s")

        return results

    def _validate_texts(self, texts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Validate and filter text inputs.

        Args:
            texts: List of text dictionaries

        Returns:
            List of valid text dictionaries
        """
        valid_texts = []

        for i, item in enumerate(texts):
            if not isinstance(item, dict) or "content" not in item:
                self.logger.warning(f"Invalid item at index {i}: missing 'content' key")
                continue

            content = item["content"]
            if not content or not isinstance(content, str):
                self.logger.warning(
                    f"Invalid content at index {i}: empty or non-string"
                )
                continue

            # Truncate if too long
            if len(content) > self.max_text_length:
                self.logger.warning(
                    f"Truncating text at index {i}: {len(content)} -> {self.max_text_length} chars"
                )
                item = item.copy()
                item["content"] = content[: self.max_text_length]
                item["metadata"] = item.get("metadata", {})
                item["metadata"]["truncated"] = True
                item["metadata"]["original_length"] = len(content)

            valid_texts.append(item)

        return valid_texts

    def _process_batch(self, batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process a batch of text chunks to generate embeddings.

        Args:
            batch: List of dictionaries containing text chunks and metadata

        Returns:
            List of dictionaries with original content, metadata, and embeddings
        """
        # Extract content and check cache
        contents = []
        cache_results = {}

        for i, item in enumerate(batch):
            content = item["content"]

            # Check cache first
            if self.cache is not None:
                cache_key = self._get_cache_key(content)
                if cache_key in self.cache:
                    cache_results[i] = self.cache[cache_key]
                    self.stats["cache_hits"] += 1
                    continue

            contents.append((i, content))

        # Generate embeddings for non-cached content
        embeddings_map = {}
        if contents:
            content_texts = [content for _, content in contents]
            embeddings = self._generate_with_retry(content_texts)

            # Map embeddings back to indices
            for j, (original_index, content) in enumerate(contents):
                if j < len(embeddings):
                    embedding = embeddings[j]
                    embeddings_map[original_index] = embedding

                    # Cache the result
                    if self.cache is not None:
                        cache_key = self._get_cache_key(content)
                        self.cache[cache_key] = embedding

        # 🔗 Combine results
        results = []
        for i, item in enumerate(batch):
            result = item.copy()

            # Add embedding from cache or new generation
            if i in cache_results:
                result["embedding"] = cache_results[i]
                result["embedding_source"] = "cache"
            elif i in embeddings_map:
                result["embedding"] = embeddings_map[i]
                result["embedding_source"] = "api"
            else:
                result["embedding"] = []
                result["embedding_source"] = "failed"
                self.logger.warning(f"Missing embedding for batch item {i}")

            # Add embedding metadata
            if result["embedding"]:
                result["metadata"] = result.get("metadata", {})
                result["metadata"].update(
                    {
                        "embedding_model": self.model,
                        "embedding_dimension": len(result["embedding"]),
                        "embedding_generated_at": datetime.now().isoformat(),
                    }
                )

            results.append(result)

        return results

    def _generate_with_retry(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings with intelligent retry logic.

        Args:
            texts: List of text strings to embed

        Returns:
            List of embedding vectors (each is a list of floats)
        """
        for attempt in range(self.max_retries):
            try:
                self.stats["total_requests"] += 1

                # Generate embeddings using Gemini API
                embeddings = []

                for text in texts:
                    try:
                        # Track tokens
                        self.stats["total_tokens_processed"] += len(text.split())

                        # Call Gemini API
                        result = self.client.embed_content(
                            model=self.model,
                            content=text,
                            task_type="retrieval_document",
                            title="Document chunk for RAG system",
                        )

                        if result and "embedding" in result:
                            embeddings.append(result["embedding"])
                        else:
                            self.logger.warning(
                                f"No embedding in API response for text: {text[:50]}..."
                            )
                            embeddings.append([])

                    except Exception as e:
                        self.logger.warning(
                            f"Failed to embed individual text: {str(e)}"
                        )
                        embeddings.append([])

                self.stats["successful_requests"] += 1
                return embeddings

            except Exception as e:
                self.stats["failed_requests"] += 1
                self.logger.warning(
                    f"Embedding generation failed (attempt {attempt+1}/{self.max_retries}): {str(e)}"
                )

                if attempt < self.max_retries - 1:
                    # Exponential backoff with jitter
                    delay = self.retry_delay * (2**attempt) + (time.time() % 1)
                    self.logger.info(f"Retrying in {delay:.1f} seconds...")
                    time.sleep(delay)

        # All retries failed
        self.logger.error("All embedding generation attempts failed")
        return [[] for _ in texts]

    @error_handler(ErrorType.EMBEDDING_GENERATION)
    def generate_query_embedding(self, query: str) -> List[float]:
        """
        Generate embedding for a single query string.

        Args:
            query: Query text to embed

        Returns:
            Embedding vector as a list of floats
        """
        if not self.client or not query:
            return []

        self.logger.info(f"Generating embedding for query: {query[:50]}...")

        # Check cache first
        if self.cache is not None:
            cache_key = self._get_cache_key(query, "query")
            if cache_key in self.cache:
                self.stats["cache_hits"] += 1
                return self.cache[cache_key]

        # Generate embedding
        embeddings = self._generate_with_retry([query])
        embedding = embeddings[0] if embeddings else []

        # Cache the result
        if embedding and self.cache is not None:
            cache_key = self._get_cache_key(query, "query")
            self.cache[cache_key] = embedding

        return embedding

    def _get_cache_key(self, text: str, prefix: str = "doc") -> str:
        """
        Generate cache key for text.

        Args:
            text: Text content
            prefix: Key prefix

        Returns:
            Cache key string
        """
        # 🔐 Create hash of text + model for unique key
        content_hash = hashlib.md5(f"{self.model}:{text}".encode()).hexdigest()
        return f"{prefix}:{content_hash}"

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get embedding generation statistics.

        Returns:
            Dictionary with statistics
        """
        runtime = datetime.now() - self.stats["start_time"]

        return {
            **self.stats,
            "runtime_seconds": runtime.total_seconds(),
            "cache_hit_rate": (
                self.stats["cache_hits"] / max(1, self.stats["total_requests"]) * 100
            ),
            "success_rate": (
                self.stats["successful_requests"]
                / max(1, self.stats["total_requests"])
                * 100
            ),
            "avg_tokens_per_request": (
                self.stats["total_tokens_processed"]
                / max(1, self.stats["total_requests"])
            ),
            "cache_size": len(self.cache) if self.cache else 0,
            "model": self.model,
            "batch_size": self.batch_size,
        }

    def clear_cache(self):
        """Clear the embedding cache."""
        if self.cache:
            self.cache.clear()
            self.logger.info("Embedding cache cleared")

    def warm_up_cache(self, sample_texts: List[str]):
        """
        🔥 Warm up the cache with sample texts.

        Args:
            sample_texts: List of sample texts to pre-generate embeddings
        """
        if not sample_texts:
            return

        self.logger.info(f"🔥 Warming up cache with {len(sample_texts)} sample texts")

        sample_items = [{"content": text, "metadata": {}} for text in sample_texts]
        self.generate_embeddings(sample_items)

        self.logger.info("Cache warm-up completed")
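A usage sketch for the class above; the config values and sample chunk are placeholders, but every call matches a method defined in this file:

```python
# Requires GEMINI_API_KEY in the environment (see _initialize_client above)
generator = EmbeddingGenerator({"batch_size": 5, "enable_caching": True})

chunks = [{"content": "Pinecone stores vector embeddings.", "metadata": {"source": "doc.pdf"}}]
embedded = generator.generate_embeddings(chunks)
print(embedded[0]["embedding_source"], len(embedded[0]["embedding"]))

query_vector = generator.generate_query_embedding("How are embeddings stored?")
print(f"Cache hit rate: {generator.get_statistics()['cache_hit_rate']:.1f}%")
```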
src/ingestion/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
Ingestion module for processing documents and URLs.

This module contains components for processing various document formats
and extracting content from web URLs.
"""
src/ingestion/document_processor.py
ADDED
@@ -0,0 +1,668 @@
"""
Document Processor Module

This module is responsible for processing various document formats including
PDF, DOCX, CSV, PPTX, and Excel files with complete functionality.

Technologies: PyMuPDF, python-docx, pandas, python-pptx, pdfplumber
"""

import os
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Union
import logging

# Import document processing libraries
try:
    import fitz  # PyMuPDF
    import docx
    import pandas as pd
    import pptx
    import pdfplumber
    from openpyxl import load_workbook
except ImportError as e:
    logging.warning(f"Some document processing libraries are not installed: {e}")

from utils.error_handler import DocumentProcessingError, error_handler, ErrorType


class DocumentProcessor:
    """
    Processes various document formats and extracts text content with full functionality.

    Supported formats:
    - PDF (using PyMuPDF and pdfplumber)
    - DOCX (using python-docx)
    - CSV/Excel (using pandas)
    - PPTX (using python-pptx)
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the DocumentProcessor with configuration.

        Args:
            config: Configuration dictionary with processing parameters
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Configuration settings
        self.max_file_size_mb = self.config.get("max_file_size_mb", 50)
        self.supported_formats = self.config.get(
            "supported_formats",
            [".pdf", ".docx", ".csv", ".xlsx", ".xls", ".pptx", ".txt", ".md"],
        )

    @error_handler(ErrorType.DOCUMENT_PROCESSING)
    def process_document(self, file_path: str) -> Dict[str, Any]:
        """
        Process a document and extract its text content with metadata.

        Args:
            file_path: Path to the document file

        Returns:
            Dictionary containing extracted text and metadata
        """
        if not os.path.exists(file_path):
            raise DocumentProcessingError(f"Document not found: {file_path}", file_path)

        # Validate file size
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        if file_size_mb > self.max_file_size_mb:
            raise DocumentProcessingError(
                f"File too large: {file_size_mb:.1f}MB (max: {self.max_file_size_mb}MB)",
                file_path,
            )

        file_extension = os.path.splitext(file_path)[1].lower()

        # Validate file format
        if file_extension not in self.supported_formats:
            raise DocumentProcessingError(
                f"Unsupported file format: {file_extension}", file_path
            )

        self.logger.info(f"Processing document: {file_path} ({file_size_mb:.1f}MB)")

        try:
            if file_extension == ".pdf":
                return self._process_pdf(file_path)
            elif file_extension == ".docx":
                return self._process_docx(file_path)
            elif file_extension in [".csv", ".xlsx", ".xls"]:
                return self._process_spreadsheet(file_path)
            elif file_extension == ".pptx":
                return self._process_pptx(file_path)
            elif file_extension in [".txt", ".md"]:
                return self._process_text_file(file_path)
        except Exception as e:
            raise DocumentProcessingError(
                f"Error processing document: {str(e)}", file_path
            )

    def process_batch(self, file_paths: List[str]) -> List[Dict[str, Any]]:
        """
        Process multiple documents in batch.

        Args:
            file_paths: List of file paths to process

        Returns:
            List of processed document results
        """
        results = []
        self.logger.info(f"Processing batch of {len(file_paths)} documents")

        for i, file_path in enumerate(file_paths):
            try:
                result = self.process_document(file_path)
                results.append(result)
                self.logger.info(f"Processed {i+1}/{len(file_paths)}: {file_path}")
            except Exception as e:
                self.logger.error(f"❌ Failed to process {file_path}: {str(e)}")
                # Continue with other files
                continue

        return results

    def _extract_metadata(self, file_path: str) -> Dict[str, Any]:
        """
        Extract common metadata from file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary containing file metadata
        """
        file_stat = os.stat(file_path)
        file_path_obj = Path(file_path)

        return {
            "filename": file_path_obj.name,
            "file_extension": file_path_obj.suffix.lower(),
            "file_size_bytes": file_stat.st_size,
            "file_size_mb": round(file_stat.st_size / (1024 * 1024), 2),
            "created_time": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
            "modified_time": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
            "processed_time": datetime.now().isoformat(),
        }

    def _process_pdf(self, file_path: str) -> Dict[str, Any]:
        """
        📄 Extract text from a PDF document using PyMuPDF with fallback to pdfplumber.

        Args:
            file_path: Path to the PDF file

        Returns:
            Dictionary with extracted text and metadata
        """
        self.logger.info(f"Processing PDF: {file_path}")

        text_content = []
        metadata = self._extract_metadata(file_path)

        try:
            # Primary method: PyMuPDF (faster)
            doc = fitz.open(file_path)
            metadata.update(
                {
                    "page_count": doc.page_count,
                    "title": doc.metadata.get("title", ""),
                    "author": doc.metadata.get("author", ""),
                    "subject": doc.metadata.get("subject", ""),
                    "creator": doc.metadata.get("creator", ""),
                }
            )

            for page_num in range(doc.page_count):
                page = doc[page_num]
                text = page.get_text()
                if text.strip():  # Only add non-empty pages
                    text_content.append({"page": page_num + 1, "content": text.strip()})

            doc.close()

        except Exception as e:
            self.logger.warning(f"PyMuPDF failed, trying pdfplumber: {str(e)}")

            # Fallback method: pdfplumber (more robust for complex PDFs)
            try:
                with pdfplumber.open(file_path) as pdf:
                    metadata["page_count"] = len(pdf.pages)

                    for page_num, page in enumerate(pdf.pages):
                        text = page.extract_text()
                        if text and text.strip():
                            text_content.append(
                                {"page": page_num + 1, "content": text.strip()}
                            )

            except Exception as fallback_error:
                raise DocumentProcessingError(
                    f"Both PDF extraction methods failed: {str(fallback_error)}",
                    file_path,
                )

        # Final content processing
        full_text = "\n\n".join([item["content"] for item in text_content])
        metadata["total_characters"] = len(full_text)
        metadata["total_words"] = len(full_text.split())

        return {
            "content": full_text,
            "pages": text_content,
            "metadata": metadata,
            "source": file_path,
            "document_type": "pdf",
        }

    def _process_docx(self, file_path: str) -> Dict[str, Any]:
        """
        Extract text from a DOCX document using python-docx.

        Args:
            file_path: Path to the DOCX file

        Returns:
            Dictionary with extracted text and metadata
        """
        self.logger.info(f"Processing DOCX: {file_path}")

        try:
            doc = docx.Document(file_path)
            metadata = self._extract_metadata(file_path)

            # Extract document properties
            core_props = doc.core_properties
            metadata.update(
                {
                    "title": core_props.title or "",
                    "author": core_props.author or "",
                    "subject": core_props.subject or "",
                    "created": (
                        core_props.created.isoformat() if core_props.created else ""
                    ),
                    "modified": (
                        core_props.modified.isoformat() if core_props.modified else ""
                    ),
                    "paragraph_count": len(doc.paragraphs),
                }
            )

            # Extract text content
            paragraphs = []
            full_text_parts = []

            for i, paragraph in enumerate(doc.paragraphs):
                text = paragraph.text.strip()
                if text:  # Only include non-empty paragraphs
                    paragraphs.append({"paragraph": i + 1, "content": text})
                    full_text_parts.append(text)

            # Extract tables if present
            tables_content = []
            for table_idx, table in enumerate(doc.tables):
                table_data = []
                for row in table.rows:
                    row_data = [cell.text.strip() for cell in row.cells]
                    if any(row_data):  # Only include non-empty rows
                        table_data.append(row_data)

                if table_data:
                    tables_content.append({"table": table_idx + 1, "data": table_data})
                    # Add table content to full text
                    table_text = "\n".join([" | ".join(row) for row in table_data])
                    full_text_parts.append(f"\n[Table {table_idx + 1}]\n{table_text}")

            full_text = "\n\n".join(full_text_parts)
            metadata.update(
                {
                    "total_characters": len(full_text),
                    "total_words": len(full_text.split()),
                    "table_count": len(tables_content),
                }
            )

            return {
                "content": full_text,
                "paragraphs": paragraphs,
                "tables": tables_content,
                "metadata": metadata,
                "source": file_path,
                "document_type": "docx",
            }

        except Exception as e:
            raise DocumentProcessingError(f"Error processing DOCX: {str(e)}", file_path)

    def _process_spreadsheet(self, file_path: str) -> Dict[str, Any]:
        """
        Extract text from a CSV or Excel file using pandas.

        Args:
            file_path: Path to the spreadsheet file

        Returns:
            Dictionary with extracted text and metadata
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        self.logger.info(f"Processing spreadsheet: {file_path}")

        try:
            metadata = self._extract_metadata(file_path)
            sheets_data = []

            if file_extension == ".csv":
                # 📄 Process CSV file
                df = pd.read_csv(file_path, encoding="utf-8")
                sheet_content = self._process_dataframe(df, "Sheet1")
                sheets_data.append(sheet_content)
                metadata["sheet_count"] = 1

            else:
                # Process Excel file
                excel_file = pd.ExcelFile(file_path)
                metadata["sheet_count"] = len(excel_file.sheet_names)

                for sheet_name in excel_file.sheet_names:
                    df = pd.read_excel(file_path, sheet_name=sheet_name)
                    sheet_content = self._process_dataframe(df, sheet_name)
                    sheets_data.append(sheet_content)

            # 🔗 Combine all sheets content
            full_text_parts = []
            for sheet in sheets_data:
                full_text_parts.append(f"[{sheet['sheet_name']}]\n{sheet['content']}")

            full_text = "\n\n".join(full_text_parts)
            metadata.update(
                {
                    "total_characters": len(full_text),
                    "total_words": len(full_text.split()),
                    "total_rows": sum(sheet["row_count"] for sheet in sheets_data),
                    "total_columns": (
                        max(sheet["column_count"] for sheet in sheets_data)
                        if sheets_data
                        else 0
                    ),
                }
            )

            return {
                "content": full_text,
                "sheets": sheets_data,
                "metadata": metadata,
                "source": file_path,
                "document_type": "spreadsheet",
            }

        except Exception as e:
            raise DocumentProcessingError(
                f"Error processing spreadsheet: {str(e)}", file_path
            )

    def _process_dataframe(self, df: pd.DataFrame, sheet_name: str) -> Dict[str, Any]:
        """
        Process a pandas DataFrame into text content.

        Args:
            df: Pandas DataFrame
            sheet_name: Name of the sheet

        Returns:
            Dictionary with processed sheet data
        """
        # Clean the dataframe
        df = df.dropna(how="all")  # Remove completely empty rows
        df = df.fillna("")  # Fill NaN with empty strings

        # Create text representation
        content_parts = []

        # Add headers
        headers = df.columns.tolist()
        content_parts.append(" | ".join(str(h) for h in headers))
        content_parts.append("-" * 50)  # Separator

        # Add data rows
        for _, row in df.iterrows():
            row_text = " | ".join(str(cell) for cell in row.values)
            content_parts.append(row_text)

        content = "\n".join(content_parts)

        return {
            "sheet_name": sheet_name,
            "content": content,
            "headers": headers,
            "row_count": len(df),
            "column_count": len(df.columns),
            "data": df.to_dict("records"),  # For structured access
        }

    def _process_pptx(self, file_path: str) -> Dict[str, Any]:
        """
        🎯 Extract text from a PowerPoint presentation using python-pptx.

        Args:
            file_path: Path to the PPTX file

        Returns:
            Dictionary with extracted text and metadata
        """
        self.logger.info(f"Processing PPTX: {file_path}")

        try:
            presentation = pptx.Presentation(file_path)
            metadata = self._extract_metadata(file_path)

            # Extract presentation metadata
            core_props = presentation.core_properties
            metadata.update(
                {
                    "title": core_props.title or "",
                    "author": core_props.author or "",
                    "subject": core_props.subject or "",
                    "created": (
                        core_props.created.isoformat() if core_props.created else ""
                    ),
                    "modified": (
                        core_props.modified.isoformat() if core_props.modified else ""
                    ),
                    "slide_count": len(presentation.slides),
                }
            )

            # 🎯 Extract content from slides
            slides_content = []
            full_text_parts = []

            for slide_idx, slide in enumerate(presentation.slides):
                slide_text_parts = []

                # Extract text from all shapes in the slide
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_text_parts.append(shape.text.strip())

                if slide_text_parts:
                    slide_content = "\n".join(slide_text_parts)
                    slides_content.append(
                        {"slide": slide_idx + 1, "content": slide_content}
                    )
                    full_text_parts.append(f"[Slide {slide_idx + 1}]\n{slide_content}")

            full_text = "\n\n".join(full_text_parts)
            metadata.update(
                {
                    "total_characters": len(full_text),
                    "total_words": len(full_text.split()),
                    "slides_with_content": len(slides_content),
                }
            )

            return {
                "content": full_text,
                "slides": slides_content,
                "metadata": metadata,
                "source": file_path,
                "document_type": "pptx",
            }

        except Exception as e:
            raise DocumentProcessingError(f"Error processing PPTX: {str(e)}", file_path)

    def _process_text_file(self, file_path: str) -> Dict[str, Any]:
        """
        📝 Extract text from plain text files (.txt, .md).

        Args:
            file_path: Path to the text file

        Returns:
            Dictionary with extracted text and metadata
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        self.logger.info(f"Processing text file: {file_path}")

        try:
            metadata = self._extract_metadata(file_path)

            # Try different encodings for robust text reading
            encodings = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
            content = None

            for encoding in encodings:
                try:
                    with open(file_path, "r", encoding=encoding) as file:
                        content = file.read()
                    self.logger.info(
                        f"Successfully read file with {encoding} encoding"
                    )
                    break
                except UnicodeDecodeError:
                    continue
                except Exception as e:
                    self.logger.warning(f"Failed to read with {encoding}: {str(e)}")
                    continue

            if content is None:
                raise DocumentProcessingError(
                    "Could not read file with any supported encoding", file_path
                )

            # Clean and process content
            content = content.strip()
            if not content:
                raise DocumentProcessingError(
                    "File is empty or contains no readable text", file_path
                )

            # Split content into logical sections for better processing
            sections = []
            if file_extension == ".md":
                # 📋 For Markdown files, split by headers
                sections = self._split_markdown_content(content)
            else:
                # 📄 For plain text, split by paragraphs
                sections = self._split_text_content(content)

            # Update metadata with text-specific information
            lines = content.split("\n")
            metadata.update(
                {
                    "file_type": (
                        "markdown" if file_extension == ".md" else "plain_text"
                    ),
                    "line_count": len(lines),
                    "paragraph_count": len(
                        [p for p in content.split("\n\n") if p.strip()]
                    ),
                    "total_characters": len(content),
                    "total_words": len(content.split()),
                    "encoding_used": encoding if "encoding" in locals() else "utf-8",
                    "sections_count": len(sections),
                }
            )

            return {
                "content": content,
                "sections": sections,
                "metadata": metadata,
                "source": file_path,
                "document_type": "markdown" if file_extension == ".md" else "text",
            }

        except Exception as e:
            raise DocumentProcessingError(
                f"Error processing text file: {str(e)}", file_path
            )

    def _split_markdown_content(self, content: str) -> List[Dict[str, Any]]:
        """
        Split Markdown content by headers for better organization.

        Args:
            content: Markdown content

        Returns:
            List of sections with headers and content
        """
        sections = []
        lines = content.split("\n")
        current_section = {"header": "", "content": [], "level": 0}

        for line in lines:
            # Check for markdown headers
            if line.strip().startswith("#"):
                # Save previous section if it has content
                if current_section["content"] or current_section["header"]:
                    section_content = "\n".join(current_section["content"]).strip()
                    if section_content or current_section["header"]:
                        sections.append(
                            {
                                "header": current_section["header"],
                                "content": section_content,
                                "level": current_section["level"],
                                "section_index": len(sections),
                            }
                        )

                # Start new section
                header_level = len(line) - len(line.lstrip("#"))
                header_text = line.lstrip("#").strip()
                current_section = {
                    "header": header_text,
                    "content": [],
                    "level": header_level,
                }
            else:
                current_section["content"].append(line)

        # Add the last section
        if current_section["content"] or current_section["header"]:
            section_content = "\n".join(current_section["content"]).strip()
            if section_content or current_section["header"]:
                sections.append(
                    {
                        "header": current_section["header"],
                        "content": section_content,
                        "level": current_section["level"],
                        "section_index": len(sections),
                    }
                )

        # If no headers found, treat entire content as one section
        if not sections:
            sections.append(
                {
                    "header": "Document Content",
                    "content": content.strip(),
                    "level": 1,
                    "section_index": 0,
                }
            )

        return sections

    def _split_text_content(self, content: str) -> List[Dict[str, Any]]:
        """
        Split plain text content by paragraphs.

        Args:
            content: Plain text content
+
Returns:
|
642 |
+
List of paragraph sections
|
643 |
+
"""
|
644 |
+
sections = []
|
645 |
+
paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
|
646 |
+
|
647 |
+
for i, paragraph in enumerate(paragraphs):
|
648 |
+
sections.append(
|
649 |
+
{
|
650 |
+
"header": f"Paragraph {i + 1}",
|
651 |
+
"content": paragraph,
|
652 |
+
"level": 1,
|
653 |
+
"section_index": i,
|
654 |
+
}
|
655 |
+
)
|
656 |
+
|
657 |
+
# If no clear paragraphs, treat as single section
|
658 |
+
if not sections:
|
659 |
+
sections.append(
|
660 |
+
{
|
661 |
+
"header": "Document Content",
|
662 |
+
"content": content.strip(),
|
663 |
+
"level": 1,
|
664 |
+
"section_index": 0,
|
665 |
+
}
|
666 |
+
)
|
667 |
+
|
668 |
+
return sections
|
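For orientation, a minimal usage sketch of the text-file path implemented above. The file path is hypothetical, and it assumes src/ is on sys.path, as the absolute utils.* imports elsewhere in this upload imply; treat it as a sketch, not part of the committed code.

# Minimal sketch, hypothetical path; assumes src/ is on sys.path.
from ingestion.document_processor import DocumentProcessor

processor = DocumentProcessor({})  # empty config: the class defaults apply
result = processor.process_document("docs/notes.md")  # routed to _process_text_file

print(result["document_type"])              # "markdown"
print(result["metadata"]["encoding_used"])  # e.g. "utf-8"
for section in result["sections"][:3]:      # header-split sections
    print(section["level"], section["header"])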
src/ingestion/pipeline.py
ADDED
@@ -0,0 +1,495 @@
"""
Ingestion Pipeline Module

This module orchestrates the complete document ingestion process,
integrating all components for a seamless workflow.

Components: DocumentProcessor, URLProcessor, TextExtractor, EmbeddingGenerator, VectorDB
"""

import logging
from typing import Dict, List, Any, Optional
from datetime import datetime

from .document_processor import DocumentProcessor
from .url_processor import URLProcessor
from .text_extractor import TextExtractor
from embedding.embedding_generator import EmbeddingGenerator
from storage.vector_db import VectorDB
from utils.config_manager import ConfigManager
from utils.error_handler import error_handler, ErrorType, RAGError


class IngestionPipeline:
    """
    Complete ingestion pipeline that orchestrates document processing, text extraction,
    embedding generation, and vector storage.

    Features:
    - End-to-end document ingestion
    - URL content processing
    - Batch processing capabilities
    - Progress tracking and statistics
    - Error handling and recovery
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        Initialize the ingestion pipeline.

        Args:
            config_path: Path to configuration file
        """
        self.logger = logging.getLogger(__name__)

        # Load configuration
        self.config_manager = ConfigManager(config_path)
        self.config = self.config_manager.config

        # Initialize statistics
        self.stats = {
            "documents_processed": 0,
            "urls_processed": 0,
            "chunks_created": 0,
            "embeddings_generated": 0,
            "vectors_stored": 0,
            "errors_encountered": 0,
            "start_time": None,
            "end_time": None,
        }

        # Initialize components
        self._initialize_components()

    def _initialize_components(self):
        """Initialize all pipeline components."""
        try:
            # 📄 Document processor
            doc_config = self.config.get("document_processing", {})
            self.document_processor = DocumentProcessor(doc_config)

            # URL processor
            url_config = self.config.get("url_processing", {})
            self.url_processor = URLProcessor(url_config)

            # Text extractor
            text_config = self.config.get("document_processing", {})
            self.text_extractor = TextExtractor(text_config)

            # 🔮 Embedding generator
            embedding_config = self.config.get("embedding", {})
            embedding_config["api_key"] = self.config.get("api_keys", {}).get(
                "gemini_api_key"
            )
            self.embedding_generator = EmbeddingGenerator(embedding_config)

            # Vector database
            vector_config = self.config.get("vector_db", {})
            vector_config["api_key"] = self.config.get("api_keys", {}).get(
                "pinecone_api_key"
            )
            self.vector_db = VectorDB(vector_config)

            self.logger.info("All pipeline components initialized successfully")

        except Exception as e:
            self.logger.error(f"❌ Failed to initialize pipeline components: {str(e)}")
            raise RAGError(f"Pipeline initialization failed: {str(e)}")

    @error_handler(ErrorType.DOCUMENT_PROCESSING)
    def process_documents(self, file_paths: List[str]) -> Dict[str, Any]:
        """
        Process multiple documents through the complete pipeline.

        Args:
            file_paths: List of document file paths

        Returns:
            Processing results and statistics
        """
        self.logger.info(
            f"Starting document processing pipeline for {len(file_paths)} files"
        )
        self.stats["start_time"] = datetime.now()

        all_results = []

        for i, file_path in enumerate(file_paths):
            try:
                self.logger.info(
                    f"📄 Processing document {i+1}/{len(file_paths)}: {file_path}"
                )

                # 📄 Step 1: Process document
                doc_result = self.document_processor.process_document(file_path)
                self.stats["documents_processed"] += 1

                # Step 2: Extract and chunk text
                text_chunks = self.text_extractor.process_text(
                    doc_result["content"], doc_result["metadata"]
                )
                self.stats["chunks_created"] += len(text_chunks)

                # 🔮 Step 3: Generate embeddings
                embedded_chunks = self.embedding_generator.generate_embeddings(
                    text_chunks
                )
                valid_embeddings = [
                    chunk for chunk in embedded_chunks if chunk.get("embedding")
                ]
                self.stats["embeddings_generated"] += len(valid_embeddings)

                # Step 4: Store in vector database
                storage_success = False
                if valid_embeddings:
                    storage_success = self.vector_db.store_embeddings(valid_embeddings)
                    if storage_success:
                        self.stats["vectors_stored"] += len(valid_embeddings)

                # Compile results
                result = {
                    "file_path": file_path,
                    "document_type": doc_result.get("document_type"),
                    "chunks_created": len(text_chunks),
                    "embeddings_generated": len(valid_embeddings),
                    "storage_success": storage_success,
                    "metadata": doc_result["metadata"],
                }

                all_results.append(result)
                self.logger.info(
                    f"Document processed: {len(text_chunks)} chunks, {len(valid_embeddings)} embeddings"
                )

            except Exception as e:
                self.stats["errors_encountered"] += 1
                self.logger.error(f"❌ Error processing {file_path}: {str(e)}")

                all_results.append(
                    {
                        "file_path": file_path,
                        "error": str(e),
                        "chunks_created": 0,
                        "embeddings_generated": 0,
                        "storage_success": False,
                    }
                )

        self.stats["end_time"] = datetime.now()

        return {
            "results": all_results,
            "statistics": self.get_statistics(),
            "success_rate": self._calculate_success_rate(all_results),
        }

    @error_handler(ErrorType.URL_PROCESSING)
    def process_urls(self, urls: List[str]) -> Dict[str, Any]:
        """
        Process multiple URLs through the complete pipeline.

        Args:
            urls: List of URLs to process

        Returns:
            Processing results and statistics
        """
        self.logger.info(f"Starting URL processing pipeline for {len(urls)} URLs")
        self.stats["start_time"] = datetime.now()

        all_results = []

        for i, url in enumerate(urls):
            try:
                self.logger.info(f"Processing URL {i+1}/{len(urls)}: {url}")

                # Step 1: Process URL
                url_result = self.url_processor.process_url(url)
                if not url_result:
                    self.logger.warning(f"No content extracted from URL: {url}")
                    continue

                self.stats["urls_processed"] += 1

                # Step 2: Extract and chunk text
                text_chunks = self.text_extractor.process_text(
                    url_result["content"], url_result["metadata"]
                )
                self.stats["chunks_created"] += len(text_chunks)

                # 🔮 Step 3: Generate embeddings
                embedded_chunks = self.embedding_generator.generate_embeddings(
                    text_chunks
                )
                valid_embeddings = [
                    chunk for chunk in embedded_chunks if chunk.get("embedding")
                ]
                self.stats["embeddings_generated"] += len(valid_embeddings)

                # Step 4: Store in vector database
                storage_success = False
                if valid_embeddings:
                    storage_success = self.vector_db.store_embeddings(valid_embeddings)
                    if storage_success:
                        self.stats["vectors_stored"] += len(valid_embeddings)

                # Process linked documents if any
                linked_results = []
                for linked_doc in url_result.get("linked_documents", []):
                    if linked_doc.get("content"):
                        linked_chunks = self.text_extractor.process_text(
                            linked_doc["content"], linked_doc["metadata"]
                        )
                        linked_embedded = self.embedding_generator.generate_embeddings(
                            linked_chunks
                        )
                        linked_valid = [
                            chunk for chunk in linked_embedded if chunk.get("embedding")
                        ]

                        if linked_valid:
                            self.vector_db.store_embeddings(linked_valid)
                            linked_results.append(
                                {
                                    "url": linked_doc["source"],
                                    "chunks": len(linked_chunks),
                                    "embeddings": len(linked_valid),
                                }
                            )

                # Compile results
                result = {
                    "url": url,
                    "chunks_created": len(text_chunks),
                    "embeddings_generated": len(valid_embeddings),
                    "storage_success": storage_success,
                    "linked_documents": linked_results,
                    "metadata": url_result["metadata"],
                }

                all_results.append(result)
                self.logger.info(
                    f"URL processed: {len(text_chunks)} chunks, {len(valid_embeddings)} embeddings"
                )

            except Exception as e:
                self.stats["errors_encountered"] += 1
                self.logger.error(f"❌ Error processing {url}: {str(e)}")

                all_results.append(
                    {
                        "url": url,
                        "error": str(e),
                        "chunks_created": 0,
                        "embeddings_generated": 0,
                        "storage_success": False,
                    }
                )

        self.stats["end_time"] = datetime.now()

        return {
            "results": all_results,
            "statistics": self.get_statistics(),
            "success_rate": self._calculate_success_rate(all_results),
        }

    def process_mixed_sources(
        self, file_paths: Optional[List[str]] = None, urls: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Process both documents and URLs in a single pipeline run.

        Args:
            file_paths: Optional list of document file paths
            urls: Optional list of URLs

        Returns:
            Combined processing results
        """
        self.logger.info("Starting mixed source processing pipeline")

        results = {
            "document_results": [],
            "url_results": [],
            "combined_statistics": {},
            "overall_success_rate": 0.0,
        }

        # 📄 Process documents
        if file_paths:
            doc_results = self.process_documents(file_paths)
            results["document_results"] = doc_results["results"]

        # Process URLs
        if urls:
            url_results = self.process_urls(urls)
            results["url_results"] = url_results["results"]

        # Combine statistics
        results["combined_statistics"] = self.get_statistics()

        # 🎯 Calculate overall success rate
        all_items = results["document_results"] + results["url_results"]
        results["overall_success_rate"] = self._calculate_success_rate(all_items)

        return results

    def _calculate_success_rate(self, results: List[Dict[str, Any]]) -> float:
        """
        Calculate the success rate from results.

        Args:
            results: List of processing results

        Returns:
            Success rate as a percentage
        """
        if not results:
            return 0.0

        successful = sum(
            1 for result in results if result.get("storage_success", False)
        )
        return (successful / len(results)) * 100

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get comprehensive pipeline statistics.

        Returns:
            Statistics dictionary
        """
        stats = self.stats.copy()

        if stats["start_time"] and stats["end_time"]:
            runtime = stats["end_time"] - stats["start_time"]
            stats["runtime_seconds"] = runtime.total_seconds()
            stats["processing_rate"] = (
                stats["documents_processed"] + stats["urls_processed"]
            ) / max(1, runtime.total_seconds())

        # 🔮 Add component statistics
        stats["embedding_stats"] = self.embedding_generator.get_statistics()
        stats["vector_db_stats"] = self.vector_db.get_index_stats()
        stats["url_processor_stats"] = self.url_processor.get_statistics()

        return stats

    def health_check(self) -> Dict[str, Any]:
        """
        Perform a comprehensive health check on all components.

        Returns:
            Health check results
        """
        health = {
            "overall_status": "healthy",
            "timestamp": datetime.now().isoformat(),
            "components": {},
        }

        try:
            # 🔮 Check embedding generator
            if self.embedding_generator.client:
                health["components"]["embedding_generator"] = "Ready"
            else:
                health["components"]["embedding_generator"] = "❌ Not configured"
                health["overall_status"] = "degraded"

            # Check vector database
            vector_health = self.vector_db.health_check()
            health["components"]["vector_database"] = vector_health["status"]
            if vector_health["status"] != "healthy":
                health["overall_status"] = "degraded"

            # Add component details
            health["details"] = {
                "vector_db_health": vector_health,
                "embedding_stats": self.embedding_generator.get_statistics(),
                "pipeline_stats": self.get_statistics(),
            }

        except Exception as e:
            health["overall_status"] = "unhealthy"
            health["error"] = str(e)

        return health

    def reset_statistics(self):
        """Reset pipeline statistics."""
        self.stats = {
            "documents_processed": 0,
            "urls_processed": 0,
            "chunks_created": 0,
            "embeddings_generated": 0,
            "vectors_stored": 0,
            "errors_encountered": 0,
            "start_time": None,
            "end_time": None,
        }

        # Reset component statistics
        self.embedding_generator.stats = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "cache_hits": 0,
            "total_tokens_processed": 0,
            "start_time": datetime.now(),
        }

        self.vector_db.reset_stats()
        self.url_processor.reset()

        self.logger.info("All pipeline statistics reset")


# Convenience function for quick pipeline usage
def create_pipeline(config_path: Optional[str] = None) -> IngestionPipeline:
    """
    Create and return a configured ingestion pipeline.

    Args:
        config_path: Optional path to configuration file

    Returns:
        Configured IngestionPipeline instance
    """
    return IngestionPipeline(config_path)


# 📄 Example usage functions
def process_documents_simple(
    file_paths: List[str], config_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    📄 Simple function to process documents with default configuration.

    Args:
        file_paths: List of document file paths
        config_path: Optional configuration file path

    Returns:
        Processing results
    """
    pipeline = create_pipeline(config_path)
    return pipeline.process_documents(file_paths)


def process_urls_simple(
    urls: List[str], config_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Simple function to process URLs with default configuration.

    Args:
        urls: List of URLs to process
        config_path: Optional configuration file path

    Returns:
        Processing results
    """
    pipeline = create_pipeline(config_path)
    return pipeline.process_urls(urls)
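For orientation, a short end-to-end sketch of driving the pipeline defined above. The input paths and URL are hypothetical; config/config.yaml is the config file shipped in this upload, and the embedding and storage steps only succeed when the Gemini and Pinecone keys are configured there.

# Minimal sketch, hypothetical inputs; requires API keys in the config.
from ingestion.pipeline import create_pipeline

pipeline = create_pipeline("config/config.yaml")
print(pipeline.health_check()["overall_status"])  # "healthy" / "degraded" / "unhealthy"

results = pipeline.process_mixed_sources(
    file_paths=["docs/report.pdf"],
    urls=["https://example.com/article"],
)
print(f"success rate: {results['overall_success_rate']:.0f}%")
print(results["combined_statistics"]["chunks_created"])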
src/ingestion/text_extractor.py
ADDED
@@ -0,0 +1,526 @@
"""
Text Extractor Module

This module is responsible for cleaning, normalizing, and chunking text
from various sources with complete NLP functionality.

Technologies: NLTK, spaCy, regex, langdetect
"""

import re
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional, Union
import unicodedata

# Import NLP libraries
try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    import spacy
    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException as LangDetectError

    # Download required NLTK data
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)

    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)

except ImportError as e:
    logging.warning(f"Some NLP libraries are not installed: {e}")

from utils.error_handler import error_handler, ErrorType


class TextExtractor:
    """
    Cleans, normalizes, and chunks text from various sources with intelligent processing.

    Features:
    - Advanced text cleaning and normalization
    - Language detection
    - Intelligent sentence segmentation
    - Smart text chunking with overlap
    - Metadata preservation
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the TextExtractor with configuration.

        Args:
            config: Configuration dictionary with processing parameters
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Configuration settings
        self.chunk_size = self.config.get("chunk_size", 1000)
        self.chunk_overlap = self.config.get("chunk_overlap", 200)
        self.min_chunk_size = self.config.get("min_chunk_size", 100)
        self.max_chunk_size = self.config.get("max_chunk_size", 2000)

        # NLP settings
        self.enable_language_detection = self.config.get(
            "enable_language_detection", True
        )
        self.preserve_formatting = self.config.get("preserve_formatting", False)
        self.remove_stopwords = self.config.get("remove_stopwords", False)

        # Initialize NLP components
        self.nlp = None
        self.stemmer = None
        self.stop_words = set()

        self._initialize_nlp_components()

    def _initialize_nlp_components(self):
        """Initialize NLP components with error handling."""
        try:
            # Load spaCy model for advanced processing
            self.nlp = spacy.load("en_core_web_sm")
            self.logger.info("spaCy model loaded successfully")
        except Exception as e:
            self.logger.warning(f"Could not load spaCy model: {str(e)}")

        try:
            # Initialize NLTK components
            self.stemmer = PorterStemmer()
            self.stop_words = set(stopwords.words("english"))
            self.logger.info("NLTK components initialized")
        except Exception as e:
            self.logger.warning(f"Could not initialize NLTK components: {str(e)}")

    @error_handler(ErrorType.DOCUMENT_PROCESSING)
    def process_text(
        self,
        text: Union[str, List[str]],
        metadata: Optional[Dict[str, Any]] = None,
        preserve_structure: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        Process text by cleaning, normalizing, and chunking with intelligence.

        Args:
            text: Raw text content (string or list of strings)
            metadata: Optional metadata to include with each chunk
            preserve_structure: Whether to preserve original text structure

        Returns:
            List of dictionaries containing processed text chunks and metadata
        """
        if not text:
            return []

        # Convert list to string if needed
        if isinstance(text, list):
            text = "\n".join(str(item) for item in text if item)

        if not text.strip():
            return []

        self.logger.info(f"Processing text: {len(text)} characters")

        # Detect language
        language = self._detect_language(text)

        # Clean and normalize the text
        cleaned_text = self._clean_text(text, preserve_structure)

        if len(cleaned_text.strip()) < self.min_chunk_size:
            self.logger.warning(
                f"Text too short after cleaning: {len(cleaned_text)} chars"
            )
            return []

        # Split text into chunks
        chunks = self._chunk_text(cleaned_text)

        # Prepare result with enhanced metadata
        result = []
        base_metadata = metadata.copy() if metadata else {}
        base_metadata.update(
            {
                "language": language,
                "original_length": len(text),
                "cleaned_length": len(cleaned_text),
                "chunk_count": len(chunks),
                "processing_time": datetime.now().isoformat(),
                "chunk_size_config": self.chunk_size,
                "chunk_overlap_config": self.chunk_overlap,
            }
        )

        for i, chunk in enumerate(chunks):
            chunk_metadata = base_metadata.copy()
            chunk_stats = self._analyze_chunk(chunk)

            chunk_metadata.update(
                {
                    "chunk_index": i,
                    "chunk_id": f"chunk_{i}_{hash(chunk) % 10000}",
                    **chunk_stats,
                }
            )

            result.append({"content": chunk, "metadata": chunk_metadata})

        self.logger.info(f"Processed text into {len(chunks)} chunks")
        return result

    def _detect_language(self, text: str) -> str:
        """
        Detect the language of the text.

        Args:
            text: Text to analyze

        Returns:
            Language code (e.g., 'en', 'es', 'fr')
        """
        if not self.enable_language_detection:
            return "en"  # Default to English

        try:
            # Use a sample of text for detection (first 1000 chars)
            sample = text[:1000].strip()
            if len(sample) < 50:  # Too short for reliable detection
                return "en"

            language = detect(sample)
            self.logger.info(f"Detected language: {language}")
            return language

        except Exception as e:  # includes LangDetectError
            self.logger.warning(f"Language detection failed: {str(e)}")
            return "en"  # Default to English

    def _clean_text(self, text: str, preserve_structure: bool = False) -> str:
        """
        Clean and normalize text with advanced processing.

        Args:
            text: Raw text to clean
            preserve_structure: Whether to preserve formatting

        Returns:
            Cleaned and normalized text
        """
        # Unicode normalization
        text = unicodedata.normalize("NFKC", text)

        if not preserve_structure:
            # Basic cleaning operations
            # Remove excessive whitespace but preserve paragraph breaks
            text = re.sub(r"[ \t]+", " ", text)  # Multiple spaces/tabs to single space
            text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)  # Multiple newlines to double

            # Remove or normalize special characters
            # Keep basic punctuation and common symbols
            text = re.sub(r'[^\w\s.,;:!?\'"\-()[\]{}/@#$%&*+=<>|\\~`\n]', " ", text)

            # Clean up whitespace again
            text = re.sub(r"[ \t]+", " ", text)
            text = re.sub(r"\n\s*\n+", "\n\n", text)

            # Remove common artifacts
            # Remove page numbers and header/footer patterns
            text = re.sub(r"\n\s*\d+\s*\n", "\n", text)  # Standalone page numbers
            text = re.sub(r"\n\s*Page \d+.*?\n", "\n", text, flags=re.IGNORECASE)

            # Remove excessive punctuation
            text = re.sub(r"[.]{3,}", "...", text)  # Multiple dots
            text = re.sub(r"[-]{3,}", "---", text)  # Multiple dashes

        # Final cleanup
        text = text.strip()

        return text

    def _chunk_text(self, text: str) -> List[str]:
        """
        Split text into chunks with intelligent boundary detection.

        Args:
            text: Cleaned text to chunk

        Returns:
            List of text chunks
        """
        if len(text) <= self.chunk_size:
            return [text]

        # Try intelligent chunking with spaCy first
        if self.nlp:
            try:
                return self._chunk_with_spacy(text)
            except Exception as e:
                self.logger.warning(f"spaCy chunking failed: {str(e)}")

        # Fallback to NLTK sentence-based chunking
        try:
            return self._chunk_with_sentences(text)
        except Exception as e:
            self.logger.warning(f"Sentence chunking failed: {str(e)}")

        # Final fallback to character-based chunking
        return self._chunk_by_characters(text)

    def _chunk_with_spacy(self, text: str) -> List[str]:
        """
        Intelligent chunking using spaCy for better semantic boundaries.

        Args:
            text: Text to chunk

        Returns:
            List of text chunks
        """
        doc = self.nlp(text)
        chunks = []
        current_chunk = []
        current_size = 0

        for sent in doc.sents:
            sent_text = sent.text.strip()
            sent_size = len(sent_text)

            # 📏 Check if adding this sentence exceeds chunk size
            if current_size + sent_size > self.chunk_size and current_chunk:
                # 📦 Finalize current chunk
                chunk_text = " ".join(current_chunk)
                chunks.append(chunk_text)

                # Start new chunk with overlap
                overlap_chunk, overlap_size = self._create_overlap(current_chunk)
                current_chunk = overlap_chunk
                current_size = overlap_size

            current_chunk.append(sent_text)
            current_size += sent_size

        # 📦 Add the last chunk
        if current_chunk:
            chunk_text = " ".join(current_chunk)
            if len(chunk_text.strip()) >= self.min_chunk_size:
                chunks.append(chunk_text)

        return chunks

    def _chunk_with_sentences(self, text: str) -> List[str]:
        """
        Chunk text using NLTK sentence tokenization.

        Args:
            text: Text to chunk

        Returns:
            List of text chunks
        """
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            sentence = sentence.strip()
            sentence_size = len(sentence)

            # 📏 Check chunk size limit
            if current_size + sentence_size > self.chunk_size and current_chunk:
                # 📦 Finalize current chunk
                chunk_text = " ".join(current_chunk)
                chunks.append(chunk_text)

                # Create overlap
                overlap_chunk, overlap_size = self._create_overlap(current_chunk)
                current_chunk = overlap_chunk
                current_size = overlap_size

            current_chunk.append(sentence)
            current_size += sentence_size

        # 📦 Add final chunk
        if current_chunk:
            chunk_text = " ".join(current_chunk)
            if len(chunk_text.strip()) >= self.min_chunk_size:
                chunks.append(chunk_text)

        return chunks

    def _chunk_by_characters(self, text: str) -> List[str]:
        """
        Fallback character-based chunking with boundary detection.

        Args:
            text: Text to chunk

        Returns:
            List of text chunks
        """
        chunks = []
        start = 0

        while start < len(text):
            end = start + self.chunk_size

            # Try to find a good boundary
            if end < len(text):
                # Look for sentence boundaries first
                for boundary in [". ", "! ", "? ", "\n\n", "\n"]:
                    boundary_pos = text.rfind(boundary, start, end)
                    if boundary_pos > start + self.min_chunk_size:
                        end = boundary_pos + len(boundary)
                        break

            chunk = text[start:end].strip()
            if len(chunk) >= self.min_chunk_size:
                chunks.append(chunk)

            # Move start position with overlap
            start = max(start + 1, end - self.chunk_overlap)

        return chunks

    def _create_overlap(self, sentences: List[str]) -> tuple:
        """
        Create overlap from previous chunk sentences.

        Args:
            sentences: List of sentences from the previous chunk

        Returns:
            Tuple of (overlap_sentences, overlap_size)
        """
        overlap_sentences = []
        overlap_size = 0

        # Add sentences from the end for overlap
        for sentence in reversed(sentences):
            if overlap_size + len(sentence) <= self.chunk_overlap:
                overlap_sentences.insert(0, sentence)
                overlap_size += len(sentence)
            else:
                break

        return overlap_sentences, overlap_size

    def _analyze_chunk(self, chunk: str) -> Dict[str, Any]:
        """
        Analyze chunk statistics and properties.

        Args:
            chunk: Text chunk to analyze

        Returns:
            Dictionary with chunk statistics
        """
        words = chunk.split()

        stats = {
            "character_count": len(chunk),
            "word_count": len(words),
            "sentence_count": len(sent_tokenize(chunk)) if chunk else 0,
            "avg_word_length": (
                sum(len(word) for word in words) / len(words) if words else 0
            ),
        }

        # Advanced analysis with spaCy if available
        if self.nlp:
            try:
                doc = self.nlp(chunk)
                stats.update(
                    {
                        "entity_count": len(doc.ents),
                        "noun_count": len(
                            [token for token in doc if token.pos_ == "NOUN"]
                        ),
                        "verb_count": len(
                            [token for token in doc if token.pos_ == "VERB"]
                        ),
                    }
                )
            except Exception:
                pass  # Skip advanced analysis if it fails

        return stats

    def extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
        """
        Extract keywords from text using NLP techniques.

        Args:
            text: Text to extract keywords from
            max_keywords: Maximum number of keywords to return

        Returns:
            List of extracted keywords
        """
        if not self.nlp:
            return []

        try:
            doc = self.nlp(text)

            # Extract keywords based on POS tags and frequency
            word_freq = {}

            for token in doc:
                if (
                    token.pos_ in ["NOUN", "PROPN", "ADJ"]
                    and not token.is_stop
                    and not token.is_punct
                    and len(token.text) > 2
                ):
                    word = token.lemma_.lower()
                    word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return the top keywords
            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            keywords = [word for word, freq in sorted_words[:max_keywords]]

            return keywords

        except Exception as e:
            self.logger.warning(f"Keyword extraction failed: {str(e)}")
            return []

    def get_text_statistics(self, text: str) -> Dict[str, Any]:
        """
        Get comprehensive text statistics.

        Args:
            text: Text to analyze

        Returns:
            Dictionary with text statistics
        """
        words = text.split()
        sentences = sent_tokenize(text) if text else []

        stats = {
            "character_count": len(text),
            "word_count": len(words),
            "sentence_count": len(sentences),
            "paragraph_count": len([p for p in text.split("\n\n") if p.strip()]),
            "avg_words_per_sentence": len(words) / len(sentences) if sentences else 0,
            "avg_chars_per_word": (
                sum(len(word) for word in words) / len(words) if words else 0
            ),
            "language": self._detect_language(text),
        }

        return stats
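A small sketch of the chunking contract above, with hypothetical sample text. It assumes spaCy's en_core_web_sm model is installed; the NLTK data is fetched on import as shown above.

# Minimal sketch, hypothetical input text.
from ingestion.text_extractor import TextExtractor

long_article_text = "First sentence of the demo text. " * 100  # stand-in for a real document

extractor = TextExtractor({"chunk_size": 500, "chunk_overlap": 100, "min_chunk_size": 50})
chunks = extractor.process_text(long_article_text, metadata={"source": "demo"})

for chunk in chunks[:2]:
    meta = chunk["metadata"]
    print(meta["chunk_index"], meta["word_count"], meta["language"])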
src/ingestion/url_processor.py
ADDED
@@ -0,0 +1,603 @@
"""
URL Processor Module

This module is responsible for crawling and extracting content from provided URLs,
including nested documents and links, with complete web scraping functionality.

Technologies: BeautifulSoup, requests, trafilatura
"""

import logging
import time
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Set
from urllib.parse import urlparse, urljoin, urlunparse
from urllib.robotparser import RobotFileParser

# Import web scraping libraries
try:
    import requests
    from bs4 import BeautifulSoup
    import trafilatura
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
except ImportError as e:
    logging.warning(f"Some web scraping libraries are not installed: {e}")

from utils.error_handler import URLProcessingError, error_handler, ErrorType


class URLProcessor:
    """
    Processes URLs to extract content from web pages and linked documents with full functionality.

    Features:
    - Web page content extraction with trafilatura
    - Recursive link following with depth control
    - Rate limiting and retry logic
    - Robots.txt respect
    - Multiple content type handling
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the URLProcessor with configuration.

        Args:
            config: Configuration dictionary with processing parameters
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Configuration settings
        self.max_depth = self.config.get("max_depth", 1)
        self.follow_links = self.config.get("follow_links", True)
        self.max_pages = self.config.get("max_pages", 10)
        self.timeout = self.config.get("timeout", 10)
        self.user_agent = self.config.get("user_agent", "RAG-AI-Bot/1.0")
        self.respect_robots_txt = self.config.get("respect_robots_txt", True)
        self.rate_limit_delay = self.config.get("rate_limit_delay", 1.0)

        # Retry configuration
        self.max_retries = 3
        self.backoff_factor = 0.3

        # Track visited URLs and robots.txt cache
        self.visited_urls: Set[str] = set()
        self.robots_cache: Dict[str, RobotFileParser] = {}
        self.last_request_time: Dict[str, float] = {}

        # Setup session with retry strategy
        self.session = self._setup_session()

    def _setup_session(self) -> requests.Session:
        """
        Set up a requests session with retry strategy and headers.

        Returns:
            Configured requests session
        """
        session = requests.Session()

        # Retry strategy
        retry_strategy = Retry(
            total=self.max_retries,
            backoff_factor=self.backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        # Default headers
        session.headers.update(
            {
                "User-Agent": self.user_agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
            }
        )

        return session

    @error_handler(ErrorType.URL_PROCESSING)
    def process_url(self, url: str, depth: int = 0) -> Dict[str, Any]:
        """
        Process a URL and extract its content with full functionality.

        Args:
            url: The URL to process
            depth: Current crawling depth

        Returns:
            Dictionary containing extracted text and metadata
        """
        # Validation checks
        if not url or not self._is_valid_url(url):
            raise URLProcessingError(f"Invalid URL: {url}", url)

        if depth > self.max_depth:
            self.logger.info(f"🛑 Max depth reached for: {url}")
            return {}

        if len(self.visited_urls) >= self.max_pages:
            self.logger.info("🛑 Max pages limit reached")
            return {}

        if url in self.visited_urls:
            self.logger.info(f"Already visited: {url}")
            return {}

        # Check robots.txt if enabled
        if self.respect_robots_txt and not self._can_fetch(url):
            self.logger.info(f"Robots.txt disallows: {url}")
            return {}

        self.visited_urls.add(url)
        self.logger.info(f"Processing URL: {url} (depth: {depth})")

        try:
            # Rate limiting
            self._apply_rate_limit(url)

            # Fetch and extract content
            content, metadata = self._extract_content(url)

            if not content:
                self.logger.warning(f"No content extracted from: {url}")
                return {}

            result = {
                "content": content,
                "metadata": metadata,
                "source": url,
                "depth": depth,
                "linked_documents": [],
                "document_type": "webpage",
                "crawl_stats": {
                    "max_depth_configured": self.max_depth,
                    "follow_links_enabled": self.follow_links,
                    "current_depth": depth,
                },
            }

            # Follow links if configured and not at max depth
            if (
                self.follow_links
                and depth < self.max_depth
                and len(self.visited_urls) < self.max_pages
            ):
                links = self._extract_links(url, content)
                self.logger.info(f"Found {len(links)} links on {url}")

                for link in links[:5]:  # Limit links per page
                    try:
                        linked_content = self.process_url(link, depth + 1)
                        if linked_content:
                            result["linked_documents"].append(linked_content)
                    except Exception as e:
                        self.logger.warning(
                            f"Failed to process linked URL {link}: {str(e)}"
                        )
                        continue

            return result

        except Exception as e:
            raise URLProcessingError(f"Error processing URL: {str(e)}", url)

    def process_batch(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Process multiple URLs in batch.

        Args:
            urls: List of URLs to process

        Returns:
            List of processed URL results
        """
        results = []
        self.logger.info(f"Processing batch of {len(urls)} URLs")

        for i, url in enumerate(urls):
            try:
                result = self.process_url(url)
                if result:
                    results.append(result)
                self.logger.info(f"Processed {i+1}/{len(urls)}: {url}")
            except Exception as e:
                self.logger.error(f"❌ Failed to process {url}: {str(e)}")
                continue

        return results

    def _is_valid_url(self, url: str) -> bool:
        """
        Validate URL format and scheme.

        Args:
            url: URL to validate

        Returns:
            True if the URL is valid
        """
        try:
            parsed = urlparse(url)
            return bool(parsed.netloc) and parsed.scheme in ["http", "https"]
        except Exception:
            return False

    def _can_fetch(self, url: str) -> bool:
        """
        Check if a URL can be fetched according to robots.txt.

        Args:
            url: URL to check

        Returns:
            True if the URL can be fetched
        """
        try:
            parsed_url = urlparse(url)
            base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

            if base_url not in self.robots_cache:
                robots_url = urljoin(base_url, "/robots.txt")
                rp = RobotFileParser()
                rp.set_url(robots_url)

                try:
                    rp.read()
                    self.robots_cache[base_url] = rp
                except Exception:
                    # If robots.txt can't be fetched, assume allowed
                    return True

            return self.robots_cache[base_url].can_fetch(self.user_agent, url)

        except Exception:
            # If the robots.txt check fails, assume allowed
            return True

    def _apply_rate_limit(self, url: str) -> None:
        """
        Apply rate limiting between requests to the same domain.

        Args:
            url: URL being processed
        """
        domain = urlparse(url).netloc
        current_time = time.time()

        if domain in self.last_request_time:
            time_since_last = current_time - self.last_request_time[domain]
            if time_since_last < self.rate_limit_delay:
                sleep_time = self.rate_limit_delay - time_since_last
                self.logger.info(
                    f"Rate limiting: sleeping {sleep_time:.1f}s for {domain}"
                )
                time.sleep(sleep_time)

        self.last_request_time[domain] = time.time()

    def _extract_content(self, url: str) -> tuple:
        """
        Extract content from a web page using trafilatura with a BeautifulSoup fallback.

        Args:
            url: The URL to extract content from

        Returns:
            Tuple of (content, metadata)
        """
        self.logger.info(f"Extracting content from: {url}")

        try:
            # Fetch the page
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            # Basic metadata
            metadata = {
                "url": url,
                "status_code": response.status_code,
                "content_type": response.headers.get("content-type", ""),
                "content_length": len(response.content),
                "extracted_time": datetime.now().isoformat(),
                "encoding": response.encoding or "utf-8",
            }

            # Check content type
            content_type = response.headers.get("content-type", "").lower()

            if "application/pdf" in content_type:
                return self._handle_pdf_url(response, metadata)
            elif "text/html" not in content_type and "text/plain" not in content_type:
                self.logger.warning(f"Unsupported content type: {content_type}")
                return "", metadata

            # Primary method: trafilatura (best for content extraction)
            try:
                content = trafilatura.extract(
                    response.text,
                    include_comments=False,
                    include_tables=True,
                    include_formatting=False,
                    favor_precision=True,
                )

                if content and len(content.strip()) > 50:  # Minimum content threshold
                    # Extract additional metadata with trafilatura
                    metadata_extracted = trafilatura.extract_metadata(response.text)
                    if metadata_extracted:
                        metadata.update(
                            {
                                "title": metadata_extracted.title or "",
                                "author": metadata_extracted.author or "",
                                "description": metadata_extracted.description or "",
                                "sitename": metadata_extracted.sitename or "",
                                "date": metadata_extracted.date or "",
                            }
                        )

                    metadata.update(
                        {
                            "extraction_method": "trafilatura",
                            "word_count": len(content.split()),
                            "character_count": len(content),
                        }
                    )

                    return content.strip(), metadata

            except Exception as e:
                self.logger.warning(f"Trafilatura failed: {str(e)}")

            # Fallback method: BeautifulSoup
            return self._extract_with_beautifulsoup(response.text, metadata)

        except requests.RequestException as e:
            raise URLProcessingError(f"Failed to fetch URL: {str(e)}", url)
        except Exception as e:
            raise URLProcessingError(f"Content extraction failed: {str(e)}", url)

    def _extract_with_beautifulsoup(self, html: str, metadata: Dict[str, Any]) -> tuple:
        """
        Fallback content extraction using BeautifulSoup.

        Args:
            html: HTML content
            metadata: Existing metadata dictionary

        Returns:
            Tuple of (content, metadata)
        """
        try:
            soup = BeautifulSoup(html, "html.parser")

            # Extract metadata
            title_tag = soup.find("title")
            if title_tag:
                metadata["title"] = title_tag.get_text().strip()

            # Meta tags
            for meta in soup.find_all("meta"):
                name = meta.get("name", "").lower()
                meta_content = meta.get("content", "")
                if name == "description":
                    metadata["description"] = meta_content
                elif name == "author":
                    metadata["author"] = meta_content

            # Remove unwanted elements
            for element in soup(
                ["script", "style", "nav", "header", "footer", "aside"]
            ):
                element.decompose()

            # Extract main content
            content_selectors = [
                "main",
                "article",
                ".content",
                "#content",
                ".post",
                ".entry",
            ]

            content = ""
            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    content = content_elem.get_text(separator="\n", strip=True)
                    break

            # Fallback to body if no main content found
            if not content:
                body = soup.find("body")
                if body:
                    content = body.get_text(separator="\n", strip=True)

            # Clean and validate content
            content = re.sub(r"\n\s*\n", "\n\n", content)  # Clean multiple newlines
            content = content.strip()

            metadata.update(
                {
                    "extraction_method": "beautifulsoup",
                    "word_count": len(content.split()),
                    "character_count": len(content),
                }
            )

            return content, metadata

        except Exception as e:
            self.logger.error(f"❌ BeautifulSoup extraction failed: {str(e)}")
            return "", metadata
|
442 |
+
|
443 |
+
def _handle_pdf_url(
|
444 |
+
self, response: requests.Response, metadata: Dict[str, Any]
|
445 |
+
) -> tuple:
|
446 |
+
"""
|
447 |
+
📄 Handle PDF content from URL.
|
448 |
+
|
449 |
+
Args:
|
450 |
+
response: HTTP response containing PDF
|
451 |
+
metadata: Existing metadata
|
452 |
+
|
453 |
+
Returns:
|
454 |
+
Tuple of (content, metadata)
|
455 |
+
"""
|
456 |
+
self.logger.info("📄 Detected PDF content, extracting text...")
|
457 |
+
|
458 |
+
try:
|
459 |
+
# Save PDF temporarily and process with document processor
|
460 |
+
import tempfile
|
461 |
+
import os
|
462 |
+
from .document_processor import DocumentProcessor
|
463 |
+
|
464 |
+
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
|
465 |
+
tmp_file.write(response.content)
|
466 |
+
tmp_file.flush()
|
467 |
+
|
468 |
+
# Process PDF
|
469 |
+
doc_processor = DocumentProcessor(self.config)
|
470 |
+
result = doc_processor.process_document(tmp_file.name)
|
471 |
+
|
472 |
+
# Cleanup
|
473 |
+
os.unlink(tmp_file.name)
|
474 |
+
|
475 |
+
metadata.update(
|
476 |
+
{
|
477 |
+
"document_type": "pdf_from_url",
|
478 |
+
"extraction_method": "document_processor",
|
479 |
+
}
|
480 |
+
)
|
481 |
+
metadata.update(result.get("metadata", {}))
|
482 |
+
|
483 |
+
return result.get("content", ""), metadata
|
484 |
+
|
485 |
+
except Exception as e:
|
486 |
+
self.logger.error(f"❌ PDF extraction failed: {str(e)}")
|
487 |
+
return "", metadata
|
488 |
+
|
489 |
+
def _extract_links(self, url: str, content: str) -> List[str]:
|
490 |
+
"""
|
491 |
+
Extract links from a web page.
|
492 |
+
|
493 |
+
Args:
|
494 |
+
url: The source URL
|
495 |
+
content: Page content (for context)
|
496 |
+
|
497 |
+
Returns:
|
498 |
+
List of discovered URLs
|
499 |
+
"""
|
500 |
+
self.logger.info(f" Extracting links from: {url}")
|
501 |
+
|
502 |
+
try:
|
503 |
+
response = self.session.get(url, timeout=self.timeout)
|
504 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
505 |
+
|
506 |
+
links = []
|
507 |
+
base_domain = urlparse(url).netloc
|
508 |
+
|
509 |
+
for a_tag in soup.find_all("a", href=True):
|
510 |
+
href = a_tag.get("href")
|
511 |
+
if not href:
|
512 |
+
continue
|
513 |
+
|
514 |
+
# Convert relative URLs to absolute
|
515 |
+
absolute_url = urljoin(url, href)
|
516 |
+
|
517 |
+
# Filter links
|
518 |
+
if self._should_follow_link(absolute_url, base_domain):
|
519 |
+
links.append(absolute_url)
|
520 |
+
|
521 |
+
# 🎯 Remove duplicates and limit
|
522 |
+
unique_links = list(dict.fromkeys(links)) # Preserve order
|
523 |
+
return unique_links[:20] # Limit to prevent explosion
|
524 |
+
|
525 |
+
except Exception as e:
|
526 |
+
self.logger.error(f"❌ Link extraction failed: {str(e)}")
|
527 |
+
return []
|
528 |
+
|
529 |
+
def _should_follow_link(self, url: str, base_domain: str) -> bool:
|
530 |
+
"""
|
531 |
+
Determine if a link should be followed.
|
532 |
+
|
533 |
+
Args:
|
534 |
+
url: URL to check
|
535 |
+
base_domain: Base domain of the source page
|
536 |
+
|
537 |
+
Returns:
|
538 |
+
True if link should be followed
|
539 |
+
"""
|
540 |
+
try:
|
541 |
+
parsed = urlparse(url)
|
542 |
+
|
543 |
+
# Skip non-HTTP(S) links
|
544 |
+
if parsed.scheme not in ["http", "https"]:
|
545 |
+
return False
|
546 |
+
|
547 |
+
# Skip already visited
|
548 |
+
if url in self.visited_urls:
|
549 |
+
return False
|
550 |
+
|
551 |
+
# Skip file downloads (basic check)
|
552 |
+
path = parsed.path.lower()
|
553 |
+
skip_extensions = [
|
554 |
+
".pdf",
|
555 |
+
".doc",
|
556 |
+
".docx",
|
557 |
+
".zip",
|
558 |
+
".exe",
|
559 |
+
".dmg",
|
560 |
+
".jpg",
|
561 |
+
".png",
|
562 |
+
".gif",
|
563 |
+
]
|
564 |
+
if any(path.endswith(ext) for ext in skip_extensions):
|
565 |
+
return False
|
566 |
+
|
567 |
+
# Skip fragments and query-heavy URLs
|
568 |
+
if parsed.fragment or len(parsed.query) > 100:
|
569 |
+
return False
|
570 |
+
|
571 |
+
# Prefer same domain (but allow subdomains)
|
572 |
+
link_domain = parsed.netloc
|
573 |
+
if not (
|
574 |
+
link_domain == base_domain or link_domain.endswith("." + base_domain)
|
575 |
+
):
|
576 |
+
return False
|
577 |
+
|
578 |
+
return True
|
579 |
+
|
580 |
+
except Exception:
|
581 |
+
return False
|
582 |
+
|
583 |
+
def reset(self):
|
584 |
+
"""Reset the processor state, clearing visited URLs and caches."""
|
585 |
+
self.visited_urls.clear()
|
586 |
+
self.robots_cache.clear()
|
587 |
+
self.last_request_time.clear()
|
588 |
+
self.logger.info("URL processor state reset")
|
589 |
+
|
590 |
+
def get_statistics(self) -> Dict[str, Any]:
|
591 |
+
"""
|
592 |
+
Get processing statistics.
|
593 |
+
|
594 |
+
Returns:
|
595 |
+
Dictionary with processing statistics
|
596 |
+
"""
|
597 |
+
return {
|
598 |
+
"urls_processed": len(self.visited_urls),
|
599 |
+
"domains_cached": len(self.robots_cache),
|
600 |
+
"rate_limited_domains": len(self.last_request_time),
|
601 |
+
"max_pages_limit": self.max_pages,
|
602 |
+
"max_depth_limit": self.max_depth,
|
603 |
+
}
|
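Taken together, `_can_fetch` and `_apply_rate_limit` implement the standard polite-crawling pattern: cache one parsed robots.txt per origin and space out requests per domain. A minimal self-contained sketch of the same pattern using only the standard library (the class and parameter names below are illustrative, not part of this repo):

import time
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser


class PoliteFetcher:
    """Illustrative sketch: robots.txt caching plus per-domain rate limiting."""

    def __init__(self, user_agent: str = "ExampleBot/1.0", delay: float = 1.0):
        self.user_agent = user_agent
        self.delay = delay
        self.robots_cache = {}        # base URL -> RobotFileParser
        self.last_request_time = {}   # domain -> last request timestamp

    def allowed(self, url: str) -> bool:
        parsed = urlparse(url)
        base = f"{parsed.scheme}://{parsed.netloc}"
        if base not in self.robots_cache:
            rp = RobotFileParser()
            rp.set_url(urljoin(base, "/robots.txt"))
            try:
                rp.read()
            except Exception:
                return True  # unreachable robots.txt: assume allowed, as above
            self.robots_cache[base] = rp
        return self.robots_cache[base].can_fetch(self.user_agent, url)

    def wait_turn(self, url: str) -> None:
        domain = urlparse(url).netloc
        elapsed = time.time() - self.last_request_time.get(domain, 0.0)
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request_time[domain] = time.time()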
src/integrations/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""
Integrations Module

This module contains integrations with external services and APIs
for enhanced RAG functionality.

Available Integrations:
- MCP Tavily: Live web search via Model Context Protocol
"""

from .mcp_tavily_integration import MCPTavilyIntegration, create_mcp_tavily_client

__all__ = ["MCPTavilyIntegration", "create_mcp_tavily_client"]
src/integrations/mcp_tavily_integration.py
ADDED
@@ -0,0 +1,308 @@
"""
MCP Tavily Integration Module

This module demonstrates how to integrate Tavily API via MCP (Model Context Protocol)
for live web search functionality in the RAG system.

Technology: MCP + Tavily API
"""

import logging
import time
from typing import Dict, List, Any, Optional
from datetime import datetime


class MCPTavilyIntegration:
    """
    Handles MCP integration with Tavily API for live web search.

    This class provides the bridge between the RAG system and Tavily's
    search capabilities through the Model Context Protocol.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize MCP Tavily integration.

        Args:
            config: Configuration dictionary
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # 🔧 MCP Configuration
        self.server_name = self.config.get("mcp_server_name", "tavily-mcp")
        self.tool_name = self.config.get("mcp_tool_name", "tavily-search")
        self.timeout = self.config.get("timeout", 30)

        self.logger.info("MCP Tavily Integration initialized")

    def search_web(
        self,
        query: str,
        max_results: int = 5,
        search_depth: str = "basic",
        time_range: str = "month",
        topic: str = "general",
    ) -> Dict[str, Any]:
        """
        Perform web search using Tavily API via MCP.

        Args:
            query: Search query
            max_results: Maximum number of results
            search_depth: Search depth (basic/advanced)
            time_range: Time range for results
            topic: Search topic category

        Returns:
            Dictionary with search results
        """
        try:
            self.logger.info(f"MCP Tavily search: '{query}' (depth: {search_depth})")

            # 🚀 Prepare MCP arguments
            mcp_arguments = {
                "query": query,
                "max_results": min(max_results, 20),  # Tavily limit
                "search_depth": search_depth,
                "topic": topic,
                "include_raw_content": True,
                "time_range": time_range,
            }

            # 🌐 This is where the actual MCP call would be made.
            # In a real implementation, this would use the MCP client:

            """
            Example MCP call structure:

            result = use_mcp_tool(
                server_name=self.server_name,
                tool_name=self.tool_name,
                arguments=mcp_arguments
            )
            """

            # 🚧 For demonstration, we'll simulate the MCP response structure
            simulated_result = self._simulate_tavily_response(query, max_results)

            # 🔄 Process and validate MCP response
            processed_result = self._process_mcp_response(simulated_result, query)

            self.logger.info(
                f"MCP search completed: {processed_result.get('total_results', 0)} results"
            )
            return processed_result

        except Exception as e:
            self.logger.error(f"MCP Tavily search failed: {str(e)}")
            return {
                "query": query,
                "results": [],
                "total_results": 0,
                "error": str(e),
                "status": "mcp_error",
            }

    def _simulate_tavily_response(self, query: str, max_results: int) -> Dict[str, Any]:
        """
        Simulate Tavily API response for demonstration.

        In production, this would be replaced by actual MCP call results.
        """
        # 🚧 Simulated response structure matching Tavily API
        return {
            "query": query,
            "follow_up_questions": None,
            "answer": f"Based on web search for '{query}'...",
            "images": [],
            "results": [
                {
                    "title": f"Example Result 1 for {query}",
                    "url": "https://example.com/result1",
                    "content": f"This is example content related to {query}. It provides comprehensive information about the topic.",
                    "raw_content": f"Raw content for {query} with additional details...",
                    "published_date": "2024-01-15",
                    "score": 0.95,
                },
                {
                    "title": f"Example Result 2 for {query}",
                    "url": "https://example.com/result2",
                    "content": f"Another relevant result for {query} with different perspective and insights.",
                    "raw_content": f"Extended raw content for {query}...",
                    "published_date": "2024-01-14",
                    "score": 0.87,
                },
            ][:max_results],
            "response_time": 1.2,
        }

    def _process_mcp_response(
        self, mcp_result: Dict[str, Any], original_query: str
    ) -> Dict[str, Any]:
        """
        Process and validate MCP response from Tavily.

        Args:
            mcp_result: Raw MCP response
            original_query: Original search query

        Returns:
            Processed search results
        """
        try:
            # 🔍 Extract results from MCP response
            raw_results = mcp_result.get("results", [])

            # 🔄 Process each result
            processed_results = []
            for i, result in enumerate(raw_results):
                processed_result = {
                    "title": result.get("title", f"Web Result {i+1}"),
                    "url": result.get("url", ""),
                    "content": result.get("content", ""),
                    "raw_content": result.get("raw_content", ""),
                    "score": result.get("score", 0.0),
                    "published_date": result.get("published_date", ""),
                    "rank": i + 1,
                    "source": "tavily_web_search",
                    "search_engine": "tavily",
                    "metadata": {
                        "title": result.get("title", ""),
                        "url": result.get("url", ""),
                        "content_length": len(result.get("content", "")),
                        "has_raw_content": bool(result.get("raw_content")),
                        "search_rank": i + 1,
                        "published_date": result.get("published_date", ""),
                    },
                }
                processed_results.append(processed_result)

            # 📊 Prepare final response
            return {
                "query": original_query,
                "results": processed_results,
                "total_results": len(processed_results),
                "answer": mcp_result.get("answer", ""),
                "follow_up_questions": mcp_result.get("follow_up_questions", []),
                "response_time": mcp_result.get("response_time", 0),
                "timestamp": datetime.now(),
                "status": "success",
                "source": "mcp_tavily",
            }

        except Exception as e:
            self.logger.error(f"Error processing MCP response: {str(e)}")
            return {
                "query": original_query,
                "results": [],
                "total_results": 0,
                "error": f"Response processing failed: {str(e)}",
                "status": "processing_error",
            }

    def test_connection(self) -> Dict[str, Any]:
        """
        Test MCP connection to Tavily.

        Returns:
            Connection test results
        """
        try:
            self.logger.info("Testing MCP Tavily connection...")

            # 🔍 Simple test query
            test_result = self.search_web(
                query="test connection", max_results=1, search_depth="basic"
            )

            if test_result.get("status") == "success":
                return {
                    "status": "success",
                    "message": "MCP Tavily connection successful",
                    "server_name": self.server_name,
                    "tool_name": self.tool_name,
                    "response_time": test_result.get("response_time", 0),
                }
            else:
                return {
                    "status": "error",
                    "message": "MCP Tavily connection failed",
                    "error": test_result.get("error", "Unknown error"),
                }

        except Exception as e:
            self.logger.error(f"MCP connection test failed: {str(e)}")
            return {
                "status": "error",
                "message": "MCP connection test failed",
                "error": str(e),
            }

    def get_server_info(self) -> Dict[str, Any]:
        """
        Get MCP server information.

        Returns:
            Server information dictionary
        """
        return {
            "server_name": self.server_name,
            "tool_name": self.tool_name,
            "timeout": self.timeout,
            "status": "configured",
            "description": "MCP integration for Tavily web search API",
        }


# 🔧 Helper function for easy integration
def create_mcp_tavily_client(
    config: Optional[Dict[str, Any]] = None,
) -> MCPTavilyIntegration:
    """
    Create and configure MCP Tavily client.

    Args:
        config: Optional configuration dictionary

    Returns:
        Configured MCPTavilyIntegration instance
    """
    return MCPTavilyIntegration(config)


# 📝 Example usage and integration guide
if __name__ == "__main__":
    """
    Example usage of MCP Tavily Integration

    This demonstrates how to use the MCP integration in your RAG system.
    """

    # 🔧 Configure MCP client
    config = {
        "mcp_server_name": "tavily-mcp",
        "mcp_tool_name": "tavily-search",
        "timeout": 30,
    }

    # 🚀 Create client
    mcp_client = create_mcp_tavily_client(config)

    # 🧪 Test connection
    connection_test = mcp_client.test_connection()
    print(f"Connection test: {connection_test}")

    # 🔍 Example search
    search_result = mcp_client.search_web(
        query="latest AI developments 2024",
        max_results=5,
        search_depth="basic",
        time_range="month",
    )

    print(f"Search results: {search_result.get('total_results', 0)} found")
    for result in search_result.get("results", []):
        print(f"- {result['title']}: {result['url']}")
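Because `search_web` currently routes through `_simulate_tavily_response`, the normalization step in `_process_mcp_response` can be exercised directly with any dict shaped like a Tavily payload. A hedged sketch (the payload values and the import path are made up for illustration):

from src.integrations import create_mcp_tavily_client  # assumed import path

client = create_mcp_tavily_client({"timeout": 10})
fake_payload = {
    "results": [
        {"title": "Example", "url": "https://example.com", "content": "...", "score": 0.9}
    ],
    "answer": "Example answer",
    "response_time": 0.4,
}
# _process_mcp_response is private; calling it here only to show the output shape
normalized = client._process_mcp_response(fake_payload, "example query")
assert normalized["status"] == "success"
assert normalized["total_results"] == 1 and normalized["results"][0]["rank"] == 1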
src/rag/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
RAG (Retrieval Augmented Generation) module.

This module contains components for processing queries and
generating responses using retrieved knowledge.
"""
src/rag/live_search.py
ADDED
@@ -0,0 +1,523 @@
"""
Live Search Processor using Tavily Python Client.
Provides real-time web search capabilities for the RAG system.
"""

import logging
import os
import time
from typing import Dict, List, Any, Optional
from datetime import datetime, timedelta

logger = logging.getLogger(__name__)


class LiveSearchProcessor:
    """Handles live web search using Tavily Python Client."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the LiveSearchProcessor.

        Args:
            config: Configuration dictionary containing live search settings
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Search configuration
        self.enabled = self.config.get("enabled", False)
        self.max_results = self.config.get("max_results", 5)
        self.search_depth = self.config.get("search_depth", "basic")
        self.include_answer = self.config.get("include_answer", True)
        self.include_raw_content = self.config.get("include_raw_content", False)
        self.include_images = self.config.get("include_images", False)
        self.topic = self.config.get("topic", "general")
        self.enable_caching = self.config.get("enable_caching", True)

        # Search cache and analytics
        self.search_cache = {}
        self.search_history = []

        # Initialize Tavily client
        self.tavily_client = None
        self._initialize_client()

        self.logger.info(f"LiveSearchProcessor initialized - Enabled: {self.enabled}")

    def _initialize_client(self):
        """Initialize the Tavily client."""
        try:
            # Get API key from environment variable
            api_key = os.getenv("TAVILY_API_KEY")

            if not api_key:
                self.logger.warning("TAVILY_API_KEY not found in environment variables")
                self.enabled = False
                return

            # Import and initialize Tavily client
            from tavily import TavilyClient

            self.tavily_client = TavilyClient(api_key=api_key)

            # ✅ Auto-enable if client initializes successfully and no explicit config
            if self.tavily_client and not self.config.get(
                "enabled_explicitly_set", False
            ):
                self.enabled = True
                self.logger.info(
                    "Tavily client initialized successfully - Auto-enabled live search"
                )
            else:
                self.logger.info("Tavily client initialized successfully")

        except ImportError:
            self.logger.error(
                "tavily-python package not installed. Install with: pip install tavily-python"
            )
            self.enabled = False
        except Exception as e:
            self.logger.error(f"Failed to initialize Tavily client: {str(e)}")
            self.enabled = False

    def is_enabled(self) -> bool:
        """Check if live search is enabled."""
        return self.enabled and self.tavily_client is not None

    def search_web(
        self,
        query: str,
        max_results: Optional[int] = None,
        search_depth: Optional[str] = None,
        time_range: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Perform live web search using Tavily API.

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            search_depth: Search depth ('basic' or 'advanced')
            time_range: Time range for search results

        Returns:
            Dictionary containing search results and metadata
        """
        if not query or not query.strip():
            return {
                "query": query,
                "results": [],
                "total_results": 0,
                "error": "Empty query provided",
                "source": "live_search",
            }

        if not self.is_enabled():
            self.logger.warning("Live search is disabled or client not initialized")
            return {
                "query": query,
                "results": [],
                "total_results": 0,
                "error": "Live search is disabled or Tavily client not initialized",
                "source": "live_search",
            }

        self.logger.info(f"Performing live search: {query[:100]}...")
        start_time = time.time()

        try:
            # Use provided parameters or defaults
            search_max_results = max_results or self.max_results
            search_depth_param = search_depth or self.search_depth

            # Check cache first
            cache_key = self._generate_cache_key(
                query, search_max_results, search_depth_param
            )
            if self.enable_caching and cache_key in self.search_cache:
                cached_result = self.search_cache[cache_key]
                if self._is_cache_valid(cached_result["timestamp"]):
                    self.logger.info("Returning cached search result")
                    cached_result["from_cache"] = True
                    return cached_result

            # Prepare search parameters
            search_params = {
                "query": query,
                "max_results": min(search_max_results, 20),  # Tavily limit
                "search_depth": search_depth_param,
                "include_answer": self.include_answer,
                "include_raw_content": self.include_raw_content,
                "include_images": self.include_images,
                "topic": self.topic,
            }

            # Add time_range if provided
            if time_range:
                search_params["time_range"] = time_range

            # Perform the search
            response = self.tavily_client.search(**search_params)

            # Process and format results
            processed_results = self._process_search_results(
                response.get("results", []), query
            )

            # Prepare final result
            result = {
                "query": query,
                "results": processed_results,
                "total_results": len(processed_results),
                "answer": response.get("answer"),
                "images": response.get("images", []),
                "follow_up_questions": response.get("follow_up_questions", []),
                "search_params": {
                    "max_results": search_max_results,
                    "search_depth": search_depth_param,
                    "time_range": time_range,
                },
                "processing_time": time.time() - start_time,
                "timestamp": datetime.now(),
                "source": "live_search",
                "from_cache": False,
                "search_metadata": {
                    "source": "tavily",
                    "timestamp": datetime.now().isoformat(),
                    "results_count": len(processed_results),
                    "search_depth": search_depth_param,
                    "max_results": search_max_results,
                    "response_time": response.get("response_time"),
                },
            }

            # Cache the result
            if self.enable_caching:
                self.search_cache[cache_key] = result.copy()

            # Add to search history
            self._add_to_history(query, len(processed_results))

            self.logger.info(
                f"Live search completed in {result['processing_time']:.2f}s"
            )
            return result

        except Exception as e:
            self.logger.error(f"Error in live search: {str(e)}")
            return {
                "query": query,
                "results": [],
                "total_results": 0,
                "error": str(e),
                "processing_time": time.time() - start_time,
                "source": "live_search",
            }

    def search(self, query: str, **kwargs) -> Dict[str, Any]:
        """
        Perform a live web search using Tavily API.

        Args:
            query: Search query string
            **kwargs: Additional search parameters

        Returns:
            Dictionary containing search results
        """
        return self.search_web(query, **kwargs)

    def _process_search_results(
        self, raw_results: List[Dict[str, Any]], query: str
    ) -> List[Dict[str, Any]]:
        """
        Process and format raw search results from Tavily.

        Args:
            raw_results: Raw results from Tavily API
            query: Original search query

        Returns:
            Processed and formatted results
        """
        processed_results = []
        query_words = set(query.lower().split())

        for i, result in enumerate(raw_results):
            try:
                # Extract key information
                title = result.get("title", "")
                url = result.get("url", "")
                content = result.get("content", "")
                raw_content = result.get("raw_content", "")
                score = result.get("score", 0.0)

                # Calculate relevance score (pass the raw query string so the
                # scorer can do ordered phrase matching; query_words is an
                # unordered set and cannot reconstruct the original phrase)
                relevance_score = self._calculate_relevance_score(
                    title, content, query_words, score, query.lower().strip()
                )

                # Format result
                formatted_result = {
                    "title": title,
                    "url": url,
                    "content": content[:500] + "..." if len(content) > 500 else content,
                    "raw_content": raw_content if self.include_raw_content else "",
                    "score": score,
                    "relevance_score": relevance_score,
                    "rank": i + 1,
                    "source": "web_search",
                    "search_engine": "tavily",
                    "published_date": result.get("published_date"),
                    "metadata": {
                        "title": title,
                        "url": url,
                        "content_length": len(content),
                        "has_raw_content": bool(raw_content),
                        "search_rank": i + 1,
                    },
                }

                processed_results.append(formatted_result)

            except Exception as e:
                self.logger.warning(f"Error processing search result {i}: {str(e)}")
                continue

        # Sort by relevance score
        processed_results.sort(key=lambda x: x["relevance_score"], reverse=True)

        return processed_results

    def _calculate_relevance_score(
        self,
        title: str,
        content: str,
        query_words: set,
        base_score: float,
        query_phrase: str,
    ) -> float:
        """
        Calculate relevance score for search results.

        Args:
            title: Result title
            content: Result content
            query_words: Set of query words
            base_score: Base score from search engine
            query_phrase: Original (lowercased) query string for phrase matching

        Returns:
            Calculated relevance score
        """
        try:
            # Start with base score
            relevance = base_score

            # Title relevance (higher weight)
            title_words = set(title.lower().split())
            title_overlap = len(query_words.intersection(title_words))
            title_boost = (title_overlap / max(len(query_words), 1)) * 0.3

            # Content relevance
            content_words = set(content.lower().split())
            content_overlap = len(query_words.intersection(content_words))
            content_boost = (content_overlap / max(len(query_words), 1)) * 0.2

            # Exact phrase matching bonus (uses the original query string;
            # joining the unordered query_words set would give a
            # nondeterministic phrase)
            if query_phrase and query_phrase in title.lower():
                relevance += 0.2
            elif query_phrase and query_phrase in content.lower():
                relevance += 0.1

            # Final score calculation
            final_score = min(relevance + title_boost + content_boost, 1.0)

            return round(final_score, 3)

        except Exception as e:
            self.logger.warning(f"Error calculating relevance score: {str(e)}")
            return base_score

    def get_search_context(self, query: str, **kwargs) -> str:
        """
        Get search context suitable for RAG applications.

        Args:
            query: Search query string
            **kwargs: Additional search parameters

        Returns:
            Formatted context string
        """
        search_results = self.search(query, **kwargs)

        if not search_results.get("results"):
            error_msg = search_results.get("error", "Unknown error")
            return f"No live search results found for: {query}. Error: {error_msg}"

        context_parts = []

        # Add answer if available
        if search_results.get("answer"):
            context_parts.append(f"Answer: {search_results['answer']}")
            context_parts.append("")

        # Add search results
        context_parts.append("Search Results:")
        for i, result in enumerate(search_results["results"], 1):
            context_parts.append(f"{i}. {result['title']}")
            context_parts.append(f" URL: {result['url']}")
            context_parts.append(f" Content: {result['content']}")
            if result.get("published_date"):
                context_parts.append(f" Published: {result['published_date']}")
            context_parts.append("")

        # Add metadata
        metadata = search_results.get("search_metadata", {})
        context_parts.append(
            f"Search performed at: {metadata.get('timestamp', 'Unknown')}"
        )
        context_parts.append(f"Source: {metadata.get('source', 'Unknown')}")
        context_parts.append(f"Results count: {metadata.get('results_count', 0)}")

        return "\n".join(context_parts)

    def qna_search(self, query: str, **kwargs) -> str:
        """
        Get a quick answer to a question using Tavily's QnA search.

        Args:
            query: Question to answer
            **kwargs: Additional search parameters

        Returns:
            Answer string
        """
        if not self.is_enabled():
            return "Live search is disabled or not properly configured."

        try:
            # Use Tavily's QnA search method
            answer = self.tavily_client.qna_search(query=query)
            return answer if answer else "No answer found for the given question."

        except Exception as e:
            self.logger.error(f"Error in QnA search: {str(e)}")
            return f"Error getting answer: {str(e)}"

    def _generate_cache_key(
        self, query: str, max_results: int, search_depth: str
    ) -> str:
        """Generate cache key for search results."""
        import hashlib

        cache_string = f"{query.lower().strip()}{max_results}{search_depth}"
        return hashlib.md5(cache_string.encode()).hexdigest()

    def _is_cache_valid(self, timestamp: datetime) -> bool:
        """Check if cached result is still valid (30 minutes for live search)."""
        return datetime.now() - timestamp < timedelta(minutes=30)

    def _add_to_history(self, query: str, result_count: int):
        """Add search to history for analytics."""
        self.search_history.append(
            {
                "query": query,
                "timestamp": datetime.now(),
                "result_count": result_count,
                "search_type": "live_web",
            }
        )

        # Keep only last 50 searches
        if len(self.search_history) > 50:
            self.search_history = self.search_history[-50:]

    def health_check(self) -> Dict[str, Any]:
        """
        Perform a health check of the live search service.

        Returns:
            Dictionary containing health status
        """
        try:
            if not self.enabled:
                return {
                    "status": "disabled",
                    "message": "Live search is disabled in configuration",
                    "timestamp": datetime.now().isoformat(),
                }

            if not self.tavily_client:
                return {
                    "status": "error",
                    "message": "Tavily client not initialized. Check TAVILY_API_KEY environment variable.",
                    "timestamp": datetime.now().isoformat(),
                }

            # Perform a simple test search
            test_result = self.search("test health check", max_results=1)

            if test_result.get("error"):
                return {
                    "status": "error",
                    "message": f"Health check failed: {test_result['error']}",
                    "timestamp": datetime.now().isoformat(),
                }

            return {
                "status": "healthy",
                "message": "Live search service is operational",
                "timestamp": datetime.now().isoformat(),
                "config": {
                    "max_results": self.max_results,
                    "search_depth": self.search_depth,
                    "include_answer": self.include_answer,
                    "topic": self.topic,
                },
            }

        except Exception as e:
            self.logger.error(f"Health check failed: {str(e)}")
            return {
                "status": "error",
                "message": f"Health check failed: {str(e)}",
                "timestamp": datetime.now().isoformat(),
            }

    def get_search_analytics(self) -> Dict[str, Any]:
        """
        Get analytics about search patterns.

        Returns:
            Dictionary with search analytics
        """
        if not self.search_history:
            return {"total_searches": 0, "cache_hit_rate": 0.0, "average_results": 0.0}

        total_searches = len(self.search_history)
        avg_results = (
            sum(s["result_count"] for s in self.search_history) / total_searches
        )

        # Recent search trends
        recent_searches = [s["query"] for s in self.search_history[-10:]]

        return {
            "total_searches": total_searches,
            "average_results_per_search": round(avg_results, 2),
            "recent_searches": recent_searches,
            "cache_size": len(self.search_cache),
            "search_type": "live_web",
        }

    def clear_cache(self):
        """Clear the search cache."""
        self.search_cache.clear()
        self.logger.info("Live search cache cleared")

    def clear_history(self):
        """Clear the search history."""
        self.search_history.clear()
        self.logger.info("Live search history cleared")


# 🔄 Compatibility alias for existing imports
LiveSearchManager = LiveSearchProcessor
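A minimal usage sketch for this processor, assuming TAVILY_API_KEY is set in the environment and that the module is importable as src.rag.live_search (both are assumptions, not verified here):

from src.rag.live_search import LiveSearchProcessor

searcher = LiveSearchProcessor({"max_results": 3, "search_depth": "basic"})

if searcher.is_enabled():
    result = searcher.search_web("retrieval augmented generation", time_range="week")
    print(f"{result['total_results']} results in {result['processing_time']:.2f}s")
    # Formatted context string, ready to drop into an LLM prompt
    print(searcher.get_search_context("retrieval augmented generation")[:500])
else:
    # Structured status dict explains what is missing (key, package, config)
    print(searcher.health_check()["message"])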
src/rag/optimized_query_processor.py
ADDED
@@ -0,0 +1,275 @@
"""
Optimized Query Processor with Rate Limiting and Better Error Handling
"""

import logging
import time
from typing import Dict, List, Any, Optional
from datetime import datetime, timedelta


class OptimizedQueryProcessor:
    """
    Optimized QueryProcessor with rate limiting and better error handling.
    """

    def __init__(
        self, embedding_generator, vector_db, config: Optional[Dict[str, Any]] = None
    ):
        self.embedding_generator = embedding_generator
        self.vector_db = vector_db
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Optimized configuration settings
        self.top_k = self.config.get("top_k", 10)  # Increased from 5
        self.similarity_threshold = self.config.get(
            "similarity_threshold", 0.4
        )  # Lowered from 0.7
        self.max_context_length = self.config.get(
            "max_context_length", 8000
        )  # Increased
        self.enable_caching = self.config.get("enable_caching", True)
        self.cache_ttl = self.config.get("cache_ttl", 7200)  # 2 hours

        # Rate limiting settings
        self.last_api_call = 0
        self.min_api_interval = 1.0  # Minimum 1 second between API calls
        self.max_retries = 3
        self.retry_delay = 2.0

        # Query cache and history
        self.query_cache = {}
        self.query_history = []

        self.logger.info("OptimizedQueryProcessor initialized")

    def process_query(
        self, query: str, filter: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Process query with optimized rate limiting and error handling.
        """
        if not query or not query.strip():
            return {
                "query": query,
                "context": [],
                "total_results": 0,
                "error": "Empty query provided",
            }

        self.logger.info(f"Processing query: {query[:100]}...")
        start_time = time.time()

        try:
            # Check cache first
            cache_key = self._generate_cache_key(query, filter)
            if self.enable_caching and cache_key in self.query_cache:
                cached_result = self.query_cache[cache_key]
                if self._is_cache_valid(cached_result["timestamp"]):
                    self.logger.info("Returning cached result")
                    cached_result["from_cache"] = True
                    return cached_result

            # Rate limiting protection
            self._enforce_rate_limit()

            # Generate query embedding with retry logic
            query_embedding = self._generate_embedding_with_retry(query)

            if not query_embedding:
                return {
                    "query": query,
                    "context": [],
                    "total_results": 0,
                    "error": "Failed to generate query embedding",
                }

            # Search for similar vectors with increased top_k
            search_results = self.vector_db.search(
                query_embedding=query_embedding,
                top_k=self.top_k * 2,  # Get more results for better filtering
                filter=filter,
                include_metadata=True,
            )

            if not search_results:
                self.logger.warning("No search results returned from vector database")
                return {
                    "query": query,
                    "context": [],
                    "total_results": 0,
                    "error": "No similar documents found",
                }

            # Apply optimized filtering
            filtered_results = self._apply_smart_filtering(search_results, query)

            # Extract and format context with better error handling
            context = self._extract_context_safely(filtered_results)

            # Prepare result
            result = {
                "query": query,
                "context": context,
                "total_results": len(filtered_results),
                "processing_time": time.time() - start_time,
                "timestamp": datetime.now(),
                "from_cache": False,
                "similarity_scores": [r.get("score", 0) for r in filtered_results[:5]],
            }

            # Cache the result
            if self.enable_caching:
                self.query_cache[cache_key] = result.copy()

            self.logger.info(
                f"Query processed in {result['processing_time']:.2f}s, {len(context)} context items"
            )
            return result

        except Exception as e:
            self.logger.error(f"Error processing query: {str(e)}")
            return {
                "query": query,
                "context": [],
                "total_results": 0,
                "error": str(e),
                "processing_time": time.time() - start_time,
            }

    def _enforce_rate_limit(self):
        """Enforce rate limiting between API calls."""
        current_time = time.time()
        time_since_last_call = current_time - self.last_api_call

        if time_since_last_call < self.min_api_interval:
            sleep_time = self.min_api_interval - time_since_last_call
            self.logger.info(f"Rate limiting: sleeping {sleep_time:.1f}s")
            time.sleep(sleep_time)

        self.last_api_call = time.time()

    def _generate_embedding_with_retry(self, query: str) -> List[float]:
        """Generate embedding with retry logic and rate limiting."""
        for attempt in range(self.max_retries):
            try:
                self._enforce_rate_limit()
                embedding = self.embedding_generator.generate_query_embedding(query)

                if embedding:
                    return embedding
                else:
                    self.logger.warning(
                        f"Attempt {attempt + 1}: Empty embedding returned"
                    )

            except Exception as e:
                self.logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")

                if "429" in str(e) or "quota" in str(e).lower():
                    # Rate limit hit - wait longer (exponential backoff)
                    wait_time = self.retry_delay * (2**attempt)
                    self.logger.info(f"Rate limit hit, waiting {wait_time}s...")
                    time.sleep(wait_time)
                elif attempt < self.max_retries - 1:
                    time.sleep(self.retry_delay)

        self.logger.error("All embedding generation attempts failed")
        return []

    def _apply_smart_filtering(
        self, search_results: List[Dict[str, Any]], query: str
    ) -> List[Dict[str, Any]]:
        """Apply smart filtering with an adaptive threshold."""
        if not search_results:
            return []

        # Get score statistics
        scores = [r.get("score", 0) for r in search_results]
        max_score = max(scores)
        avg_score = sum(scores) / len(scores)

        # Adaptive threshold: use lower threshold if max score is low
        adaptive_threshold = min(self.similarity_threshold, max_score * 0.8)

        self.logger.info(
            f"Score stats - Max: {max_score:.3f}, Avg: {avg_score:.3f}, Threshold: {adaptive_threshold:.3f}"
        )

        # Filter results
        filtered = [
            result
            for result in search_results[: self.top_k]
            if result.get("score", 0) >= adaptive_threshold
        ]

        # If no results pass the threshold, return the top 3 anyway
        if not filtered and search_results:
            self.logger.warning(
                f"No results above threshold {adaptive_threshold:.3f}, returning top 3"
            )
            filtered = search_results[:3]

        return filtered

    def _extract_context_safely(
        self, search_results: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Extract context with better error handling."""
        context = []
        total_length = 0

        for i, result in enumerate(search_results):
            try:
                # Multiple ways to extract text content
                text = ""
                metadata = result.get("metadata", {})

                # Try different text fields
                for field in ["text", "content", "content_preview", "description"]:
                    if field in metadata and metadata[field]:
                        text = str(metadata[field])
                        break

                if not text:
                    self.logger.warning(f"No text content found in result {i}")
                    continue

                # Check length limit
                if total_length + len(text) > self.max_context_length and context:
                    break

                # Create context item
                context_item = {
                    "text": text,
                    "score": result.get("score", 0),
                    "source": metadata.get("source", f"Document {i+1}"),
                    "chunk_id": result.get("id", ""),
                    "metadata": metadata,
                    "relevance_rank": len(context) + 1,
                }

                context.append(context_item)
                total_length += len(text)

            except Exception as e:
                self.logger.warning(f"Error extracting context from result {i}: {e}")
                continue

        self.logger.info(
            f"Extracted {len(context)} context items (total length: {total_length})"
        )
        return context

    def _generate_cache_key(self, query: str, filter: Optional[Dict[str, Any]]) -> str:
        """Generate cache key for a query."""
        import hashlib

        filter_str = str(sorted(filter.items())) if filter else ""
        cache_string = f"{query.lower().strip()}{filter_str}"
        return hashlib.md5(cache_string.encode()).hexdigest()

    def _is_cache_valid(self, timestamp: datetime) -> bool:
        """Check if a cached result is still valid."""
        return datetime.now() - timestamp < timedelta(seconds=self.cache_ttl)
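The processor above only needs two duck-typed collaborators: an embedder exposing generate_query_embedding(query) and a vector store exposing search(query_embedding=..., top_k=..., filter=..., include_metadata=...) that returns dicts with "id", "score", and "metadata" keys. A stub-based sketch of that contract (the stubs and values are illustrative, not the project's real EmbeddingGenerator or VectorDB):

class StubEmbedder:
    def generate_query_embedding(self, query):
        return [0.1, 0.2, 0.3]  # fixed toy vector, no API call

class StubVectorDB:
    def search(self, query_embedding, top_k, filter=None, include_metadata=True):
        return [
            {
                "id": "chunk-1",
                "score": 0.82,
                "metadata": {"text": "Example chunk text.", "source": "doc.pdf"},
            }
        ]

processor = OptimizedQueryProcessor(
    StubEmbedder(), StubVectorDB(), {"similarity_threshold": 0.4}
)
out = processor.process_query("what does the example document say?")
print(out["total_results"], out["context"][0]["source"])  # -> 1 doc.pdf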
src/rag/query_processor.py
ADDED
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Query Processor Module

This module is responsible for processing user queries and converting
them to vector embeddings for retrieval.

Technologies: Gemini Embedding v3, LangChain, Pinecone
"""

import logging
import time
from typing import Dict, List, Any, Optional
from datetime import datetime, timedelta


class QueryProcessor:
    """
    Processes user queries and converts them to vector embeddings.

    Features:
    - Query preprocessing and normalization
    - Query embedding generation
    - Context retrieval from vector database
    - Query expansion and caching
    - Metadata filtering and ranking
    """

    def __init__(
        self, embedding_generator, vector_db, config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the QueryProcessor with dependencies.

        Args:
            embedding_generator: Instance of EmbeddingGenerator
            vector_db: Instance of VectorDB
            config: Configuration dictionary with processing parameters
        """
        self.embedding_generator = embedding_generator
        self.vector_db = vector_db
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Configuration settings
        self.top_k = self.config.get("top_k", 5)
        self.similarity_threshold = self.config.get("similarity_threshold", 0.7)
        self.max_context_length = self.config.get("max_context_length", 4000)
        self.enable_caching = self.config.get("enable_caching", True)
        self.cache_ttl = self.config.get("cache_ttl", 3600)  # 1 hour

        # Query cache and history
        self.query_cache = {}
        self.query_history = []

        self.logger.info("QueryProcessor initialized with advanced features")

    def process_query(
        self, query: str, filter: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Process a user query and retrieve relevant context.

        Args:
            query: User query string
            filter: Optional metadata filter for search

        Returns:
            Dictionary containing query, retrieved context, and metadata
        """
        if not query or not query.strip():
            return {
                "query": query,
                "context": [],
                "total_results": 0,
                "error": "Empty query provided",
            }

        self.logger.info(f"Processing query: {query[:100]}...")
        start_time = time.time()

        try:
            # Check cache first
            cache_key = self._generate_cache_key(query, filter)
            if self.enable_caching and cache_key in self.query_cache:
                cached_result = self.query_cache[cache_key]
                if self._is_cache_valid(cached_result["timestamp"]):
                    self.logger.info("Returning cached result")
                    cached_result["from_cache"] = True
                    return cached_result

            # Preprocess the query
            processed_query = self._preprocess_query(query)
            expanded_queries = self._expand_query(processed_query)

            # Generate embeddings for all query variations
            all_results = []
            for q in expanded_queries:
                query_embedding = self.embedding_generator.generate_query_embedding(q)

                if query_embedding:
                    # Search for similar vectors
                    search_results = self.vector_db.search(
                        query_embedding=query_embedding,
                        top_k=self.top_k * 2,  # Get more results for better filtering
                        filter=filter,
                    )
                    all_results.extend(search_results)

            # Deduplicate and rank results
            unique_results = self._deduplicate_results(all_results)
            ranked_results = self._rank_results(unique_results, query)

            # Filter results by similarity threshold
            filtered_results = [
                result
                for result in ranked_results[: self.top_k]
                if result.get("score", 0) >= self.similarity_threshold
            ]

            # Extract and format context
            context = self._extract_context(filtered_results)

            # Prepare result
            result = {
                "query": query,
                "processed_query": processed_query,
                "expanded_queries": expanded_queries,
                "context": context,
                "total_results": len(filtered_results),
                "processing_time": time.time() - start_time,
                "timestamp": datetime.now(),
                "from_cache": False,
            }

            # Cache the result
            if self.enable_caching:
                self.query_cache[cache_key] = result.copy()

            # Add to query history
            self._add_to_history(query, len(filtered_results))

            self.logger.info(f"Query processed in {result['processing_time']:.2f}s")
            return result

        except Exception as e:
            self.logger.error(f"❌ Error processing query: {str(e)}")
            return {
                "query": query,
                "context": [],
                "total_results": 0,
                "error": str(e),
                "processing_time": time.time() - start_time,
            }

    def _preprocess_query(self, query: str) -> str:
        """
        Preprocess the query for better embedding generation.

        Args:
            query: Raw query string

        Returns:
            Preprocessed query string
        """
        # Remove extra whitespace
        query = " ".join(query.split())

        # Remove special characters that might interfere
        import re

        query = re.sub(r"[^\w\s\-\?\!]", " ", query)

        # Normalize question words
        question_words = {
            "whats": "what is",
            "hows": "how is",
            "wheres": "where is",
            "whos": "who is",
            "whens": "when is",
        }

        for abbrev, full in question_words.items():
            query = query.replace(abbrev, full)

        return query.strip()

    def _expand_query(self, query: str) -> List[str]:
        """
        Expand the query with variations for better retrieval.

        Args:
            query: Preprocessed query

        Returns:
            List of query variations
        """
        expanded = [query]

        # Add question variations
        if not any(
            q in query.lower() for q in ["what", "how", "why", "when", "where", "who"]
        ):
            expanded.append(f"what is {query}")
            expanded.append(f"how does {query} work")

        # Add definition variation
        if "definition" not in query.lower() and "define" not in query.lower():
            expanded.append(f"{query} definition")

        # Add example variation
        if "example" not in query.lower():
            expanded.append(f"{query} examples")

        return expanded[:3]  # Limit to 3 variations

    def _deduplicate_results(
        self, results: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Remove duplicate results based on content similarity.

        Args:
            results: List of search results

        Returns:
            Deduplicated results
        """
        seen_ids = set()
        unique_results = []

        for result in results:
            result_id = result.get("id")
            if result_id and result_id not in seen_ids:
                seen_ids.add(result_id)
                unique_results.append(result)

        return unique_results

    def _rank_results(
        self, results: List[Dict[str, Any]], query: str
    ) -> List[Dict[str, Any]]:
        """
        Rank results based on multiple factors.

        Args:
            results: List of search results
            query: Original query

        Returns:
            Ranked results
        """
        query_words = set(query.lower().split())

        for result in results:
            # Base score from similarity
            base_score = result.get("score", 0.0)

            # Boost score based on text relevance
            text = result.get("metadata", {}).get("text", "").lower()
            text_words = set(text.split())
            word_overlap = len(query_words.intersection(text_words))
            relevance_boost = word_overlap / max(len(query_words), 1) * 0.1

            # Boost score based on source type
            source = result.get("metadata", {}).get("source", "")
            source_boost = 0.0
            if source.endswith(".pdf"):
                source_boost = 0.05  # PDFs often contain structured info
            elif "http" in source:
                source_boost = 0.02  # Web content

            # Calculate final score
            final_score = base_score + relevance_boost + source_boost
            result["final_score"] = min(final_score, 1.0)

        # Sort by final score
        return sorted(results, key=lambda x: x.get("final_score", 0), reverse=True)

    def _extract_context(
        self, search_results: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Extract and format context from search results.

        Args:
            search_results: List of search results from vector database

        Returns:
            List of formatted context items
        """
        context = []
        total_length = 0

        for result in search_results:
            # Extract text content from metadata
            text = result.get("metadata", {}).get("text", "")

            # Check if adding this context would exceed the limit
            if total_length + len(text) > self.max_context_length and context:
                break

            # Format context item with enhanced metadata
            context_item = {
                "text": text,
                "score": result.get("score", 0),
                "final_score": result.get("final_score", result.get("score", 0)),
                "source": result.get("metadata", {}).get("source", "unknown"),
                "chunk_id": result.get("id", ""),
                "metadata": result.get("metadata", {}),
                "relevance_rank": len(context) + 1,
            }

            context.append(context_item)
            total_length += len(text)

        self.logger.info(
            f"Extracted {len(context)} context items (total length: {total_length})"
        )
        return context

    def _generate_cache_key(self, query: str, filter: Optional[Dict[str, Any]]) -> str:
        """Generate a cache key for the query."""
        import hashlib

        filter_str = str(sorted(filter.items())) if filter else ""
        cache_string = f"{query.lower().strip()}{filter_str}"
        return hashlib.md5(cache_string.encode()).hexdigest()

    def _is_cache_valid(self, timestamp: datetime) -> bool:
        """Check if cached result is still valid."""
        return datetime.now() - timestamp < timedelta(seconds=self.cache_ttl)

    def _add_to_history(self, query: str, result_count: int):
        """Add query to history for analytics."""
        self.query_history.append(
            {
                "query": query,
                "timestamp": datetime.now(),
                "result_count": result_count,
            }
        )

        # Keep only last 100 queries
        if len(self.query_history) > 100:
            self.query_history = self.query_history[-100:]

    def get_query_suggestions(self, partial_query: str) -> List[str]:
        """
        Generate query suggestions based on partial input and history.

        Args:
            partial_query: Partial query string

        Returns:
            List of suggested queries
        """
        suggestions = []

        # Add suggestions from query history
        for hist_item in reversed(self.query_history[-20:]):  # Last 20 queries
            hist_query = hist_item["query"]
            if (
                partial_query.lower() in hist_query.lower()
                and hist_query not in suggestions
            ):
                suggestions.append(hist_query)

        # Add template-based suggestions
        if len(suggestions) < 3:
            templates = [
                f"What is {partial_query}?",
                f"How does {partial_query} work?",
                f"Examples of {partial_query}",
                f"{partial_query} definition",
                f"{partial_query} best practices",
            ]

            for template in templates:
                if template not in suggestions:
                    suggestions.append(template)
                    if len(suggestions) >= 5:
                        break

        return suggestions[:5]

    def get_query_analytics(self) -> Dict[str, Any]:
        """
        Get analytics about query patterns.

        Returns:
            Dictionary with query analytics
        """
        if not self.query_history:
            return {"total_queries": 0, "cache_hit_rate": 0.0}

        total_queries = len(self.query_history)
        recent_queries = [q["query"] for q in self.query_history[-10:]]

        # Calculate average results per query
        avg_results = sum(q["result_count"] for q in self.query_history) / total_queries

        # Most common query patterns
        query_words = []
        for q in self.query_history:
            query_words.extend(q["query"].lower().split())

        from collections import Counter

        common_words = Counter(query_words).most_common(5)

        return {
            "total_queries": total_queries,
            "average_results_per_query": round(avg_results, 2),
            "recent_queries": recent_queries,
            "common_query_words": common_words,
            "cache_size": len(self.query_cache),
        }

    def clear_cache(self):
        """Clear the query cache."""
        self.query_cache.clear()
        self.logger.info("Query cache cleared")

    def clear_history(self):
        """Clear the query history."""
        self.query_history.clear()
        self.logger.info("Query history cleared")
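
A minimal usage sketch of QueryProcessor follows. The stub classes are hypothetical stand-ins for the repo's real EmbeddingGenerator and VectorDB (in src/embedding and src/storage), and the printed values assume the default similarity_threshold of 0.7:

# Hypothetical stubs; the real dependencies live in src/embedding and src/storage.
class StubEmbedder:
    def generate_query_embedding(self, q):
        return [0.1] * 8  # fixed-size dummy vector

class StubVectorDB:
    def search(self, query_embedding, top_k, filter=None):
        return [{
            "id": "c1",
            "score": 0.9,
            "metadata": {"text": "Pinecone stores vectors.", "source": "notes.pdf"},
        }]

processor = QueryProcessor(StubEmbedder(), StubVectorDB(), {"top_k": 3})
result = processor.process_query("whats Pinecone")
print(result["processed_query"])  # "what is Pinecone" (question word normalized)
print(result["total_results"])    # 1 - the stub chunk clears the 0.7 threshold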
src/rag/query_router.py
ADDED
@@ -0,0 +1,587 @@
"""
Query Router Module

This module intelligently routes queries between local document search
and live web search based on query analysis and user preferences.

Technology: Custom routing logic with RAG + Live Search integration
"""

import logging
import time
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
from enum import Enum


class QueryType(Enum):
    """Enumeration of different query types for routing decisions."""

    FACTUAL = "factual"  # 📊 Current facts, news, data
    CONCEPTUAL = "conceptual"  # 💡 Definitions, explanations
    PROCEDURAL = "procedural"  # 🔧 How-to, instructions
    ANALYTICAL = "analytical"  # 📈 Analysis, comparisons
    TEMPORAL = "temporal"  # ⏰ Time-sensitive information
    HYBRID = "hybrid"  # 🔄 Requires both sources


class QueryRouter:
    """
    Intelligent query router that decides between local docs and live search.

    Features:
    - Query type classification
    - Intelligent routing decisions
    - Hybrid search coordination
    - Result fusion and ranking
    - Performance optimization
    """

    def __init__(
        self,
        local_query_processor,
        live_search_processor,
        config: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the QueryRouter.

        Args:
            local_query_processor: Local document query processor
            live_search_processor: Live web search processor
            config: Configuration dictionary
        """
        self.local_processor = local_query_processor
        self.live_processor = live_search_processor
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # 🎯 Routing configuration
        self.enable_hybrid_search = self.config.get("enable_hybrid_search", True)
        self.local_weight = self.config.get("local_weight", 0.6)
        self.live_weight = self.config.get("live_weight", 0.4)
        self.confidence_threshold = self.config.get("confidence_threshold", 0.5)
        self.max_hybrid_results = self.config.get("max_hybrid_results", 10)

        # 📊 Analytics and caching
        self.routing_history = []
        self.routing_cache = {}

        # 🔍 Query classification patterns
        self._init_classification_patterns()

        self.logger.info("QueryRouter initialized with intelligent routing")

    def _init_classification_patterns(self):
        """Initialize patterns for query classification."""
        self.temporal_keywords = {
            "current", "latest", "recent", "today", "now", "2025",
            "breaking", "news", "update", "trending", "happening",
        }

        self.factual_keywords = {
            "what is", "who is", "when did", "where is", "statistics",
            "data", "facts", "numbers", "rate", "percentage",
        }

        self.procedural_keywords = {
            "how to", "steps", "guide", "tutorial", "instructions",
            "process", "method", "way to", "procedure",
        }

        self.conceptual_keywords = {
            "explain", "definition", "meaning", "concept", "theory",
            "principle", "idea", "understand", "clarify",
        }

    def route_query(
        self,
        query: str,
        use_live_search: bool = False,
        max_results: int = 5,
        search_options: Optional[Dict[str, Any]] = None,
        search_mode: str = "auto",
    ) -> Dict[str, Any]:
        """
        Route query to appropriate search method(s) with enhanced control.

        Args:
            query: User query string
            use_live_search: Enable live search (will use hybrid approach)
            max_results: Maximum results to return
            search_options: Additional search options
            search_mode: Search mode - "auto", "local_only", "live_only", "hybrid"

        Returns:
            Dictionary with routed results and metadata
        """
        if not query or not query.strip():
            return {
                "query": query,
                "results": [],
                "routing_decision": "error",
                "error": "Empty query provided",
            }

        self.logger.info(f"Routing query: {query[:100]}...")
        start_time = time.time()

        try:
            # 🎯 Classify query type
            query_type = self._classify_query(query)

            # 🔄 Make routing decision with enhanced logic
            routing_decision = self._make_enhanced_routing_decision(
                query, query_type, use_live_search, search_mode
            )

            # 🚀 Execute search based on routing decision
            if routing_decision == "local_only":
                result = self._search_local_only(query, max_results)
            elif routing_decision == "live_only":
                result = self._search_live_only(query, max_results, search_options)
            elif routing_decision == "hybrid":
                result = self._search_hybrid(query, max_results, search_options)
            else:
                result = self._search_fallback(query, max_results)

            # 📊 Add routing metadata
            result.update(
                {
                    "query_type": query_type.value,
                    "routing_decision": routing_decision,
                    "processing_time": time.time() - start_time,
                    "timestamp": datetime.now(),
                }
            )

            # 📈 Track routing decision
            self._track_routing_decision(query, query_type, routing_decision)

            self.logger.info(
                f"Query routed via {routing_decision} in {result['processing_time']:.2f}s"
            )
            return result

        except Exception as e:
            self.logger.error(f"Error in query routing: {str(e)}")
            return {
                "query": query,
                "results": [],
                "routing_decision": "error",
                "error": str(e),
                "processing_time": time.time() - start_time,
            }

    def _classify_query(self, query: str) -> QueryType:
        """
        Classify query type for routing decisions.

        Args:
            query: Query string to classify

        Returns:
            QueryType enum value
        """
        query_lower = query.lower()

        # 🔍 Check for temporal indicators
        if any(keyword in query_lower for keyword in self.temporal_keywords):
            return QueryType.TEMPORAL

        # 📊 Check for factual queries
        if any(keyword in query_lower for keyword in self.factual_keywords):
            return QueryType.FACTUAL

        # 🔧 Check for procedural queries
        if any(keyword in query_lower for keyword in self.procedural_keywords):
            return QueryType.PROCEDURAL

        # 💡 Check for conceptual queries
        if any(keyword in query_lower for keyword in self.conceptual_keywords):
            return QueryType.CONCEPTUAL

        # 📈 Default to analytical for complex queries
        if len(query.split()) > 10:
            return QueryType.ANALYTICAL

        # 🔄 Default to hybrid for uncertain cases
        return QueryType.HYBRID

    def _make_routing_decision(
        self, query: str, query_type: QueryType, force_live: bool
    ) -> str:
        """
        Make intelligent routing decision based on query analysis.

        Args:
            query: Query string
            query_type: Classified query type
            force_live: Whether to enable live search (not force only live)

        Returns:
            Routing decision string
        """
        # 🔄 Smart hybrid approach when live search is enabled
        if force_live:
            # ✨ Instead of live_only, use hybrid to combine both sources
            if query_type == QueryType.TEMPORAL:
                return "hybrid"  # ⏰ Time-sensitive + stored context
            else:
                return "hybrid"  # 🎯 Always combine live + stored data

        # 🎯 Route based on query type (when live search is disabled)
        if query_type == QueryType.TEMPORAL:
            return "local_only"  # ⏰ Only stored data when live disabled

        elif query_type == QueryType.FACTUAL:
            return "local_only"  # 📊 Facts from stored documents

        elif query_type == QueryType.PROCEDURAL:
            return "local_only"  # 🔧 Procedures likely in documents

        elif query_type == QueryType.CONCEPTUAL:
            return "local_only"  # 💡 Concepts likely in documents

        elif query_type == QueryType.ANALYTICAL:
            return "local_only"  # 📈 Analysis from stored data

        else:  # QueryType.HYBRID
            return "local_only"  # 🔄 Default to local when live disabled

    def _make_enhanced_routing_decision(
        self, query: str, query_type: QueryType, use_live_search: bool, search_mode: str
    ) -> str:
        """
        Enhanced routing decision with explicit search mode control.

        Args:
            query: Query string
            query_type: Classified query type
            use_live_search: Whether live search is enabled
            search_mode: Explicit search mode preference

        Returns:
            Routing decision string
        """
        # 🎯 Explicit mode override - the user's choice takes priority
        if search_mode == "local_only":
            return "local_only"
        elif search_mode == "live_only":
            return "live_only" if self.live_processor.is_enabled() else "local_only"
        elif search_mode == "hybrid":
            return "hybrid" if self.live_processor.is_enabled() else "local_only"

        # 🧠 Auto mode - intelligent decision making
        elif search_mode == "auto":
            return self._make_routing_decision(query, query_type, use_live_search)

        # 🔄 Fallback to original logic
        else:
            return self._make_routing_decision(query, query_type, use_live_search)

    def _search_local_only(self, query: str, max_results: int) -> Dict[str, Any]:
        """Search only local documents."""
        self.logger.info("Searching local documents only")

        try:
            local_result = self.local_processor.process_query(query)

            # 🔄 Format results consistently
            formatted_results = []
            for item in local_result.get("context", [])[:max_results]:
                formatted_results.append(
                    {
                        "title": f"Document: {item.get('source', 'Unknown')}",
                        "content": item.get("text", ""),
                        "score": item.get("score", 0.0),
                        "source": item.get("source", "local_document"),
                        "type": "local_document",
                        "metadata": item.get("metadata", {}),
                    }
                )

            return {
                "query": query,
                "results": formatted_results,
                "total_results": len(formatted_results),
                "sources": ["local_documents"],
                "local_results": local_result.get("total_results", 0),
            }

        except Exception as e:
            self.logger.error(f"Local search error: {str(e)}")
            return {
                "query": query,
                "results": [],
                "total_results": 0,
                "error": f"Local search failed: {str(e)}",
            }

    def _search_live_only(
        self, query: str, max_results: int, search_options: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Search only live web sources."""
        self.logger.info("Searching live web sources only")

        try:
            # 🎯 Extract search options
            options = search_options or {}
            search_depth = options.get("search_depth", "basic")
            time_range = options.get("time_range", "month")

            live_result = self.live_processor.search_web(
                query,
                max_results=max_results,
                search_depth=search_depth,
                time_range=time_range,
            )

            return {
                "query": query,
                "results": live_result.get("results", []),
                "total_results": live_result.get("total_results", 0),
                "sources": ["live_web"],
                "live_results": live_result.get("total_results", 0),
                "search_params": live_result.get("search_params", {}),
            }

        except Exception as e:
            self.logger.error(f"Live search error: {str(e)}")
            return {
                "query": query,
                "results": [],
                "total_results": 0,
                "error": f"Live search failed: {str(e)}",
            }

    def _search_hybrid(
        self, query: str, max_results: int, search_options: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Perform hybrid search combining local and live sources."""
        self.logger.info("Performing hybrid search")

        try:
            # 📊 Calculate result distribution
            local_count = int(max_results * self.local_weight)
            live_count = max_results - local_count

            # 🚀 Perform both searches concurrently (simplified sequential for now)
            local_result = self.local_processor.process_query(query)

            options = search_options or {}
            live_result = self.live_processor.search_web(
                query,
                max_results=live_count,
                search_depth=options.get("search_depth", "basic"),
                time_range=options.get("time_range", "month"),
            )

            # 🔄 Combine and rank results
            combined_results = self._fuse_results(
                local_result, live_result, local_count, live_count
            )

            return {
                "query": query,
                "results": combined_results[:max_results],
                "total_results": len(combined_results),
                "sources": ["local_documents", "live_web"],
                "local_results": local_result.get("total_results", 0),
                "live_results": live_result.get("total_results", 0),
                "fusion_method": "weighted_ranking",
            }

        except Exception as e:
            self.logger.error(f"Hybrid search error: {str(e)}")
            return self._search_fallback(query, max_results)

    def _fuse_results(
        self,
        local_result: Dict[str, Any],
        live_result: Dict[str, Any],
        local_count: int,
        live_count: int,
    ) -> List[Dict[str, Any]]:
        """
        Fuse results from local and live searches.

        Args:
            local_result: Results from local search
            live_result: Results from live search
            local_count: Number of local results to include
            live_count: Number of live results to include

        Returns:
            Fused and ranked results
        """
        fused_results = []

        # 📚 Process local results
        for item in local_result.get("context", [])[:local_count]:
            fused_results.append(
                {
                    "title": f"Document: {item.get('source', 'Unknown')}",
                    "content": item.get("text", ""),
                    "score": item.get("score", 0.0) * self.local_weight,
                    "source": item.get("source", "local_document"),
                    "type": "local_document",
                    "metadata": item.get("metadata", {}),
                    "fusion_score": item.get("score", 0.0) * self.local_weight,
                }
            )

        # 🌐 Process live results
        for item in live_result.get("results", [])[:live_count]:
            fused_results.append(
                {
                    "title": item.get("title", "Web Result"),
                    "content": item.get("content", ""),
                    "score": item.get("relevance_score", 0.0) * self.live_weight,
                    "source": item.get("url", "web_search"),
                    "type": "web_result",
                    "metadata": item.get("metadata", {}),
                    "fusion_score": item.get("relevance_score", 0.0) * self.live_weight,
                }
            )

        # 🔄 Sort by fusion score
        fused_results.sort(key=lambda x: x.get("fusion_score", 0), reverse=True)

        return fused_results

    def _search_fallback(self, query: str, max_results: int) -> Dict[str, Any]:
        """Fallback search method when other methods fail."""
        self.logger.warning("Using fallback search method")

        try:
            # 📚 Try local search first
            local_result = self.local_processor.process_query(query)

            if local_result.get("context"):
                return self._search_local_only(query, max_results)
            else:
                return {
                    "query": query,
                    "results": [],
                    "total_results": 0,
                    "sources": [],
                    "error": "No results found in fallback search",
                }

        except Exception as e:
            self.logger.error(f"Fallback search failed: {str(e)}")
            return {
                "query": query,
                "results": [],
                "total_results": 0,
                "error": f"All search methods failed: {str(e)}",
            }

    def _track_routing_decision(
        self, query: str, query_type: QueryType, routing_decision: str
    ):
        """Track routing decisions for analytics."""
        self.routing_history.append(
            {
                "query": query[:100],  # Truncate for privacy
                "query_type": query_type.value,
                "routing_decision": routing_decision,
                "timestamp": datetime.now(),
            }
        )

        # 📊 Keep only last 100 routing decisions
        if len(self.routing_history) > 100:
            self.routing_history = self.routing_history[-100:]

    def get_routing_analytics(self) -> Dict[str, Any]:
        """
        Get analytics about routing patterns.

        Returns:
            Dictionary with routing analytics
        """
        if not self.routing_history:
            return {
                "total_queries": 0,
                "routing_distribution": {},
                "query_type_distribution": {},
            }

        total_queries = len(self.routing_history)

        # 📊 Calculate routing distribution
        routing_counts = {}
        query_type_counts = {}

        for entry in self.routing_history:
            routing = entry["routing_decision"]
            query_type = entry["query_type"]

            routing_counts[routing] = routing_counts.get(routing, 0) + 1
            query_type_counts[query_type] = query_type_counts.get(query_type, 0) + 1

        # 📈 Convert to percentages
        routing_distribution = {
            k: round((v / total_queries) * 100, 1) for k, v in routing_counts.items()
        }

        query_type_distribution = {
            k: round((v / total_queries) * 100, 1) for k, v in query_type_counts.items()
        }

        return {
            "total_queries": total_queries,
            "routing_distribution": routing_distribution,
            "query_type_distribution": query_type_distribution,
            "recent_decisions": [
                {
                    "query": entry["query"][:50] + "...",
                    "type": entry["query_type"],
                    "routing": entry["routing_decision"],
                }
                for entry in self.routing_history[-5:]
            ],
        }

    def clear_cache(self):
        """Clear routing cache."""
        self.routing_cache.clear()
        self.logger.info("Routing cache cleared")

    def clear_history(self):
        """Clear routing history."""
        self.routing_history.clear()
        self.logger.info("Routing history cleared")
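
A routing sketch for the module above. The stubs are hypothetical stand-ins for the real local and live processors; the printed values follow from the keyword tables and the decision logic when live search is disabled:

# Hypothetical stubs standing in for QueryProcessor and the live search processor.
class StubLocal:
    def process_query(self, query):
        return {
            "context": [{"text": "stored notes", "source": "notes.pdf",
                         "score": 0.8, "metadata": {}}],
            "total_results": 1,
        }

class StubLive:
    def is_enabled(self):
        return False

    def search_web(self, query, **kwargs):
        return {"results": [], "total_results": 0}

router = QueryRouter(StubLocal(), StubLive())
out = router.route_query("latest Pinecone news", search_mode="auto")
print(out["query_type"])        # "temporal" - "latest" and "news" are temporal keywords
print(out["routing_decision"])  # "local_only" - live search is disabled in auto mode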
src/rag/response_generator.py
ADDED
@@ -0,0 +1,591 @@
1 |
+
"""
|
2 |
+
Response Generator Module
|
3 |
+
|
4 |
+
This module is responsible for generating coherent responses based on
|
5 |
+
retrieved knowledge using LangChain RAG.
|
6 |
+
|
7 |
+
Technology: LangChain RAG (Retrieval Augmented Generation)
|
8 |
+
"""
|
9 |
+
|
10 |
+
import logging
|
11 |
+
import time
|
12 |
+
import os
|
13 |
+
from typing import Dict, List, Any, Optional
|
14 |
+
from datetime import datetime
|
15 |
+
|
16 |
+
|
17 |
+
class ResponseGenerator:
|
18 |
+
"""
|
19 |
+
Generates coherent responses based on retrieved knowledge.
|
20 |
+
|
21 |
+
Features:
|
22 |
+
- Context-aware response generation
|
23 |
+
- Source attribution and confidence scoring
|
24 |
+
- Multiple LLM provider support (Gemini, OpenAI)
|
25 |
+
- Response quality assessment
|
26 |
+
- Template-based fallback generation
|
27 |
+
"""
|
28 |
+
|
29 |
+
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
30 |
+
"""
|
31 |
+
Initialize the ResponseGenerator with configuration.
|
32 |
+
|
33 |
+
Args:
|
34 |
+
config: Configuration dictionary with generation parameters
|
35 |
+
"""
|
36 |
+
self.config = config or {}
|
37 |
+
self.logger = logging.getLogger(__name__)
|
38 |
+
|
39 |
+
# Configuration settings
|
40 |
+
self.model = self.config.get("model", "gpt-3.5-turbo")
|
41 |
+
self.max_tokens = self.config.get("max_tokens", 500)
|
42 |
+
self.temperature = self.config.get("temperature", 0.7)
|
43 |
+
self.include_sources = self.config.get("include_sources", True)
|
44 |
+
|
45 |
+
# Initialize LLM providers
|
46 |
+
self.llm = None
|
47 |
+
self.gemini_client = None
|
48 |
+
self.openai_client = None
|
49 |
+
|
50 |
+
self._initialize_llm_providers()
|
51 |
+
|
52 |
+
# Response templates with markdown formatting
|
53 |
+
self.response_templates = {
|
54 |
+
"no_context": "## ℹ️ No Information Available\n\nI don't have enough information to answer your question. Please try:\n\n- **Uploading relevant documents** using the Upload tab\n- **Adding URLs** using the Add URLs tab\n- **Enabling live search** for real-time web results",
|
55 |
+
"error": "## ⚠️ Error Occurred\n\nI encountered an error while generating the response. Please try again.\n\nIf the problem persists, check your API keys in the Settings tab.",
|
56 |
+
"insufficient_confidence": "## 🤔 Limited Confidence\n\nBased on the available information, I found some relevant content, but I'm **not confident enough** to provide a definitive answer.\n\n**Suggestions:**\n- Try rephrasing your question\n- Add more specific documents\n- Enable live search for additional context",
|
57 |
+
}
|
58 |
+
|
59 |
+
self.logger.info("ResponseGenerator initialized with advanced features")
|
60 |
+
|
61 |
+
def _initialize_llm_providers(self):
|
62 |
+
"""Initialize available LLM providers with optimization."""
|
63 |
+
try:
|
64 |
+
# Try to initialize Gemini
|
65 |
+
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
66 |
+
if gemini_api_key:
|
67 |
+
try:
|
68 |
+
import google.generativeai as genai
|
69 |
+
|
70 |
+
# Check if settings manager has already initialized Gemini client
|
71 |
+
# This is an optimization to avoid recreating the client
|
72 |
+
from utils.settings_manager import SettingsManager
|
73 |
+
|
74 |
+
if (
|
75 |
+
hasattr(SettingsManager, "_gemini_client_cache")
|
76 |
+
and SettingsManager._gemini_client_cache is not None
|
77 |
+
and SettingsManager._gemini_client_key == gemini_api_key
|
78 |
+
):
|
79 |
+
|
80 |
+
self.logger.info(
|
81 |
+
"Reusing existing Gemini client from settings manager"
|
82 |
+
)
|
83 |
+
genai_client = SettingsManager._gemini_client_cache
|
84 |
+
else:
|
85 |
+
# Configure new client
|
86 |
+
genai.configure(api_key=gemini_api_key)
|
87 |
+
genai_client = genai
|
88 |
+
|
89 |
+
# Create model instance
|
90 |
+
self.gemini_client = genai_client.GenerativeModel(
|
91 |
+
"gemini-2.5-flash-preview-05-20"
|
92 |
+
)
|
93 |
+
self.logger.info("Gemini client initialized")
|
94 |
+
except ImportError:
|
95 |
+
self.logger.warning("Gemini SDK not available")
|
96 |
+
except Exception as e:
|
97 |
+
self.logger.warning(f"Failed to initialize Gemini: {e}")
|
98 |
+
|
99 |
+
# Try to initialize OpenAI
|
100 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
101 |
+
if openai_api_key:
|
102 |
+
try:
|
103 |
+
import openai
|
104 |
+
|
105 |
+
self.openai_client = openai.OpenAI(api_key=openai_api_key)
|
106 |
+
self.logger.info("OpenAI client initialized")
|
107 |
+
except ImportError:
|
108 |
+
self.logger.warning("OpenAI SDK not available")
|
109 |
+
except Exception as e:
|
110 |
+
self.logger.warning(f"Failed to initialize OpenAI: {e}")
|
111 |
+
|
112 |
+
# Try to initialize LangChain
|
113 |
+
try:
|
114 |
+
if self.gemini_client:
|
115 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
116 |
+
|
117 |
+
self.llm = ChatGoogleGenerativeAI(
|
118 |
+
model="gemini-2.5-flash-preview-05-20",
|
119 |
+
temperature=self.temperature,
|
120 |
+
google_api_key=gemini_api_key,
|
121 |
+
)
|
122 |
+
elif self.openai_client:
|
123 |
+
from langchain_openai import ChatOpenAI
|
124 |
+
|
125 |
+
self.llm = ChatOpenAI(
|
126 |
+
model=self.model,
|
127 |
+
temperature=self.temperature,
|
128 |
+
max_tokens=self.max_tokens,
|
129 |
+
openai_api_key=openai_api_key,
|
130 |
+
)
|
131 |
+
|
132 |
+
if self.llm:
|
133 |
+
self.logger.info("LangChain LLM initialized")
|
134 |
+
|
135 |
+
except ImportError:
|
136 |
+
self.logger.warning("LangChain not available")
|
137 |
+
except Exception as e:
|
138 |
+
self.logger.warning(f"Failed to initialize LangChain: {e}")
|
139 |
+
|
140 |
+
except Exception as e:
|
141 |
+
self.logger.error(f"❌ Error initializing LLM providers: {e}")
|
142 |
+
|
143 |
+
def generate_response(
|
144 |
+
self, query: str, context: List[Dict[str, Any]]
|
145 |
+
) -> Dict[str, Any]:
|
146 |
+
"""
|
147 |
+
Generate a response based on the query and retrieved context.
|
148 |
+
|
149 |
+
Args:
|
150 |
+
query: Original user query
|
151 |
+
context: List of retrieved context items with text and metadata
|
152 |
+
|
153 |
+
Returns:
|
154 |
+
Dictionary containing the generated response and metadata
|
155 |
+
"""
|
156 |
+
if not query:
|
157 |
+
return {
|
158 |
+
"response": "I need a question to answer.",
|
159 |
+
"sources": [],
|
160 |
+
"confidence": 0.0,
|
161 |
+
"error": "No query provided",
|
162 |
+
}
|
163 |
+
|
164 |
+
if not context:
|
165 |
+
return {
|
166 |
+
"response": self.response_templates["no_context"],
|
167 |
+
"sources": [],
|
168 |
+
"confidence": 0.0,
|
169 |
+
"error": "No context available",
|
170 |
+
}
|
171 |
+
|
172 |
+
self.logger.info(f"Generating response for query: {query[:100]}...")
|
173 |
+
start_time = time.time()
|
174 |
+
|
175 |
+
try:
|
176 |
+
# Prepare context for generation
|
177 |
+
formatted_context = self._format_context(context)
|
178 |
+
|
179 |
+
# Calculate initial confidence based on context quality
|
180 |
+
base_confidence = self._calculate_confidence(context)
|
181 |
+
|
182 |
+
# Generate response using available LLM
|
183 |
+
response_result = self._generate_with_llm(query, formatted_context)
|
184 |
+
|
185 |
+
if not response_result["success"]:
|
186 |
+
# Fallback to template-based generation
|
187 |
+
response_result = self._fallback_generation(query, formatted_context)
|
188 |
+
|
189 |
+
# Extract sources from context
|
190 |
+
sources = self._extract_sources(context) if self.include_sources else []
|
191 |
+
|
192 |
+
# Assess response quality
|
193 |
+
quality_score = self._assess_response_quality(
|
194 |
+
response_result["response"], query, context
|
195 |
+
)
|
196 |
+
|
197 |
+
# Calculate final confidence
|
198 |
+
final_confidence = min(base_confidence * quality_score, 1.0)
|
199 |
+
|
200 |
+
# Check if confidence is too low
|
201 |
+
if final_confidence < 0.3:
|
202 |
+
response_text = self.response_templates["insufficient_confidence"]
|
203 |
+
final_confidence = 0.2
|
204 |
+
else:
|
205 |
+
response_text = response_result["response"]
|
206 |
+
|
207 |
+
result = {
|
208 |
+
"response": response_text,
|
209 |
+
"sources": sources,
|
210 |
+
"confidence": final_confidence,
|
211 |
+
"context_items": len(context),
|
212 |
+
"generation_time": time.time() - start_time,
|
213 |
+
"model_used": response_result.get("model", "fallback"),
|
214 |
+
"quality_score": quality_score,
|
215 |
+
}
|
216 |
+
|
217 |
+
self.logger.info(f"Response generated in {result['generation_time']:.2f}s")
|
218 |
+
return result
|
219 |
+
|
220 |
+
except Exception as e:
|
221 |
+
self.logger.error(f"❌ Error generating response: {str(e)}")
|
222 |
+
return {
|
223 |
+
"response": self.response_templates["error"],
|
224 |
+
"sources": [],
|
225 |
+
"confidence": 0.0,
|
226 |
+
"error": str(e),
|
227 |
+
"generation_time": time.time() - start_time,
|
228 |
+
}
|
229 |
+
|
230 |
+
def _generate_with_llm(self, query: str, context: str) -> Dict[str, Any]:
|
231 |
+
"""
|
232 |
+
Generate response using available LLM providers.
|
233 |
+
|
234 |
+
Args:
|
235 |
+
query: User query
|
236 |
+
context: Formatted context string
|
237 |
+
|
238 |
+
Returns:
|
239 |
+
Dictionary with generation result
|
240 |
+
"""
|
241 |
+
# Create RAG prompt
|
242 |
+
prompt = self._create_rag_prompt(query, context)
|
243 |
+
|
244 |
+
# Try LangChain first
|
245 |
+
if self.llm:
|
246 |
+
try:
|
247 |
+
from langchain.schema import HumanMessage
|
248 |
+
|
249 |
+
messages = [HumanMessage(content=prompt)]
|
250 |
+
response = self.llm.invoke(messages)
|
251 |
+
return {
|
252 |
+
"success": True,
|
253 |
+
"response": response.content,
|
254 |
+
"model": "langchain",
|
255 |
+
}
|
256 |
+
except Exception as e:
|
257 |
+
self.logger.warning(f"LangChain generation failed: {e}")
|
258 |
+
|
259 |
+
# Try Gemini directly
|
260 |
+
if self.gemini_client:
|
261 |
+
try:
|
262 |
+
response = self.gemini_client.generate_content(prompt)
|
263 |
+
return {
|
264 |
+
"success": True,
|
265 |
+
"response": response.text,
|
266 |
+
"model": "gemini-2.5-flash-preview-05-20",
|
267 |
+
}
|
268 |
+
except Exception as e:
|
269 |
+
self.logger.warning(f"Gemini generation failed: {e}")
|
270 |
+
|
271 |
+
# Try OpenAI directly
|
272 |
+
if self.openai_client:
|
273 |
+
try:
|
274 |
+
response = self.openai_client.chat.completions.create(
|
275 |
+
model=self.model,
|
276 |
+
messages=[{"role": "user", "content": prompt}],
|
277 |
+
max_tokens=self.max_tokens,
|
278 |
+
temperature=self.temperature,
|
279 |
+
)
|
280 |
+
return {
|
281 |
+
"success": True,
|
282 |
+
"response": response.choices[0].message.content,
|
283 |
+
"model": self.model,
|
284 |
+
}
|
285 |
+
except Exception as e:
|
286 |
+
self.logger.warning(f"OpenAI generation failed: {e}")
|
287 |
+
|
288 |
+
return {"success": False, "response": "", "model": "none"}
|
289 |
+
|
290 |
+
def _create_rag_prompt(self, query: str, context: str) -> str:
|
291 |
+
"""
|
292 |
+
Create an enhanced prompt template for RAG generation with markdown formatting.
|
293 |
+
|
294 |
+
Args:
|
295 |
+
query: User query
|
296 |
+
context: Formatted context
|
297 |
+
|
298 |
+
Returns:
|
299 |
+
Formatted prompt string
|
300 |
+
"""
|
301 |
+
prompt = f"""You are an AI assistant that answers questions based on provided context. Follow these guidelines:
|
302 |
+
|
303 |
+
1. Answer the question using ONLY the information provided in the context
|
304 |
+
2. If the context doesn't contain enough information, clearly state this
|
305 |
+
3. Cite specific sources when making claims
|
306 |
+
4. Be concise but comprehensive
|
307 |
+
5. If multiple sources provide different information, acknowledge this
|
308 |
+
6. Use a professional and helpful tone
|
309 |
+
7. **Format your response in clean, readable Markdown**
|
310 |
+
|
311 |
+
Context Information:
|
312 |
+
{context}
|
313 |
+
|
314 |
+
Question: {query}
|
315 |
+
|
316 |
+
Instructions:
|
317 |
+
- Provide a clear, well-structured answer using **Markdown formatting**
|
318 |
+
- Use headers (##, ###) to organize sections
|
319 |
+
- Use **bold** for important points
|
320 |
+
- Use bullet points (-) or numbered lists (1.) for clarity
|
321 |
+
- Use `code blocks` for technical terms or specific data
|
322 |
+
- Include relevant details from the context
|
323 |
+
- If uncertain, express the level of confidence
|
324 |
+
- Do not make up information not present in the context
|
325 |
+
|
326 |
+
Format your response in Markdown with proper structure and formatting.
|
327 |
+
|
328 |
+
Answer:"""
|
329 |
+
|
330 |
+
return prompt
|
331 |
+
|
332 |
+
def _fallback_generation(self, query: str, context: str) -> Dict[str, Any]:
|
333 |
+
"""
|
334 |
+
Fallback response generation when LLM is not available.
|
335 |
+
|
336 |
+
Args:
|
337 |
+
query: User query
|
338 |
+
context: Formatted context
|
339 |
+
|
340 |
+
Returns:
|
341 |
+
Dictionary with generation result
|
342 |
+
"""
|
343 |
+
self.logger.info("Using fallback generation")
|
344 |
+
|
345 |
+
# Extract key information from context
|
346 |
+
context_lines = context.split("\n")
|
347 |
+
relevant_lines = [
|
348 |
+
line.strip()
|
349 |
+
for line in context_lines
|
350 |
+
if line.strip() and not line.startswith("[Source:")
|
351 |
+
]
|
352 |
+
|
353 |
+
if not relevant_lines:
|
354 |
+
return {
|
355 |
+
"success": True,
|
356 |
+
"response": self.response_templates["no_context"],
|
357 |
+
"model": "fallback",
|
358 |
+
}
|
359 |
+
|
360 |
+
# Create a structured markdown response
|
361 |
+
response_parts = [
|
362 |
+
f"## Answer to: {query}",
|
363 |
+
"",
|
364 |
+
"Based on the available information:",
|
365 |
+
"",
|
366 |
+
]
|
367 |
+
|
368 |
+
# Add key information as markdown list
|
369 |
+
for i, line in enumerate(relevant_lines[:3]): # Limit to 3 most relevant
|
370 |
+
if len(line) > 50: # Only include substantial content
|
371 |
+
response_parts.append(f"- {line}")
|
372 |
+
|
373 |
+
response_parts.extend(
|
374 |
+
[
|
375 |
+
"",
|
376 |
+
"---",
|
377 |
+
"",
|
378 |
+
"**Note:** This response is generated using available context. For more detailed analysis, please ensure proper language model integration.",
|
379 |
+
]
|
380 |
+
)
|
381 |
+
|
382 |
+
response = "\n".join(response_parts)
|
383 |
+
|
384 |
+
return {
|
385 |
+
"success": True,
|
386 |
+
"response": response,
|
387 |
+
"model": "fallback",
|
388 |
+
}
|
389 |
+
|
390 |
+
def _format_context(self, context: List[Dict[str, Any]]) -> str:
|
391 |
+
"""
|
392 |
+
Format the retrieved context for use in response generation.
|
393 |
+
|
394 |
+
Args:
|
395 |
+
context: List of context items
|
396 |
+
|
397 |
+
Returns:
|
398 |
+
Formatted context string
|
399 |
+
"""
|
400 |
+
formatted_parts = []
|
401 |
+
|
402 |
+
for i, item in enumerate(context):
|
403 |
+
text = item.get("text", "")
|
404 |
+
source = item.get("source", f"Source {i+1}")
|
405 |
+
score = item.get("score", 0.0)
|
406 |
+
|
407 |
+
# Format each context item with metadata
|
408 |
+
formatted_part = f"""[Source {i+1}: {source} (Relevance: {score:.2f})]
|
409 |
+
{text}
|
410 |
+
---"""
|
411 |
+
formatted_parts.append(formatted_part)
|
412 |
+
|
413 |
+
return "\n\n".join(formatted_parts)
|
414 |
+
|
415 |
+
def _extract_sources(self, context: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
416 |
+
"""
|
417 |
+
        Extract source information from context items.

        Args:
            context: List of context items

        Returns:
            List of source dictionaries
        """
        sources = []
        seen_sources = set()

        for item in context:
            source = item.get("source", "Unknown")
            score = item.get("score", 0.0)
            final_score = item.get("final_score", score)

            if source not in seen_sources:
                source_info = {
                    "source": source,
                    "relevance_score": round(score, 3),
                    "final_score": round(final_score, 3),
                    "metadata": item.get("metadata", {}),
                }

                # Add source type
                if source.endswith(".pdf"):
                    source_info["type"] = "PDF Document"
                elif source.startswith("http"):
                    source_info["type"] = "Web Page"
                elif source.endswith((".docx", ".doc")):
                    source_info["type"] = "Word Document"
                else:
                    source_info["type"] = "Document"

                sources.append(source_info)
                seen_sources.add(source)

        # Sort by relevance score
        sources.sort(key=lambda x: x["final_score"], reverse=True)
        return sources

    def _calculate_confidence(self, context: List[Dict[str, Any]]) -> float:
        """
        Calculate confidence score based on context quality.

        Args:
            context: List of context items

        Returns:
            Confidence score between 0.0 and 1.0
        """
        if not context:
            return 0.0

        # Calculate average similarity score
        scores = [item.get("final_score", item.get("score", 0.0)) for item in context]
        avg_score = sum(scores) / len(scores)

        # Factor in the number of context items
        context_factor = min(len(context) / 3.0, 1.0)  # Normalize to max of 3 items

        # Factor in score distribution (prefer consistent scores)
        if len(scores) > 1:
            score_variance = sum((s - avg_score) ** 2 for s in scores) / len(scores)
            consistency_factor = max(0.5, 1.0 - score_variance)
        else:
            consistency_factor = 1.0

        # Combine factors
        confidence = (
            (avg_score * 0.6) + (context_factor * 0.2) + (consistency_factor * 0.2)
        )

        return min(confidence, 1.0)

    def _assess_response_quality(
        self, response: str, query: str, context: List[Dict[str, Any]]
    ) -> float:
        """
        Assess the quality of the generated response.

        Args:
            response: Generated response
            query: Original query
            context: Context used for generation

        Returns:
            Quality score between 0.0 and 1.0
        """
        if not response or len(response.strip()) < 10:
            return 0.1

        quality_score = 0.5  # Base score

        # Check response length (not too short, not too long)
        response_length = len(response)
        if 50 <= response_length <= 1000:
            quality_score += 0.2
        elif response_length > 1000:
            quality_score += 0.1

        # Check if response addresses the query
        query_words = set(query.lower().split())
        response_words = set(response.lower().split())
        word_overlap = len(query_words.intersection(response_words))
        if word_overlap > 0:
            quality_score += min(word_overlap / len(query_words), 0.2)

        # Check if response uses context information
        context_texts = [item.get("text", "") for item in context]
        context_words = set()
        for text in context_texts:
            context_words.update(text.lower().split())

        context_usage = len(response_words.intersection(context_words))
        if context_usage > 5:  # Uses substantial context
            quality_score += 0.1

        return min(quality_score, 1.0)

    def get_supported_models(self) -> List[str]:
        """
        Get list of supported models.

        Returns:
            List of available model names
        """
        models = ["fallback"]

        if self.gemini_client:
            models.extend(["gemini-2.5-flash-preview-05-20", "gemini-1.5-pro"])

        if self.openai_client:
            models.extend(["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"])

        return models

    def update_model(self, model_name: str) -> bool:
        """
        Update the model used for generation.

        Args:
            model_name: Name of the model to use

        Returns:
            True if the model was updated successfully
        """
        try:
            if model_name in self.get_supported_models():
                self.model = model_name
                self.logger.info(f"Model updated to: {model_name}")
                return True
            else:
                self.logger.warning(f"Model {model_name} not supported")
                return False
        except Exception as e:
            self.logger.error(f"❌ Error updating model: {e}")
            return False

    def get_generation_stats(self) -> Dict[str, Any]:
        """
        Get statistics about response generation.

        Returns:
            Dictionary with generation statistics
        """
        return {
            "supported_models": self.get_supported_models(),
            "current_model": self.model,
            "gemini_available": self.gemini_client is not None,
            "openai_available": self.openai_client is not None,
            "langchain_available": self.llm is not None,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }
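The `_calculate_confidence` weighting above is easy to sanity-check by hand. Below is a standalone sketch that recomputes the formula for three hypothetical context items; the scores are made-up illustrative values, not output from this commit:

# Recomputing the confidence formula from _calculate_confidence above
# for three hypothetical context items (scores are illustrative).
scores = [0.82, 0.78, 0.74]
avg_score = sum(scores) / len(scores)                               # 0.78
context_factor = min(len(scores) / 3.0, 1.0)                        # 1.0 (capped at 3 items)
variance = sum((s - avg_score) ** 2 for s in scores) / len(scores)  # ~0.00107
consistency_factor = max(0.5, 1.0 - variance)                       # ~0.999
confidence = min(avg_score * 0.6 + context_factor * 0.2 + consistency_factor * 0.2, 1.0)
print(round(confidence, 3))                                         # 0.868

Note that with the 0.6/0.2/0.2 weights, even a perfectly consistent, full-sized context cannot push the score past 1.0, and a lone low-variance item still gets the full consistency credit.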
src/storage/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
Storage module for vector database operations.

This module contains components for storing and retrieving
vector embeddings using Pinecone.
"""
src/storage/vector_db.py
ADDED
@@ -0,0 +1,729 @@
"""
Vector Database Module

This module is responsible for storing and indexing vector embeddings
for efficient retrieval using Pinecone with complete functionality.

Technology: Pinecone
"""

import logging
import os
import time
import uuid
import hashlib
from datetime import datetime
from typing import Dict, List, Any, Optional, Union

# Import Pinecone and related libraries
try:
    import pinecone
    from pinecone import Pinecone, ServerlessSpec
except ImportError as e:
    logging.warning(f"Pinecone library not installed: {e}")

from utils.error_handler import VectorStorageError, error_handler, ErrorType


class VectorDB:
    """
    Stores and indexes vector embeddings for efficient retrieval using Pinecone with full functionality.

    Features:
    - Complete Pinecone integration
    - Index management (create, update, delete)
    - Batch upsert operations with optimization
    - Advanced similarity search with metadata filtering
    - Statistics and monitoring
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the VectorDB with configuration.

        Args:
            config: Configuration dictionary with Pinecone parameters
        """
        self.config = config or {}
        self.logger = logging.getLogger(__name__)

        # Configuration settings
        self.api_key = self.config.get("api_key", os.environ.get("PINECONE_API_KEY"))
        self.environment = self.config.get("environment", "us-west1-gcp")
        self.index_name = self.config.get("index_name", "rag-ai-index")
        self.dimension = self.config.get(
            "dimension", 3072
        )  # ✅ Fixed: Match Gemini embedding dimension
        self.metric = self.config.get("metric", "cosine")
        self.batch_size = self.config.get("batch_size", 100)

        # Performance settings
        self.max_metadata_size = self.config.get(
            "max_metadata_size", 40960
        )  # 40KB limit
        self.upsert_timeout = self.config.get("upsert_timeout", 60)
        self.query_timeout = self.config.get("query_timeout", 30)

        # Statistics tracking
        self.stats = {
            "vectors_stored": 0,
            "vectors_queried": 0,
            "vectors_deleted": 0,
            "batch_operations": 0,
            "failed_operations": 0,
            "start_time": datetime.now(),
        }

        # Initialize Pinecone client
        self.pc = None
        self.index = None
        self._initialize_client()

    def _initialize_client(self):
        """Initialize Pinecone client and index with validation."""
        if not self.api_key:
            self.logger.warning(
                "No Pinecone API key provided. Vector storage will not be available."
            )
            return

        try:
            # Initialize Pinecone client
            self.pc = Pinecone(api_key=self.api_key)

            # Check if index exists, create if not
            self._ensure_index_exists()

            # Connect to index
            self.index = self.pc.Index(self.index_name)

            # Test connection
            self._test_connection()

            self.logger.info(
                f"Pinecone client initialized successfully with index: {self.index_name}"
            )

        except Exception as e:
            self.logger.error(f"Failed to initialize Pinecone client: {str(e)}")
            self.pc = None
            self.index = None

    def _ensure_index_exists(self):
        """Ensure the Pinecone index exists, create if necessary."""
        try:
            # List existing indexes
            existing_indexes = [index.name for index in self.pc.list_indexes()]

            if self.index_name not in existing_indexes:
                self.logger.info(f"Creating new Pinecone index: {self.index_name}")

                # Create index with serverless spec
                self.pc.create_index(
                    name=self.index_name,
                    dimension=self.dimension,
                    metric=self.metric,
                    spec=ServerlessSpec(cloud="aws", region=self.environment),
                )

                # Wait for index to be ready
                self._wait_for_index_ready()

                self.logger.info(f"Index {self.index_name} created successfully")
            else:
                self.logger.info(f"Index {self.index_name} already exists")

        except Exception as e:
            raise VectorStorageError(f"Failed to ensure index exists: {str(e)}")

    def _wait_for_index_ready(self, max_wait_time: int = 300):
        """Wait for index to be ready for operations."""
        start_time = time.time()

        while time.time() - start_time < max_wait_time:
            try:
                index_stats = self.pc.describe_index(self.index_name)
                if index_stats.status.ready:
                    self.logger.info(f"Index {self.index_name} is ready")
                    return

                self.logger.info("Waiting for index to be ready...")
                time.sleep(10)

            except Exception as e:
                self.logger.warning(f"Error checking index status: {str(e)}")
                time.sleep(5)

        raise VectorStorageError(
            f"Index {self.index_name} not ready after {max_wait_time}s"
        )

    def _test_connection(self):
        """Test connection to Pinecone index."""
        try:
            # Get index stats
            stats = self.index.describe_index_stats()
            self.logger.info(f"Connection test successful. Index stats: {stats}")

        except Exception as e:
            raise VectorStorageError(f"Connection test failed: {str(e)}")

    @error_handler(ErrorType.VECTOR_STORAGE)
    def store_embeddings(self, items: List[Dict[str, Any]]) -> bool:
        """
        Store embeddings in the vector database with full functionality.

        Args:
            items: List of dictionaries containing content, metadata, and embeddings

        Returns:
            True if successful, False otherwise
        """
        if not self.index or not items:
            self.logger.warning("No index available or empty items list")
            return False

        # Filter and validate items
        valid_items = self._validate_items(items)
        if not valid_items:
            self.logger.warning("No valid embeddings to store")
            return False

        self.logger.info(f"Storing {len(valid_items)} embeddings in Pinecone")
        start_time = time.time()

        try:
            # Process in batches
            total_batches = (len(valid_items) + self.batch_size - 1) // self.batch_size
            successful_batches = 0

            for i in range(0, len(valid_items), self.batch_size):
                batch_num = (i // self.batch_size) + 1
                batch = valid_items[i : i + self.batch_size]

                self.logger.info(
                    f"Processing batch {batch_num}/{total_batches} ({len(batch)} vectors)"
                )

                success = self._store_batch(batch)
                if success:
                    successful_batches += 1
                    self.stats["vectors_stored"] += len(batch)
                else:
                    self.stats["failed_operations"] += 1
                    self.logger.error(f"Batch {batch_num} failed")

                # Rate limiting between batches
                if i + self.batch_size < len(valid_items):
                    time.sleep(0.1)

            self.stats["batch_operations"] += total_batches
            processing_time = time.time() - start_time

            success_rate = successful_batches / total_batches * 100
            self.logger.info(
                f"Storage completed: {successful_batches}/{total_batches} batches successful ({success_rate:.1f}%) in {processing_time:.2f}s"
            )

            return successful_batches > 0

        except Exception as e:
            self.stats["failed_operations"] += 1
            raise VectorStorageError(f"Failed to store embeddings: {str(e)}")

    def _validate_items(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Validate and prepare items for storage.

        Args:
            items: List of items to validate

        Returns:
            List of valid items
        """
        valid_items = []

        for i, item in enumerate(items):
            try:
                # Check required fields
                if not isinstance(item, dict):
                    self.logger.warning(f"Item {i} is not a dictionary")
                    continue

                if "embedding" not in item or not item["embedding"]:
                    self.logger.warning(f"Item {i} missing embedding")
                    continue

                embedding = item["embedding"]
                if not isinstance(embedding, list) or len(embedding) != self.dimension:
                    self.logger.warning(
                        f"Item {i} has invalid embedding dimension: {len(embedding)} != {self.dimension}"
                    )
                    continue

                # Prepare item
                processed_item = self._prepare_item_for_storage(item, i)
                valid_items.append(processed_item)

            except Exception as e:
                self.logger.warning(f"Error validating item {i}: {str(e)}")
                continue

        return valid_items

    def _prepare_item_for_storage(
        self, item: Dict[str, Any], index: int
    ) -> Dict[str, Any]:
        """
        Prepare item for Pinecone storage.

        Args:
            item: Item to prepare
            index: Item index for ID generation

        Returns:
            Prepared item
        """
        # 🆔 Generate unique ID
        item_id = item.get("id")
        if not item_id:
            # Create ID from content hash + timestamp
            content = item.get("content", "")
            timestamp = str(int(time.time() * 1000))
            content_hash = hashlib.md5(content.encode()).hexdigest()[:8]
            item_id = f"doc_{content_hash}_{timestamp}_{index}"

        # Prepare metadata
        metadata = item.get("metadata", {}).copy()

        # Add essential fields to metadata
        metadata.update(
            {
                "content_preview": item.get("content", "")[:500],  # First 500 chars
                "content_length": len(item.get("content", "")),
                "stored_at": datetime.now().isoformat(),
                "source": item.get("source", "unknown"),
                "document_type": item.get("document_type", "text"),
            }
        )

        # Ensure metadata size limit
        metadata = self._truncate_metadata(metadata)

        return {"id": item_id, "values": item["embedding"], "metadata": metadata}

    def _truncate_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """
        Truncate metadata to fit Pinecone size limits.

        Args:
            metadata: Original metadata

        Returns:
            Truncated metadata
        """
        import json

        # 📏 Check current size
        metadata_str = json.dumps(metadata, default=str)
        if len(metadata_str.encode()) <= self.max_metadata_size:
            return metadata

        # Truncate large fields
        truncated = metadata.copy()

        # Truncate text fields progressively
        text_fields = ["content_preview", "text", "description", "summary"]
        for field in text_fields:
            if field in truncated:
                while (
                    len(json.dumps(truncated, default=str).encode())
                    > self.max_metadata_size
                ):
                    current_length = len(str(truncated[field]))
                    if current_length <= 50:
                        break
                    truncated[field] = (
                        str(truncated[field])[: current_length // 2] + "..."
                    )

        return truncated

    def _store_batch(self, batch: List[Dict[str, Any]]) -> bool:
        """
        Store a batch of embeddings in Pinecone.

        Args:
            batch: List of prepared items

        Returns:
            True if successful
        """
        try:
            # Upsert vectors to Pinecone
            upsert_response = self.index.upsert(
                vectors=batch, timeout=self.upsert_timeout
            )

            # Verify upsert success
            if hasattr(upsert_response, "upserted_count"):
                expected_count = len(batch)
                actual_count = upsert_response.upserted_count

                if actual_count != expected_count:
                    self.logger.warning(
                        f"Upsert count mismatch: {actual_count}/{expected_count}"
                    )
                    return False

            self.logger.info(f"Successfully stored batch of {len(batch)} vectors")
            return True

        except Exception as e:
            self.logger.error(f"Error storing batch: {str(e)}")
            return False

    @error_handler(ErrorType.VECTOR_STORAGE)
    def search(
        self,
        query_embedding: List[float],
        top_k: int = 5,
        filter: Optional[Dict[str, Any]] = None,
        include_metadata: bool = True,
        include_values: bool = False,
    ) -> List[Dict[str, Any]]:
        """
        Search for similar vectors with advanced filtering.

        Args:
            query_embedding: Query vector to search for
            top_k: Number of results to return
            filter: Optional metadata filter
            include_metadata: Whether to include metadata in results
            include_values: Whether to include vector values in results

        Returns:
            List of search results with scores and metadata
        """
        if not self.index or not query_embedding:
            self.logger.warning("No index available or empty query embedding")
            return []

        # Validate query embedding
        if len(query_embedding) != self.dimension:
            raise VectorStorageError(
                f"Query embedding dimension {len(query_embedding)} != {self.dimension}"
            )

        self.logger.info(f"Searching for similar vectors (top_k={top_k})")
        start_time = time.time()

        try:
            # Perform similarity search
            search_response = self.index.query(
                vector=query_embedding,
                top_k=top_k,
                filter=filter,
                include_metadata=include_metadata,
                include_values=include_values,
                timeout=self.query_timeout,
            )

            # Process results
            results = []
            if hasattr(search_response, "matches"):
                for match in search_response.matches:
                    result = {
                        "id": match.id,
                        "score": float(match.score),
                    }

                    if include_metadata and hasattr(match, "metadata"):
                        result["metadata"] = (
                            dict(match.metadata) if match.metadata else {}
                        )

                    if include_values and hasattr(match, "values"):
                        result["values"] = match.values

                    results.append(result)

            self.stats["vectors_queried"] += len(results)
            search_time = time.time() - start_time

            self.logger.info(
                f"Search completed: {len(results)} results in {search_time:.3f}s"
            )
            return results

        except Exception as e:
            self.stats["failed_operations"] += 1
            raise VectorStorageError(f"Search failed: {str(e)}")

    @error_handler(ErrorType.VECTOR_STORAGE)
    def delete(
        self,
        ids: Optional[List[str]] = None,
        filter: Optional[Dict[str, Any]] = None,
        delete_all: bool = False,
    ) -> bool:
        """
        Delete vectors from the database.

        Args:
            ids: Optional list of vector IDs to delete
            filter: Optional metadata filter for vectors to delete
            delete_all: Whether to delete all vectors

        Returns:
            True if successful
        """
        if not self.index:
            self.logger.warning("No index available")
            return False

        try:
            if delete_all:
                # Delete all vectors
                self.index.delete(delete_all=True)
                self.logger.info("Deleted all vectors from index")
                self.stats["vectors_deleted"] += 1  # Approximate

            elif ids:
                # Delete by IDs
                self.index.delete(ids=ids)
                self.logger.info(f"Deleted {len(ids)} vectors by ID")
                self.stats["vectors_deleted"] += len(ids)

            elif filter:
                # Delete by filter
                self.index.delete(filter=filter)
                self.logger.info(f"Deleted vectors by filter: {filter}")
                self.stats["vectors_deleted"] += 1  # Approximate

            else:
                self.logger.warning("No deletion criteria provided")
                return False

            return True

        except Exception as e:
            self.stats["failed_operations"] += 1
            raise VectorStorageError(f"Delete operation failed: {str(e)}")

    def get_index_stats(self) -> Dict[str, Any]:
        """
        Get comprehensive index statistics.

        Returns:
            Dictionary with index statistics
        """
        if not self.index:
            return {}

        try:
            # Get Pinecone index stats
            pinecone_stats = self.index.describe_index_stats()

            # Combine with internal stats
            runtime = datetime.now() - self.stats["start_time"]

            return {
                "pinecone_stats": {
                    "total_vector_count": pinecone_stats.total_vector_count,
                    "dimension": pinecone_stats.dimension,
                    "index_fullness": pinecone_stats.index_fullness,
                    "namespaces": (
                        dict(pinecone_stats.namespaces)
                        if pinecone_stats.namespaces
                        else {}
                    ),
                },
                "internal_stats": {
                    **self.stats,
                    "runtime_seconds": runtime.total_seconds(),
                    "avg_vectors_per_batch": (
                        self.stats["vectors_stored"]
                        / max(1, self.stats["batch_operations"])
                    ),
                    "success_rate": (
                        (
                            self.stats["batch_operations"]
                            - self.stats["failed_operations"]
                        )
                        / max(1, self.stats["batch_operations"])
                        * 100
                    ),
                },
                "configuration": {
                    "index_name": self.index_name,
                    "dimension": self.dimension,
                    "metric": self.metric,
                    "batch_size": self.batch_size,
                },
            }

        except Exception as e:
            self.logger.error(f"Error getting index stats: {str(e)}")
            return {"error": str(e)}

    def health_check(self) -> Dict[str, Any]:
        """
        Perform health check on the vector database.

        Returns:
            Health check results
        """
        health = {
            "status": "unknown",
            "timestamp": datetime.now().isoformat(),
            "checks": {},
        }

        try:
            # Check API connection
            if self.pc:
                health["checks"]["api_connection"] = "Connected"
            else:
                health["checks"]["api_connection"] = "Not connected"
                health["status"] = "unhealthy"
                return health

            # Check index availability
            if self.index:
                health["checks"]["index_available"] = "Available"
            else:
                health["checks"]["index_available"] = "Not available"
                health["status"] = "unhealthy"
                return health

            # Test query operation
            try:
                test_vector = [0.1] * self.dimension
                self.index.query(vector=test_vector, top_k=1, timeout=5)
                health["checks"]["query_operation"] = "Working"
            except Exception as e:
                health["checks"]["query_operation"] = f"Failed: {str(e)}"
                health["status"] = "degraded"

            # Check index stats
            try:
                stats = self.index.describe_index_stats()
                health["checks"]["index_stats"] = f"{stats.total_vector_count} vectors"
            except Exception as e:
                health["checks"]["index_stats"] = f"Failed: {str(e)}"

            # 🎯 Overall status
            if health["status"] == "unknown":
                health["status"] = "healthy"

        except Exception as e:
            health["status"] = "unhealthy"
            health["error"] = str(e)

        return health

    def reset_stats(self):
        """Reset internal statistics."""
        self.stats = {
            "vectors_stored": 0,
            "vectors_queried": 0,
            "vectors_deleted": 0,
            "batch_operations": 0,
            "failed_operations": 0,
            "start_time": datetime.now(),
        }
        self.logger.info("Statistics reset")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get simplified stats for UI display.

        Returns:
            Dictionary with basic statistics
        """
        try:
            if not self.index:
                return {"total_vectors": 0, "status": "disconnected"}

            # Get Pinecone stats
            pinecone_stats = self.index.describe_index_stats()

            return {
                "total_vectors": pinecone_stats.total_vector_count,
                "dimension": pinecone_stats.dimension,
                "index_fullness": pinecone_stats.index_fullness,
                "status": "connected",
            }
        except Exception as e:
            self.logger.warning(f"Could not get stats: {e}")
            return {"total_vectors": 0, "status": "error", "error": str(e)}

    def get_unique_sources(self) -> List[Dict[str, Any]]:
        """
        Get unique sources from stored vectors.

        Returns:
            List of unique sources with metadata
        """
        try:
            if not self.index:
                return []

            # This is a simplified approach - in a real implementation,
            # you might want to maintain a separate metadata index.
            # For now, sample stored vectors and derive sources from their metadata.

            # Try to get some sample vectors to extract sources
            test_vector = [0.1] * self.dimension
            results = self.index.query(
                vector=test_vector,
                top_k=100,  # Get more results to find unique sources
                include_metadata=True,
            )

            sources = {}
            for match in results.matches:
                if hasattr(match, "metadata") and match.metadata:
                    source = match.metadata.get("source", "Unknown")
                    if source not in sources:
                        sources[source] = {
                            "source": source,
                            "chunk_count": 1,
                            "added_date": match.metadata.get("stored_at", "Unknown"),
                        }
                    else:
                        sources[source]["chunk_count"] += 1

            return list(sources.values())

        except Exception as e:
            self.logger.warning(f"Could not get unique sources: {e}")
            return []

    def list_documents(self) -> List[Dict[str, Any]]:
        """
        List all documents in the vector database.

        Returns:
            List of document information
        """
        try:
            # Get unique sources and format as documents
            sources = self.get_unique_sources()
            documents = []

            for source_info in sources:
                documents.append(
                    {
                        "name": source_info["source"],
                        "chunks": source_info["chunk_count"],
                        "date": source_info["added_date"],
                    }
                )

            return documents

        except Exception as e:
            self.logger.warning(f"Could not list documents: {e}")
            return []
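A minimal usage sketch for the VectorDB class above. It assumes PINECONE_API_KEY is set and that src/ is on the import path (as the module's own `from utils.error_handler import ...` implies); the source name, metadata, and the zero placeholder embedding are illustrative only, not values from this commit:

# Illustrative sketch; assumes PINECONE_API_KEY is exported.
from storage.vector_db import VectorDB

db = VectorDB({"index_name": "rag-ai-index", "dimension": 3072})

# Store one pre-computed embedding with its source metadata.
db.store_embeddings(
    [
        {
            "content": "Pinecone is a managed vector database.",
            "embedding": [0.0] * 3072,  # placeholder; use a real Gemini embedding
            "source": "notes.pdf",
            "metadata": {"page": 1},
        }
    ]
)

# Search, restricting matches to one source document via a metadata filter.
hits = db.search(
    query_embedding=[0.0] * 3072,  # placeholder query vector
    top_k=3,
    filter={"source": {"$eq": "notes.pdf"}},
)
for hit in hits:
    print(hit["id"], hit["score"], hit["metadata"].get("content_preview", ""))

The `{"$eq": ...}` expression is Pinecone's standard metadata-filter syntax; the `filter` argument is passed through to `Index.query` unchanged, so any filter Pinecone accepts works here.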
src/ui/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
UI module for the Gradio user interface.

This module contains components for providing an intuitive
interface for document upload, URL input, and querying.
"""
src/ui/gradio_app.py
ADDED
The diff for this file is too large to render. See raw diff.
src/utils/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
Utils module for configuration management and error handling.

This module contains utility components for managing configuration
and handling errors throughout the application.
"""
src/utils/config_manager.py
ADDED
@@ -0,0 +1,279 @@
"""
Configuration Manager Module

This module handles loading and managing configuration settings
for the RAG AI system.
"""

import os
import yaml
import logging
from typing import Dict, Any, Optional
from pathlib import Path


class ConfigManager:
    """
    Manages configuration settings for the RAG AI system.

    Features:
    - YAML configuration file loading
    - Environment variable override
    - Default configuration values
    - Configuration validation
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        Initialize the ConfigManager.

        Args:
            config_path: Path to the configuration file (defaults to config/config.yaml)
        """
        self.logger = logging.getLogger(__name__)

        # Set default config path
        if config_path is None:
            config_path = os.path.join(
                os.path.dirname(__file__), "..", "..", "config", "config.yaml"
            )

        self.config_path = Path(config_path)
        self.config = self._load_config()

    def _load_config(self) -> Dict[str, Any]:
        """
        Load configuration from file and environment variables.

        Returns:
            Configuration dictionary
        """
        # Start with default configuration
        config = self._get_default_config()

        # Load from YAML file if it exists
        if self.config_path.exists():
            try:
                with open(self.config_path, "r", encoding="utf-8") as f:
                    file_config = yaml.safe_load(f) or {}
                config = self._merge_configs(config, file_config)
                self.logger.info(f"Loaded configuration from {self.config_path}")
            except Exception as e:
                self.logger.warning(
                    f"Failed to load config file {self.config_path}: {str(e)}"
                )
        else:
            self.logger.warning(f"Config file not found: {self.config_path}")

        # Override with environment variables
        config = self._apply_env_overrides(config)

        # Validate configuration
        self._validate_config(config)

        return config

    def _get_default_config(self) -> Dict[str, Any]:
        """
        Get default configuration values.

        Returns:
            Default configuration dictionary
        """
        return {
            "api_keys": {
                "gemini_api_key": "",
                "pinecone_api_key": "",
                "openai_api_key": "",
            },
            "vector_db": {
                "provider": "pinecone",
                "index_name": "rag-ai-index",
                "dimension": 3072,  # ✅ Fixed: Match Gemini embedding dimension
                "metric": "cosine",
                "environment": "us-west1-gcp",
            },
            "embedding": {
                "model": "gemini-embedding-exp-03-07",
                "batch_size": 5,
                "max_retries": 3,
                "retry_delay": 1,
            },
            "document_processing": {
                "chunk_size": 1000,
                "chunk_overlap": 200,
                "min_chunk_size": 100,
                "max_file_size_mb": 50,
            },
            "url_processing": {
                "max_depth": 1,
                "follow_links": True,
                "max_pages": 10,
                "timeout": 10,
            },
            "rag": {
                "top_k": 5,
                "similarity_threshold": 0.7,
                "max_context_length": 4000,
                "model": "gpt-3.5-turbo",
                "max_tokens": 500,
                "temperature": 0.7,
            },
            "ui": {
                "title": "AI Embedded Knowledge Agent",
                "description": "Upload documents or provide URLs to build your knowledge base, then ask questions!",
                "theme": "default",
                "share": False,
                "port": 7860,
            },
            "logging": {
                "level": "INFO",
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            },
        }

    def _merge_configs(
        self, base: Dict[str, Any], override: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Recursively merge two configuration dictionaries.

        Args:
            base: Base configuration dictionary
            override: Override configuration dictionary

        Returns:
            Merged configuration dictionary
        """
        result = base.copy()

        for key, value in override.items():
            if (
                key in result
                and isinstance(result[key], dict)
                and isinstance(value, dict)
            ):
                result[key] = self._merge_configs(result[key], value)
            else:
                result[key] = value

        return result

    def _apply_env_overrides(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Apply environment variable overrides to configuration.

        Args:
            config: Configuration dictionary

        Returns:
            Configuration with environment overrides applied
        """
        # API Keys
        if os.environ.get("GEMINI_API_KEY"):
            config["api_keys"]["gemini_api_key"] = os.environ["GEMINI_API_KEY"]

        if os.environ.get("PINECONE_API_KEY"):
            config["api_keys"]["pinecone_api_key"] = os.environ["PINECONE_API_KEY"]

        if os.environ.get("OPENAI_API_KEY"):
            config["api_keys"]["openai_api_key"] = os.environ["OPENAI_API_KEY"]

        # Pinecone settings
        if os.environ.get("PINECONE_ENVIRONMENT"):
            config["vector_db"]["environment"] = os.environ["PINECONE_ENVIRONMENT"]

        if os.environ.get("PINECONE_INDEX_NAME"):
            config["vector_db"]["index_name"] = os.environ["PINECONE_INDEX_NAME"]

        # UI settings
        if os.environ.get("GRADIO_SHARE"):
            config["ui"]["share"] = os.environ["GRADIO_SHARE"].lower() == "true"

        if os.environ.get("PORT"):
            try:
                config["ui"]["port"] = int(os.environ["PORT"])
            except ValueError:
                self.logger.warning(f"Invalid PORT value: {os.environ['PORT']}")

        return config

    def _validate_config(self, config: Dict[str, Any]) -> None:
        """
        Validate configuration values.

        Args:
            config: Configuration dictionary to validate
        """
        # Check required API keys
        if not config["api_keys"]["gemini_api_key"]:
            self.logger.warning("Gemini API key not configured")

        if not config["api_keys"]["pinecone_api_key"]:
            self.logger.warning("Pinecone API key not configured")

        # Validate numeric values
        if config["document_processing"]["chunk_size"] <= 0:
            raise ValueError("chunk_size must be positive")

        if config["rag"]["top_k"] <= 0:
            raise ValueError("top_k must be positive")

        if not 0 <= config["rag"]["similarity_threshold"] <= 1:
            raise ValueError("similarity_threshold must be between 0 and 1")

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get a configuration value using dot notation.

        Args:
            key: Configuration key (e.g., 'vector_db.index_name')
            default: Default value if key not found

        Returns:
            Configuration value
        """
        keys = key.split(".")
        value = self.config

        try:
            for k in keys:
                value = value[k]
            return value
        except (KeyError, TypeError):
            return default

    def set(self, key: str, value: Any) -> None:
        """
        Set a configuration value using dot notation.

        Args:
            key: Configuration key (e.g., 'vector_db.index_name')
            value: Value to set
        """
        keys = key.split(".")
        config = self.config

        for k in keys[:-1]:
            if k not in config:
                config[k] = {}
            config = config[k]

        config[keys[-1]] = value

    def get_section(self, section: str) -> Dict[str, Any]:
        """
        Get an entire configuration section.

        Args:
            section: Section name

        Returns:
            Configuration section dictionary
        """
        return self.config.get(section, {})

    def reload(self) -> None:
        """Reload configuration from file."""
        self.config = self._load_config()
        self.logger.info("Configuration reloaded")
src/utils/error_handler.py
ADDED
@@ -0,0 +1,383 @@
"""
Error Handler Module

This module provides centralized error handling and logging
for the RAG AI system.
"""

import logging
import traceback
import functools
from typing import Any, Callable, Dict, Optional, Type, Union
from enum import Enum


class ErrorType(Enum):
    """Enumeration of error types in the system."""

    DOCUMENT_PROCESSING = "document_processing"
    URL_PROCESSING = "url_processing"
    EMBEDDING_GENERATION = "embedding_generation"
    VECTOR_STORAGE = "vector_storage"
    QUERY_PROCESSING = "query_processing"
    RESPONSE_GENERATION = "response_generation"
    API_ERROR = "api_error"
    CONFIGURATION = "configuration"
    UI_ERROR = "ui_error"
    UNKNOWN = "unknown"


class RAGError(Exception):
    """Base exception class for RAG AI system errors."""

    def __init__(
        self,
        message: str,
        error_type: ErrorType = ErrorType.UNKNOWN,
        details: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize RAGError.

        Args:
            message: Error message
            error_type: Type of error
            details: Additional error details
        """
        super().__init__(message)
        self.error_type = error_type
        self.details = details or {}
        self.message = message


class DocumentProcessingError(RAGError):
    """Exception for document processing errors."""

    def __init__(
        self,
        message: str,
        file_path: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        details = details or {}
        if file_path:
            details["file_path"] = file_path
        super().__init__(message, ErrorType.DOCUMENT_PROCESSING, details)


class URLProcessingError(RAGError):
    """Exception for URL processing errors."""

    def __init__(
        self,
        message: str,
        url: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        details = details or {}
        if url:
            details["url"] = url
        super().__init__(message, ErrorType.URL_PROCESSING, details)


class EmbeddingError(RAGError):
    """Exception for embedding generation errors."""

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        super().__init__(message, ErrorType.EMBEDDING_GENERATION, details)


class VectorStorageError(RAGError):
    """Exception for vector storage errors."""

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        super().__init__(message, ErrorType.VECTOR_STORAGE, details)


class QueryProcessingError(RAGError):
    """Exception for query processing errors."""

    def __init__(
        self,
        message: str,
        query: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        details = details or {}
        if query:
            details["query"] = query
        super().__init__(message, ErrorType.QUERY_PROCESSING, details)


class ResponseGenerationError(RAGError):
    """Exception for response generation errors."""

    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
        super().__init__(message, ErrorType.RESPONSE_GENERATION, details)


class APIError(RAGError):
    """Exception for API-related errors."""

    def __init__(
        self,
        message: str,
        api_name: Optional[str] = None,
        status_code: Optional[int] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        details = details or {}
        if api_name:
            details["api_name"] = api_name
        if status_code:
            details["status_code"] = status_code
        super().__init__(message, ErrorType.API_ERROR, details)


class ConfigurationError(RAGError):
    """Exception for configuration errors."""

    def __init__(
        self,
        message: str,
        config_key: Optional[str] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        details = details or {}
        if config_key:
            details["config_key"] = config_key
        super().__init__(message, ErrorType.CONFIGURATION, details)


class ErrorHandler:
    """
    Centralized error handler for the RAG AI system.

    Features:
    - Error logging with context
    - Error categorization
    - Error recovery suggestions
    - Performance monitoring
    """

    def __init__(self, logger_name: str = __name__):
        """
        Initialize the ErrorHandler.

        Args:
            logger_name: Name for the logger instance
        """
        self.logger = logging.getLogger(logger_name)
        self.error_counts = {}

    def handle_error(
        self, error: Exception, context: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Handle an error with logging and context.

        Args:
            error: The exception that occurred
            context: Additional context information

        Returns:
            Dictionary containing error information
        """
        context = context or {}

        # Determine error type
        if isinstance(error, RAGError):
            error_type = error.error_type
            error_details = error.details
        else:
            error_type = ErrorType.UNKNOWN
            error_details = {}

        # Create error info
        error_info = {
            "type": error_type.value,
            "message": str(error),
            "details": error_details,
            "context": context,
            "traceback": traceback.format_exc(),
        }

        # Log the error
        self._log_error(error_info)

        # Update error counts
        self._update_error_counts(error_type)

        # Add recovery suggestions
        error_info["recovery_suggestions"] = self._get_recovery_suggestions(error_type)

        return error_info

    def _log_error(self, error_info: Dict[str, Any]) -> None:
        """
        Log error information.

        Args:
            error_info: Error information dictionary
        """
        error_type = error_info["type"]
        message = error_info["message"]
        context = error_info.get("context", {})

        log_message = f"[{error_type.upper()}] {message}"

        if context:
            context_str = ", ".join([f"{k}={v}" for k, v in context.items()])
            log_message += f" | Context: {context_str}"

        self.logger.error(log_message)

        # Log traceback at debug level
        if error_info.get("traceback"):
            self.logger.debug(f"Traceback: {error_info['traceback']}")

    def _update_error_counts(self, error_type: ErrorType) -> None:
        """
        Update error count statistics.

        Args:
            error_type: Type of error that occurred
        """
        if error_type not in self.error_counts:
            self.error_counts[error_type] = 0
        self.error_counts[error_type] += 1

    def _get_recovery_suggestions(self, error_type: ErrorType) -> list:
        """
        Get recovery suggestions for an error type.

        Args:
            error_type: Type of error

        Returns:
            List of recovery suggestions
        """
        suggestions = {
            ErrorType.DOCUMENT_PROCESSING: [
                "Check if the document format is supported",
                "Verify the document is not corrupted",
                "Ensure sufficient disk space for processing",
            ],
            ErrorType.URL_PROCESSING: [
                "Verify the URL is accessible",
                "Check internet connectivity",
                "Ensure the website allows scraping",
            ],
            ErrorType.EMBEDDING_GENERATION: [
                "Check Gemini API key configuration",
                "Verify API quota and rate limits",
                "Ensure text content is not empty",
            ],
            ErrorType.VECTOR_STORAGE: [
                "Check Pinecone API key configuration",
                "Verify Pinecone index exists",
                "Check vector dimensions match index configuration",
            ],
            ErrorType.QUERY_PROCESSING: [
                "Ensure query is not empty",
                "Check if knowledge base has content",
                "Verify embedding generation is working",
            ],
            ErrorType.RESPONSE_GENERATION: [
                "Check language model configuration",
                "Verify retrieved context is valid",
                "Ensure API keys are configured",
            ],
            ErrorType.API_ERROR: [
                "Check API key validity",
                "Verify network connectivity",
                "Check API rate limits and quotas",
            ],
            ErrorType.CONFIGURATION: [
                "Check configuration file syntax",
                "Verify all required settings are present",
                "Ensure environment variables are set",
            ],
            ErrorType.UI_ERROR: [
                "Refresh the page",
                "Check browser compatibility",
                "Verify Gradio is properly installed",
            ],
        }

        return suggestions.get(error_type, ["Contact support for assistance"])

    def get_error_statistics(self) -> Dict[str, Any]:
        """
        Get error statistics.

        Returns:
            Dictionary containing error statistics
        """
        total_errors = sum(self.error_counts.values())

        return {
            "total_errors": total_errors,
            "error_counts": {
                error_type.value: count
                for error_type, count in self.error_counts.items()
            },
            "most_common_error": (
                max(self.error_counts.items(), key=lambda x: x[1])[0].value
                if self.error_counts
                else None
            ),
        }


def error_handler(
    error_type: ErrorType = ErrorType.UNKNOWN, context: Optional[Dict[str, Any]] = None
):
    """
    Decorator for automatic error handling.

    Args:
        error_type: Type of error to handle
        context: Additional context information

    Returns:
        Decorated function
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            handler = ErrorHandler()
            try:
                return func(*args, **kwargs)
            except Exception as e:
                error_context = context or {}
                error_context.update(
                    {
                        "function": func.__name__,
                        "args": str(args)[:100],  # Truncate for logging
                        "kwargs": str(kwargs)[:100],
                    }
                )

                error_info = handler.handle_error(e, error_context)

                # Re-raise as appropriate RAG error type
                if error_type == ErrorType.DOCUMENT_PROCESSING:
                    raise DocumentProcessingError(str(e), details=error_info)
                elif error_type == ErrorType.URL_PROCESSING:
                    raise URLProcessingError(str(e), details=error_info)
                elif error_type == ErrorType.EMBEDDING_GENERATION:
                    raise EmbeddingError(str(e), details=error_info)
                elif error_type == ErrorType.VECTOR_STORAGE:
                    raise VectorStorageError(str(e), details=error_info)
                elif error_type == ErrorType.QUERY_PROCESSING:
                    raise QueryProcessingError(str(e), details=error_info)
                elif error_type == ErrorType.RESPONSE_GENERATION:
                    raise ResponseGenerationError(str(e), details=error_info)
                else:
                    raise RAGError(str(e), error_type, error_info)

        return wrapper

    return decorator
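A minimal sketch of the `error_handler` decorator above: any exception raised inside the wrapped function is logged with call context and re-raised as the matching `RAGError` subclass. The function name and file path here are hypothetical, introduced only for illustration:

# Illustrative sketch; parse_document and report.pdf are hypothetical.
from utils.error_handler import error_handler, ErrorType, DocumentProcessingError

@error_handler(ErrorType.DOCUMENT_PROCESSING)
def parse_document(path: str) -> str:
    raise ValueError(f"unreadable file: {path}")

try:
    parse_document("report.pdf")
except DocumentProcessingError as e:
    # e.details carries the handler's error_info dict, including suggestions
    print(e.error_type.value, e.details.get("recovery_suggestions"))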
src/utils/settings_manager.py
ADDED
@@ -0,0 +1,676 @@
"""
Settings Manager Module

This module provides secure environment variable management with UI integration,
supporting both cache and .env file storage options.

Features:
- 🔐 Secure API key handling with masking
- ⚡ Real-time validation and testing
- 💾 Dual storage backends (cache + .env file)
- 🛡️ Input sanitization and validation
- 🔄 Live system updates
"""

import os
import re
import logging
import json
import time
from typing import Dict, Any, Optional, Tuple, List
from pathlib import Path
from datetime import datetime
import tempfile


class SettingsManager:
    """
    Manages environment variables with secure storage and validation.

    Features:
    - Secure API key masking and validation
    - Real-time connection testing
    - Cache and .env file storage options
    - Integration with existing ConfigManager
    """

    def __init__(self, config_manager=None):
        """
        Initialize the SettingsManager.

        Args:
            config_manager: Optional ConfigManager instance for integration
        """
        self.logger = logging.getLogger(__name__)
        self.config_manager = config_manager

        # 🔧 Cache storage for temporary settings
        self._cache_storage = {}

        # 📁 Project root for .env file
        self.project_root = Path(__file__).parent.parent.parent
        self.env_file_path = self.project_root / ".env"

        # 🛡️ Supported environment variables with validation rules
        self.supported_env_vars = {
            "GEMINI_API_KEY": {
                "required": True,
                "description": "Google Gemini API Key for embeddings and LLM",
                "format": r"^AIzaSy[A-Za-z0-9_-]{33}$",
                "mask": True,
                "test_function": self._test_gemini_connection,
                "placeholder": "AIzaSy...",
                "help_url": "https://aistudio.google.com/",
            },
            "PINECONE_API_KEY": {
                "required": False,
                "description": "Pinecone API Key for vector database",
                "format": r"^pc-[A-Za-z0-9]{32}$",
                "mask": True,
                "test_function": self._test_pinecone_connection,
                "placeholder": "pc-...",
                "help_url": "https://www.pinecone.io/",
            },
            "OPENAI_API_KEY": {
                "required": False,
                "description": "OpenAI API Key for alternative LLM",
                "format": r"^sk-[A-Za-z0-9]{48}$",
                "mask": True,
                "test_function": self._test_openai_connection,
                "placeholder": "sk-...",
                "help_url": "https://platform.openai.com/api-keys",
            },
            "TAVILY_API_KEY": {
                "required": False,
                "description": "Tavily API Key for live web search",
                "format": r"^tvly-[A-Za-z0-9-]{20,50}$",
                "mask": True,
                "test_function": self._test_tavily_connection,
                "placeholder": "tvly-dev-...",
                "help_url": "https://app.tavily.com/sign-in",
            },
            "PINECONE_ENVIRONMENT": {
                "required": False,
                "description": "Pinecone environment region",
                "format": r"^[a-z0-9-]+$",
                "mask": False,
                "default": "us-east-1",
                "placeholder": "us-east-1",
                "options": [
                    "us-east-1",
                    "us-west1-gcp",
                    "eu-west1-gcp",
                    "asia-southeast1-gcp",
                ],
            },
            "PINECONE_INDEX_NAME": {
                "required": False,
                "description": "Pinecone index name",
                "format": r"^[a-z0-9-]+$",
                "mask": False,
                "default": "rag-ai-index",
                "placeholder": "rag-ai-index",
            },
            "GRADIO_SHARE": {
                "required": False,
                "description": "Enable Gradio public sharing",
                "format": r"^(true|false)$",
                "mask": False,
                "default": "false",
                "options": ["true", "false"],
            },
            "PORT": {
                "required": False,
                "description": "Server port number",
                "format": r"^[1-9][0-9]{3,4}$",
                "mask": False,
                "default": "7860",
                "placeholder": "7860",
            },
        }

        self.logger.info("SettingsManager initialized successfully")
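
A quick note on the "format" fields above: they are ordinary regexes applied with re.match by _validate_format further down, so a key is accepted only if it has the expected prefix and length. Illustrative check with a fabricated key:

import re

gemini_pattern = r"^AIzaSy[A-Za-z0-9_-]{33}$"
print(bool(re.match(gemini_pattern, "AIzaSy" + "x" * 33)))  # True: right prefix, 33 chars
print(bool(re.match(gemini_pattern, "AIzaSy-short")))       # False: wrong length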

    def get_current_settings(self) -> Dict[str, Any]:
        """
        Get current environment variable settings with status.

        Returns:
            Dictionary with current settings and their status
        """
        settings = {}

        for var_name, config in self.supported_env_vars.items():
            # 🔍 Get value from cache, environment, or default
            value = self._get_env_value(var_name)

            settings[var_name] = {
                "value": (
                    self._mask_value(value, config.get("mask", False)) if value else ""
                ),
                "raw_value": value or "",
                "is_set": bool(value),
                "is_valid": (
                    self._validate_format(value, config.get("format"))
                    if value
                    else False
                ),
                "is_required": config.get("required", False),
                "description": config.get("description", ""),
                "placeholder": config.get("placeholder", ""),
                "help_url": config.get("help_url", ""),
                "options": config.get("options", []),
                "source": self._get_value_source(var_name),
                "last_tested": self._cache_storage.get(f"{var_name}_last_tested"),
                "test_status": self._cache_storage.get(
                    f"{var_name}_test_status", "untested"
                ),
            }

        return settings
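
A sketch of how a settings UI might render this structure (illustrative; the status glyphs are arbitrary):

manager = SettingsManager()
for name, info in manager.get_current_settings().items():
    flag = "✅" if info["is_valid"] else ("❌" if info["is_required"] else "➖")
    print(f"{flag} {name} = {info['value'] or '<unset>'} (source: {info['source']})")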

    def update_setting(
        self, var_name: str, value: str, storage_type: str = "cache"
    ) -> Dict[str, Any]:
        """
        Update an environment variable setting.

        Args:
            var_name: Environment variable name
            value: New value
            storage_type: "cache" or "env_file"

        Returns:
            Dictionary with operation result
        """
        try:
            if var_name not in self.supported_env_vars:
                return {
                    "success": False,
                    "error": f"Unsupported environment variable: {var_name}",
                    "status": "❌ Invalid variable",
                }

            config = self.supported_env_vars[var_name]

            # 🛡️ Validate format
            if value and not self._validate_format(value, config.get("format")):
                return {
                    "success": False,
                    "error": f"Invalid format for {var_name}",
                    "status": "❌ Invalid format",
                    "expected_format": config.get("placeholder", ""),
                }

            # 💾 Store based on storage type
            if storage_type == "cache":
                self._cache_storage[var_name] = value
                os.environ[var_name] = value  # ⚡ Update current session
                status_msg = "💾 Saved to cache"
            elif storage_type == "env_file":
                self._save_to_env_file(var_name, value)
                os.environ[var_name] = value  # ⚡ Update current session
                status_msg = "📁 Saved to .env file"
            else:
                return {
                    "success": False,
                    "error": f"Invalid storage type: {storage_type}",
                    "status": "❌ Invalid storage type",
                }

            # 🔄 Update config manager if available
            if self.config_manager:
                try:
                    self.config_manager.reload()
                except Exception as e:
                    self.logger.warning(f"Could not reload config manager: {e}")

            self.logger.info(f"Updated {var_name} via {storage_type}")

            return {
                "success": True,
                "status": f"✅ {status_msg}",
                "value": self._mask_value(value, config.get("mask", False)),
                "storage_type": storage_type,
                "timestamp": datetime.now().isoformat(),
            }

        except Exception as e:
            self.logger.error(f"Error updating {var_name}: {e}")
            return {"success": False, "error": str(e), "status": "❌ Update failed"}
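
Illustrative calls (values are placeholders):

manager = SettingsManager()
ok = manager.update_setting("PORT", "8080", storage_type="cache")
print(ok["status"])   # "✅ 💾 Saved to cache"
bad = manager.update_setting("PORT", "80")  # rejected by ^[1-9][0-9]{3,4}$
print(bad["status"])  # "❌ Invalid format"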

    def test_connection(self, var_name: str) -> Dict[str, Any]:
        """
        Test API connection for a given environment variable.

        Args:
            var_name: Environment variable name

        Returns:
            Dictionary with test results
        """
        try:
            if var_name not in self.supported_env_vars:
                return {
                    "success": False,
                    "error": f"Cannot test {var_name}: not supported",
                    "status": "❌ Not testable",
                }

            config = self.supported_env_vars[var_name]
            test_function = config.get("test_function")

            if not test_function:
                return {
                    "success": False,
                    "error": f"No test function available for {var_name}",
                    "status": "⚠️ No test available",
                }

            value = self._get_env_value(var_name)
            if not value:
                return {
                    "success": False,
                    "error": f"{var_name} is not set",
                    "status": "❌ Not configured",
                }

            # 🧪 Run the test
            self.logger.info(f"Testing connection for {var_name}")
            test_result = test_function(value)

            # 📊 Cache test results
            timestamp = datetime.now().isoformat()
            self._cache_storage[f"{var_name}_last_tested"] = timestamp
            self._cache_storage[f"{var_name}_test_status"] = (
                "success" if test_result["success"] else "failed"
            )

            return {**test_result, "timestamp": timestamp, "variable": var_name}

        except Exception as e:
            self.logger.error(f"Error testing {var_name}: {e}")
            error_result = {
                "success": False,
                "error": str(e),
                "status": "❌ Test failed",
                "timestamp": datetime.now().isoformat(),
            }

            # 📊 Cache failed test
            self._cache_storage[f"{var_name}_last_tested"] = error_result["timestamp"]
            self._cache_storage[f"{var_name}_test_status"] = "failed"

            return error_result
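
Illustrative call (only meaningful once a real key is set in the environment):

result = SettingsManager().test_connection("GEMINI_API_KEY")
print(result["status"])  # e.g. "✅ Gemini API connected" or "❌ Not configured"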

    def load_from_env_file(self) -> Dict[str, Any]:
        """
        Load settings from .env file.

        Returns:
            Dictionary with load results
        """
        try:
            if not self.env_file_path.exists():
                return {
                    "success": False,
                    "error": ".env file not found",
                    "status": "📁 No .env file found",
                    "loaded_count": 0,
                }

            loaded_vars = []

            with open(self.env_file_path, "r", encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if line and not line.startswith("#") and "=" in line:
                        try:
                            key, value = line.split("=", 1)
                            key = key.strip()
                            value = value.strip().strip("\"'")  # Remove quotes

                            if key in self.supported_env_vars:
                                os.environ[key] = value
                                loaded_vars.append(key)
                        except Exception as e:
                            self.logger.warning(
                                f"Error parsing line {line_num} in .env: {e}"
                            )

            # 🔄 Reload config manager
            if self.config_manager:
                try:
                    self.config_manager.reload()
                except Exception as e:
                    self.logger.warning(f"Could not reload config manager: {e}")

            return {
                "success": True,
                "status": f"✅ Loaded {len(loaded_vars)} variables from .env",
                "loaded_count": len(loaded_vars),
                "loaded_variables": loaded_vars,
            }

        except Exception as e:
            self.logger.error(f"Error loading from .env file: {e}")
            return {
                "success": False,
                "error": str(e),
                "status": "❌ Failed to load .env file",
                "loaded_count": 0,
            }
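
Note that the parser splits on the first "=" only, so values containing "=" survive, and one layer of surrounding quotes is stripped. Illustrative (the key value is fake):

line = 'GEMINI_API_KEY="AIzaSy...redacted"'
key, value = line.split("=", 1)
print(key.strip())                 # GEMINI_API_KEY
print(value.strip().strip("\"'"))  # AIzaSy...redacted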

    def clear_cache(self) -> Dict[str, Any]:
        """
        Clear cached settings.

        Returns:
            Dictionary with operation result
        """
        try:
            # 🗑️ Clear cache but preserve test results
            cached_vars = [
                key
                for key in self._cache_storage.keys()
                if key in self.supported_env_vars
            ]

            for var in cached_vars:
                if var in self._cache_storage:
                    del self._cache_storage[var]
                # Remove from current environment if it was cached
                if var in os.environ:
                    del os.environ[var]

            return {
                "success": True,
                "status": f"🗑️ Cleared {len(cached_vars)} cached variables",
                "cleared_count": len(cached_vars),
            }

        except Exception as e:
            self.logger.error(f"Error clearing cache: {e}")
            return {
                "success": False,
                "error": str(e),
                "status": "❌ Failed to clear cache",
            }

    def export_settings(self, include_sensitive: bool = False) -> Dict[str, Any]:
        """
        Export current settings for backup/sharing.

        Args:
            include_sensitive: Whether to include API keys (masked)

        Returns:
            Dictionary with exported settings
        """
        try:
            settings = self.get_current_settings()
            exported = {}

            for var_name, config in settings.items():
                var_config = self.supported_env_vars[var_name]

                # 🔐 Skip sensitive data if not requested
                if var_config.get("mask", False) and not include_sensitive:
                    continue

                exported[var_name] = {
                    "value": (
                        config["value"] if include_sensitive else config["raw_value"]
                    ),
                    "is_set": config["is_set"],
                    "source": config["source"],
                    "description": config["description"],
                }

            return {
                "success": True,
                "settings": exported,
                "export_timestamp": datetime.now().isoformat(),
                "include_sensitive": include_sensitive,
            }

        except Exception as e:
            self.logger.error(f"Error exporting settings: {e}")
            return {"success": False, "error": str(e)}
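
Illustrative use: dump the non-sensitive settings as JSON for a bug report (masked API keys are skipped entirely when include_sensitive is False):

import json

export = SettingsManager().export_settings(include_sensitive=False)
if export["success"]:
    print(json.dumps(export["settings"], indent=2))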

    # 🔧 Private helper methods

    def _get_env_value(self, var_name: str) -> Optional[str]:
        """Get environment variable value from cache or environment."""
        # Priority: cache > environment > default
        if var_name in self._cache_storage:
            return self._cache_storage[var_name]

        env_value = os.environ.get(var_name)
        if env_value:
            return env_value

        return self.supported_env_vars[var_name].get("default")

    def _get_value_source(self, var_name: str) -> str:
        """Determine the source of an environment variable value."""
        if var_name in self._cache_storage:
            return "cache"
        elif os.environ.get(var_name):
            return "environment"
        elif self.supported_env_vars[var_name].get("default"):
            return "default"
        else:
            return "unset"

    def _mask_value(self, value: str, should_mask: bool) -> str:
        """Mask sensitive values for display."""
        if not value or not should_mask:
            return value

        if len(value) <= 8:
            return "*" * len(value)

        return value[:4] + "*" * (len(value) - 8) + value[-4:]

    def _validate_format(self, value: str, format_pattern: Optional[str]) -> bool:
        """Validate value against format pattern."""
        if not format_pattern or not value:
            return True

        try:
            return bool(re.match(format_pattern, value))
        except Exception:
            return False
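
The masking rule keeps the first and last four characters of anything longer than eight. Illustrative, with a fabricated key:

m = SettingsManager()
print(m._mask_value("AIzaSy" + "x" * 33, True))  # "AIza" + 31 * "*" + "xxxx"
print(m._mask_value("short", True))              # "*****"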

    def _save_to_env_file(self, var_name: str, value: str):
        """Save environment variable to .env file."""
        env_vars = {}

        # 📖 Read existing .env file
        if self.env_file_path.exists():
            with open(self.env_file_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith("#") and "=" in line:
                        try:
                            key, val = line.split("=", 1)
                            env_vars[key.strip()] = val.strip().strip("\"'")
                        except Exception as e:
                            self.logger.warning(f"Error parsing line in .env: {e}")

        # ✏️ Update the variable
        env_vars[var_name] = value

        # 💾 Write back to file
        with open(self.env_file_path, "w", encoding="utf-8") as f:
            f.write("# Environment Variables for RAG AI System\n")
            f.write(f"# Generated on {datetime.now().isoformat()}\n\n")

            for key, val in env_vars.items():
                # 🔐 Quote values that contain spaces or special characters
                if " " in val or any(char in val for char in ["$", '"', "'"]):
                    f.write(f'{key}="{val}"\n')
                else:
                    f.write(f"{key}={val}\n")
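
The quoting branch keeps a round-trip parseable: a value containing spaces, "$", or quotes would be emitted as KEY="value". A .env file written by this method looks roughly like this (timestamp and values are examples):

# Environment Variables for RAG AI System
# Generated on 2025-06-01T12:00:00

PORT=7860
GRADIO_SHARE=false
PINECONE_INDEX_NAME=rag-ai-index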
517 |
+
# 🧪 API Testing Functions
|
518 |
+
|
519 |
+
# Cache for Gemini client to avoid recreating it
|
520 |
+
_gemini_client_cache = None
|
521 |
+
_gemini_client_key = None
|
522 |
+
_gemini_last_test_time = None
|
523 |
+
_gemini_test_cooldown = 10 # seconds between tests
|
524 |
+
|
525 |
+
def _test_gemini_connection(self, api_key: str) -> Dict[str, Any]:
|
526 |
+
"""Test Gemini API connection with caching and optimization."""
|
527 |
+
try:
|
528 |
+
# Check if we've tested this key recently
|
529 |
+
current_time = time.time()
|
530 |
+
if (
|
531 |
+
self._gemini_last_test_time
|
532 |
+
and api_key == self._gemini_client_key
|
533 |
+
and current_time - self._gemini_last_test_time
|
534 |
+
< self._gemini_test_cooldown
|
535 |
+
):
|
536 |
+
|
537 |
+
self.logger.info(
|
538 |
+
"Using cached Gemini test result (within cooldown period)"
|
539 |
+
)
|
540 |
+
return {
|
541 |
+
"success": True,
|
542 |
+
"status": "✅ Gemini API connected (cached)",
|
543 |
+
"details": "Using cached test result",
|
544 |
+
}
|
545 |
+
|
546 |
+
import google.generativeai as genai
|
547 |
+
|
548 |
+
# Use cached client if the API key is the same
|
549 |
+
if api_key == self._gemini_client_key and self._gemini_client_cache:
|
550 |
+
self.logger.info("Using cached Gemini client")
|
551 |
+
client = self._gemini_client_cache
|
552 |
+
else:
|
553 |
+
# Configure new client
|
554 |
+
genai.configure(api_key=api_key)
|
555 |
+
self._gemini_client_cache = genai
|
556 |
+
self._gemini_client_key = api_key
|
557 |
+
client = genai
|
558 |
+
|
559 |
+
# 🧪 Simple test call - use embedding API instead of GenerativeModel
|
560 |
+
# This is faster and more efficient for testing connection
|
561 |
+
test_result = client.embed_content(
|
562 |
+
model="gemini-embedding-exp-03-07",
|
563 |
+
content="test connection",
|
564 |
+
task_type="retrieval_document",
|
565 |
+
)
|
566 |
+
|
567 |
+
# Update last test time
|
568 |
+
self._gemini_last_test_time = current_time
|
569 |
+
|
570 |
+
if test_result and "embedding" in test_result:
|
571 |
+
return {
|
572 |
+
"success": True,
|
573 |
+
"status": "✅ Gemini API connected",
|
574 |
+
"details": "API key is valid and working",
|
575 |
+
}
|
576 |
+
else:
|
577 |
+
return {
|
578 |
+
"success": False,
|
579 |
+
"status": "❌ Gemini API failed",
|
580 |
+
"error": "No embedding in response",
|
581 |
+
}
|
582 |
+
|
583 |
+
except Exception as e:
|
584 |
+
return {
|
585 |
+
"success": False,
|
586 |
+
"status": "❌ Gemini connection failed",
|
587 |
+
"error": str(e),
|
588 |
+
}
|
589 |
+
|

    def _test_pinecone_connection(self, api_key: str) -> Dict[str, Any]:
        """Test Pinecone API connection."""
        try:
            from pinecone import Pinecone

            pc = Pinecone(api_key=api_key)

            # 🧪 Test by listing indexes
            indexes = pc.list_indexes()

            return {
                "success": True,
                "status": "✅ Pinecone API connected",
                "details": f"Found {len(indexes)} indexes",
            }

        except Exception as e:
            return {
                "success": False,
                "status": "❌ Pinecone connection failed",
                "error": str(e),
            }

    def _test_openai_connection(self, api_key: str) -> Dict[str, Any]:
        """Test OpenAI API connection."""
        try:
            import openai

            client = openai.OpenAI(api_key=api_key)

            # 🧪 Test with a simple completion
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hello"}],
                max_tokens=5,
            )

            if response and response.choices:
                return {
                    "success": True,
                    "status": "✅ OpenAI API connected",
                    "details": "API key is valid and working",
                }
            else:
                return {
                    "success": False,
                    "status": "❌ OpenAI API failed",
                    "error": "No response from API",
                }

        except Exception as e:
            return {
                "success": False,
                "status": "❌ OpenAI connection failed",
                "error": str(e),
            }

    def _test_tavily_connection(self, api_key: str) -> Dict[str, Any]:
        """Test Tavily API connection."""
        try:
            from tavily import TavilyClient

            # 🧪 Initialize client and test with a simple search
            client = TavilyClient(api_key=api_key)

            # Test with a minimal search query
            response = client.search(query="test", max_results=1, search_depth="basic")

            if response and isinstance(response, dict):
                return {
                    "success": True,
                    "status": "✅ Tavily API connected",
                    "details": "API key is valid and working",
                }
            else:
                return {
                    "success": False,
                    "status": "❌ Tavily API failed",
                    "error": "No valid response from API",
                }

        except Exception as e:
            return {
                "success": False,
                "status": "❌ Tavily connection failed",
                "error": str(e),
            }
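
End-to-end sketch tying the pieces together (illustrative; assumes the ConfigManager class from src/utils/config_manager.py can be constructed with no arguments):

from src.utils.config_manager import ConfigManager
from src.utils.settings_manager import SettingsManager

manager = SettingsManager(config_manager=ConfigManager())
manager.load_from_env_file()                      # pull supported keys from .env
check = manager.test_connection("GEMINI_API_KEY")
if not check["success"]:
    print(check["status"], "-", check.get("error", ""))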