Resolved Dropdown issue And MCP Server
Files changed:

- README.md +152 -1
- app.py +202 -443
- config.py +5 -0
- core/chunker.py +1 -0
- mcp_server.py +1 -1
- requirements.txt +4 -2
- services/llm_service.py +229 -127
README.md
CHANGED
@@ -6,8 +6,159 @@ colorTo: green
 sdk: gradio
 sdk_version: 5.32.0
 app_file: app.py
+tags:
+  - mcp-server-track
+  - Agents-MCP-Hackathon
 pinned: false
 license: mit
 ---

A powerful Model Context Protocol (MCP) server for intelligent content management with semantic search, summarization, and Q&A capabilities, powered by **OpenAI, Mistral AI, and Anthropic Claude**.

## 🎯 Features

### 🔧 MCP Tools Available

- **📄 Document Ingestion**: Upload and process documents (PDF, TXT, DOCX, images with OCR)
- **🔍 Semantic Search**: Find relevant content using natural language queries
- **📝 Summarization**: Generate summaries in different styles (concise, detailed, bullet points, executive)
- **🏷️ Tag Generation**: Automatically generate relevant tags for content
- **❓ Q&A System**: Ask questions about your documents using RAG (Retrieval-Augmented Generation)
- **📊 Categorization**: Classify content into predefined or custom categories
- **🔄 Batch Processing**: Process multiple documents at once
- **📈 Analytics**: Get insights and statistics about your content

### 🚀 Powered By

- **🧠 OpenAI GPT models** for powerful text generation and understanding
- **🔥 Mistral AI** for efficient text processing and analysis
- **🤖 Anthropic Claude** for advanced reasoning (available as a specific choice or fallback)
- **🔗 Sentence Transformers** for semantic embeddings
- **📚 FAISS** for fast similarity search
- **👁️ Tesseract OCR** for image text extraction
- **🎨 Gradio** for the user interface and MCP server functionality

**LLM Strategy**: When model selection is set to "auto", the agent picks the best available LLM for most generative tasks, prioritizing OpenAI, then Mistral, and finally Anthropic. Users can also specify a particular model family (e.g., "gpt-", "mistral-", "claude-").
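A minimal sketch of how this "auto" fallback order might look. The helper name `pick_provider` is illustrative; the actual selection logic lives in `services/llm_service.py` and may differ:

```python
# Minimal sketch of the "auto" provider fallback described above.
# The function name pick_provider is illustrative, not the project's exact API.
from config import Config

def pick_provider(model: str = "auto") -> str:
    """Return which LLM family to use for a generative task."""
    if model.startswith("gpt-"):
        return "openai"
    if model.startswith("mistral-"):
        return "mistral"
    if model.startswith("claude-"):
        return "anthropic"
    # "auto": prefer OpenAI, then Mistral, then Anthropic,
    # skipping providers whose API key is not configured.
    if Config.OPENAI_API_KEY:
        return "openai"
    if Config.MISTRAL_API_KEY:
        return "mistral"
    if Config.ANTHROPIC_API_KEY:
        return "anthropic"
    raise RuntimeError("No LLM API key configured")
```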

## 📋 Complete File Structure

intelligent-content-organizer/
├── app.py                        # Main Gradio app and MCP server
├── config.py                     # Configuration management
├── mcp_server.py                 # MCP server tools
├── requirements.txt              # Dependencies
├── README.md                     # Documentation
├── .gitignore                    # Git ignore rules
├── core/                         # Core processing logic
│   ├── __init__.py
│   ├── models.py                 # Data models
│   ├── document_parser.py        # Document processing
│   ├── text_preprocessor.py      # Text cleaning and processing
│   └── chunker.py                # Text chunking strategies
├── services/                     # Backend services
│   ├── __init__.py
│   ├── embedding_service.py      # Sentence Transformers integration
│   ├── llm_service.py            # OpenAI + Mistral + Anthropic integration
│   ├── ocr_service.py            # Mistral OCR integration
│   ├── vector_store_service.py   # FAISS vector storage
│   └── document_store_service.py # Document metadata storage
└── mcp_tools/                    # MCP tool definitions
    ├── __init__.py
    ├── ingestion_tool.py         # Document ingestion tool
    ├── search_tool.py            # Semantic search tool
    ├── generative_tool.py        # AI generation tool
    └── utils.py                  # Utility functions

## 🎯 Key Features Implemented

1. **Full MCP Server**: Complete implementation with all tools exposed
2. **Multi-Modal Processing**: PDF, TXT, DOCX, and image processing with OCR
3. **Advanced Search**: Semantic search with FAISS, filtering, and multi-query support
4. **AI-Powered Features**: Summarization, tagging, categorization, Q&A with RAG
5. **Production Ready**: Error handling, logging, caching, rate limiting
6. **Gradio UI**: Beautiful web interface for testing and direct use
7. **Multi-LLM Support**: OpenAI, Mistral, and Anthropic with fallbacks

## 🎥 Demo Video

[📹 Watch the demo video](https://your-demo-video-url.com)

*The demo shows the MCP server in action, demonstrating document ingestion, semantic search, and Q&A capabilities, utilizing the configured LLM providers.*

## 🛠️ Installation

### Prerequisites

- Python 3.9+
- API keys for OpenAI and Mistral AI; an Anthropic API key for Claude support

## 🔧 MCP Tools Reference

Tool parameters such as `model` accept `"auto"` or a specific model family prefix such as `"gpt-"`, `"mistral-"`, or `"claude-"`. An end-to-end usage sketch follows this reference.

- **ingest_document**
  - Process and index a document for searching.
  - **Parameters:**
    - `file_path` (string): Path to the document file (e.g., an uploaded file path).
    - `file_type` (string, optional): File type/extension (e.g., ".pdf", ".txt"). If not provided, it is inferred from `file_path`.
  - **Returns:**
    - `success` (boolean): Whether the operation succeeded.
    - `document_id` (string): Unique identifier for the processed document.
    - `chunks_created` (integer): Number of text chunks created.
    - `message` (string): Human-readable result message.

- **semantic_search**
  - Search through indexed content using natural language.
  - **Parameters:**
    - `query` (string): Search query.
    - `top_k` (integer, optional): Number of results to return (default: 5).
    - `filters` (object, optional): Search filters (e.g., `{"document_id": "some_id"}`).
  - **Returns:**
    - `success` (boolean): Whether the search succeeded.
    - `results` (array of objects): Search results, each with content and score.
    - `total_results` (integer): Number of results found.

- **summarize_content**
  - Generate a summary of provided content.
  - **Parameters:**
    - `content` (string, optional): Text content to summarize.
    - `document_id` (string, optional): ID of the document to summarize. Either `content` or `document_id` must be provided.
    - `style` (string, optional): Summary style: "concise", "detailed", "bullet_points", or "executive" (default: "concise").
    - `model` (string, optional): Specific LLM to use (e.g., "gpt-4o-mini", "mistral-large-latest", "auto"). Default: "auto".
  - **Returns:**
    - `success` (boolean): Whether summarization succeeded.
    - `summary` (string): Generated summary.
    - `original_length` (integer): Character length of the original content.
    - `summary_length` (integer): Character length of the summary.

- **generate_tags**
  - Generate relevant tags for content.
  - **Parameters:**
    - `content` (string, optional): Text content to tag.
    - `document_id` (string, optional): ID of the document to tag. Either `content` or `document_id` must be provided.
    - `max_tags` (integer, optional): Maximum number of tags (default: 5).
    - `model` (string, optional): Specific LLM to use. Default: "auto".
  - **Returns:**
    - `success` (boolean): Whether tag generation succeeded.
    - `tags` (array of strings): Generated tags.

- **answer_question**
  - Answer questions using RAG over your indexed content.
  - **Parameters:**
    - `question` (string): Question to answer.
    - `context_filter` (object, optional): Filters for context retrieval (e.g., `{"document_id": "some_id"}`).
    - `model` (string, optional): Specific LLM to use. Default: "auto".
  - **Returns:**
    - `success` (boolean): Whether question answering succeeded.
    - `answer` (string): Generated answer.
    - `sources` (array of objects): Source document chunks used for context, each with `document_id`, `chunk_id`, and `content`.
    - `confidence` (string, optional): Confidence level in the answer (LLM-dependent; may not always be present).
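As a hedged illustration, the same operations can be exercised in-process through the server object defined in `app.py` (the method names below come from the diff; the file path and query strings are made up for the example):

```python
# Illustrative only: drives the async methods that back the MCP tools.
# Assumes app.py is importable and at least one LLM API key is configured.
import asyncio
from app import mcp_server

async def demo():
    # Hypothetical document path for the example.
    ingested = await mcp_server.ingest_document_async("./docs/report.pdf", ".pdf")
    print(ingested["message"], ingested.get("document_id"))

    hits = await mcp_server.semantic_search_async("quarterly revenue", 3)
    print(f"{hits['total_results']} results")

    qa = await mcp_server.answer_question_async("What were the key findings?")
    if qa["success"]:
        print(qa["answer"])

asyncio.run(demo())
```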

## 📊 Performance

- Embedding Generation: ~100-500ms per document chunk
- Search: <50ms for most queries
- Summarization: 1-5s depending on content length
- Memory Usage: ~200-500MB base + ~1MB per 1000 document chunks
- Supported File Types: PDF, TXT, DOCX, PNG, JPG, JPEG, BMP, TIFF

app.py
CHANGED
@@ -33,7 +33,6 @@ class ContentOrganizerMCPServer:
     def __init__(self):
         # Initialize services
        logger.info("Initializing Content Organizer MCP Server...")
-
        self.vector_store = VectorStoreService()
        self.document_store = DocumentStoreService()
        self.embedding_service = EmbeddingService()
@@ -56,13 +55,12 @@ class ContentOrganizerMCPServer:
            llm_service=self.llm_service,
            search_tool=self.search_tool
        )
-
        # Track processing status
        self.processing_status = {}

        # Document cache for quick access
        self.document_cache = {}
-
        logger.info("Content Organizer MCP Server initialized successfully!")

    def run_async(self, coro):
@@ -72,7 +70,6 @@ class ContentOrganizerMCPServer:
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
-
        if loop.is_running():
            # If loop is already running, create a task
            import concurrent.futures
@@ -87,31 +84,22 @@ class ContentOrganizerMCPServer:
        try:
            task_id = str(uuid.uuid4())
            self.processing_status[task_id] = {"status": "processing", "progress": 0}
-
            result = await self.ingestion_tool.process_document(file_path, file_type, task_id)
-
            if result.get("success"):
                self.processing_status[task_id] = {"status": "completed", "progress": 100}
-                # Update document cache
                doc_id = result.get("document_id")
                if doc_id:
                    doc = await self.document_store.get_document(doc_id)
                    if doc:
                        self.document_cache[doc_id] = doc
-
                return result
            else:
                self.processing_status[task_id] = {"status": "failed", "error": result.get("error")}
                return result
-
        except Exception as e:
            logger.error(f"Document ingestion failed: {str(e)}")
-            return {
-
-                "error": str(e),
-                "message": "Failed to process document"
-            }
-
@@ -124,7 +112,6 @@ class ContentOrganizerMCPServer:
            if doc:
                self.document_cache[document_id] = doc
                return doc.content
-
            return None
        except Exception as e:
            logger.error(f"Error getting document content: {str(e)}")
@@ -134,149 +121,78 @@ class ContentOrganizerMCPServer:
        """MCP Tool: Perform semantic search"""
        try:
            results = await self.search_tool.search(query, top_k, filters)
-            return {
-                "success": True,
-                "query": query,
-                "results": [result.to_dict() for result in results],
-                "total_results": len(results)
-            }
        except Exception as e:
            logger.error(f"Semantic search failed: {str(e)}")
-            return {
-                "success": False,
-                "error": str(e),
-                "query": query,
-                "results": []
-            }

    async def summarize_content_async(self, content: str = None, document_id: str = None, style: str = "concise") -> Dict[str, Any]:
-        """MCP Tool: Summarize content or document"""
        try:
-            # If document_id provided, get content from document
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
-
            if not content or not content.strip():
                return {"success": False, "error": "No content provided for summarization"}
-
-            # Truncate content if too long (for API limits)
            max_content_length = 4000
            if len(content) > max_content_length:
                content = content[:max_content_length] + "..."
-
            summary = await self.generative_tool.summarize(content, style)
-            return {
-                "success": True,
-                "summary": summary,
-                "original_length": len(content),
-                "summary_length": len(summary),
-                "style": style,
-                "document_id": document_id
-            }
        except Exception as e:
            logger.error(f"Summarization failed: {str(e)}")
-            return {
-                "success": False,
-                "error": str(e)
-            }

    async def generate_tags_async(self, content: str = None, document_id: str = None, max_tags: int = 5) -> Dict[str, Any]:
        """MCP Tool: Generate tags for content"""
        try:
-            # If document_id provided, get content from document
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
-
            if not content or not content.strip():
                return {"success": False, "error": "No content provided for tag generation"}
-
            tags = await self.generative_tool.generate_tags(content, max_tags)
-
-            # Update document tags if document_id provided
            if document_id and document_id != "none" and tags:
                await self.document_store.update_document_metadata(document_id, {"tags": tags})
-
-            return {
-                "success": True,
-                "tags": tags,
-                "content_length": len(content),
-                "document_id": document_id
-            }
        except Exception as e:
            logger.error(f"Tag generation failed: {str(e)}")
-            return {
-                "success": False,
-                "error": str(e)
-            }

    async def answer_question_async(self, question: str, context_filter: Optional[Dict] = None) -> Dict[str, Any]:
-        """MCP Tool: Answer questions using RAG"""
        try:
-            # Search for relevant context
            search_results = await self.search_tool.search(question, top_k=5, filters=context_filter)
-
            if not search_results:
-                return {
-                    "success": False,
-                    "error": "No relevant context found in your documents. Please make sure you have uploaded relevant documents.",
-                    "question": question
-                }
-
-            # Generate answer using context
            answer = await self.generative_tool.answer_question(question, search_results)
-
-            return {
-                "success": True,
-                "question": question,
-                "answer": answer,
-                "sources": [result.to_dict() for result in search_results],
-                "confidence": "high" if len(search_results) >= 3 else "medium"
-            }
        except Exception as e:
            logger.error(f"Question answering failed: {str(e)}")
-            return {
-                "success": False,
-                "error": str(e),
-                "question": question
-            }

    def list_documents_sync(self, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
-        """List stored documents"""
        try:
            documents = self.run_async(self.document_store.list_documents(limit, offset))
-            return {
-                "success": True,
-                "documents": [doc.to_dict() for doc in documents],
-                "total": len(documents)
-            }
        except Exception as e:
-            return {
-                "success": False,
-                "error": str(e)
-            }

-# Initialize the MCP server
mcp_server = ContentOrganizerMCPServer()

-# Helper functions
def get_document_list():
-    """Get list of documents for display"""
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if result["success"]:
            if result["documents"]:
-                for i,
-                if
-                return
            else:
                return "No documents in library yet. Upload some documents to get started!"
        else:
@@ -285,17 +201,10 @@ def get_document_list():
        return f"Error: {str(e)}"

def get_document_choices():
-    """Get document choices for dropdown"""
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if result["success"] and result["documents"]:
-            choices = []
-            for doc in result["documents"]:
-                # Create label with filename and shortened ID
-                choice_label = f"{doc['filename']} ({doc['id'][:8]}...)"
-                # Use full document ID as the value
-                choices.append((choice_label, doc['id']))
-
            logger.info(f"Generated {len(choices)} document choices")
            return choices
        return []
@@ -303,78 +212,82 @@ def get_document_choices():
        logger.error(f"Error getting document choices: {str(e)}")
        return []

-
def upload_and_process_file(file):
-    """Gradio interface for file upload"""
    if file is None:
    try:
-        # Get file path
        file_path = file.name if hasattr(file, 'name') else str(file)
-        file_type = Path(file_path).suffix.lower()
-        logger.info(f"Processing file: {file_path}")
-        # Process document
        result = mcp_server.run_async(mcp_server.ingest_document_async(file_path, file_type))

        if result["success"]:
-            # Get updated document list and choices
-            doc_list = get_document_list()
-            doc_choices = get_document_choices()
-
            return (
-                f"✅ Success: {result['message']}\nDocument ID: {result['document_id']}\nChunks created: {result['chunks_created']}",
                result["document_id"],
-                gr.update(choices=
-                gr.update(choices=
-                gr.update(choices=
-                gr.update(choices=doc_choices)
            )
        else:
            return (
-                f"❌ Error: {result.get('error', 'Unknown error')}",
-                gr.update(choices=
-                gr.update(choices=
-                gr.update(choices=get_document_choices()),
-                gr.update(choices=get_document_choices())
            )
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        return (
-            f"❌ Error: {str(e)}",
-            gr.update(choices=
-            gr.update(choices=
-            gr.update(choices=get_document_choices()),
-            gr.update(choices=get_document_choices())
        )

def perform_search(query, top_k):
-    """Gradio interface for search"""
    if not query.strip():
        return "Please enter a search query"
-
    try:
        result = mcp_server.run_async(mcp_server.semantic_search_async(query, int(top_k)))
-
        if result["success"]:
            if result["results"]:
-                for i,
-                if 'document_filename' in
-                return
            else:
                return f"No results found for: '{query}'\n\nMake sure you have uploaded relevant documents first."
        else:
@@ -384,19 +297,10 @@ def perform_search(query, top_k):
        return f"❌ Error: {str(e)}"

def summarize_document(doc_choice, custom_text, style):
-    """Gradio interface for summarization"""
    try:
-        # Debug logging
        logger.info(f"Summarize called with doc_choice: {doc_choice}, type: {type(doc_choice)}")

-        # Get document ID from dropdown choice
-        document_id = None
-        if doc_choice and doc_choice != "none" and doc_choice != "":
-            # When Gradio dropdown returns a choice, it returns the value part of the (label, value) tuple
-            document_id = doc_choice
-        logger.info(f"Using document ID: {document_id}")
-
-        # Use custom text if provided, otherwise use document
        if custom_text and custom_text.strip():
            logger.info("Using custom text for summarization")
            result = mcp_server.run_async(mcp_server.summarize_content_async(content=custom_text, style=style))
@@ -407,14 +311,14 @@ def summarize_document(doc_choice, custom_text, style):
            return "Please select a document from the dropdown or enter text to summarize"

        if result["success"]:
            if result.get('document_id'):
-                return
        else:
            return f"❌ Summarization failed: {result['error']}"
    except Exception as e:
@@ -422,19 +326,10 @@ def summarize_document(doc_choice, custom_text, style):
        return f"❌ Error: {str(e)}"

def generate_tags_for_document(doc_choice, custom_text, max_tags):
-    """Gradio interface for tag generation"""
    try:
-        # Debug logging
        logger.info(f"Generate tags called with doc_choice: {doc_choice}, type: {type(doc_choice)}")
-
-        document_id = None
-        if doc_choice and doc_choice != "none" and doc_choice != "":
-            # When Gradio dropdown returns a choice, it returns the value part of the (label, value) tuple
-            document_id = doc_choice
-        logger.info(f"Using document ID: {document_id}")
-
-        # Use custom text if provided, otherwise use document
        if custom_text and custom_text.strip():
            logger.info("Using custom text for tag generation")
            result = mcp_server.run_async(mcp_server.generate_tags_async(content=custom_text, max_tags=int(max_tags)))
@@ -446,14 +341,14 @@ def generate_tags_for_document(doc_choice, custom_text, max_tags):

        if result["success"]:
            tags_str = ", ".join(result["tags"])
            if result.get('document_id'):
-                return
        else:
            return f"❌ Tag generation failed: {result['error']}"
    except Exception as e:
@@ -461,310 +356,174 @@ def generate_tags_for_document(doc_choice, custom_text, max_tags):
        return f"❌ Error: {str(e)}"

def ask_question(question):
-    """Gradio interface for Q&A"""
    if not question.strip():
        return "Please enter a question"
-
    try:
        result = mcp_server.run_async(mcp_server.answer_question_async(question))
-
        if result["success"]:
-            for i,
-            filename =
-            return
        else:
            return f"❌ {result.get('error', 'Failed to answer question')}"
    except Exception as e:
        return f"❌ Error: {str(e)}"

def delete_document_from_library(document_id):
    try:
        else:
-            msg
-        return f"❌ Error: {str(e)}", get_document_list(), gr.update(choices=get_document_choices()), gr.update(choices=get_document_choices()), gr.update(choices=get_document_choices()), gr.update(choices=get_document_choices())

-def refresh_library():
-    """Refresh the document library display"""
-    doc_list = get_document_list()
-    doc_choices = get_document_choices()
-    return doc_list, gr.update(choices=doc_choices), gr.update(choices=doc_choices), gr.update(choices=doc_choices), gr.update(choices=doc_choices)

def create_gradio_interface():
    with gr.Blocks(title="🧠 Intelligent Content Organizer MCP Agent", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🧠 Intelligent Content Organizer MCP Agent
        A powerful MCP (Model Context Protocol) server for intelligent content management with semantic search,
-        summarization, and Q&A capabilities
        ## 🚀 Quick Start:
-        1. **
-        2. **
-        3. **
-        4. **
        """)

-        # State components for dropdowns
-        with gr.Row(visible=False):
-            doc_dropdown_sum = gr.Dropdown(label="Hidden", choices=get_document_choices())
-            doc_dropdown_tag = gr.Dropdown(label="Hidden", choices=get_document_choices())
-            delete_doc_dropdown = gr.Dropdown(label="Hidden", choices=get_document_choices())

        with gr.Tabs():
-            # 📚 Document Library Tab
            with gr.Tab("📚 Document Library"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Your Document Collection")
-                        refresh_btn = gr.Button("🔄 Refresh Library", variant="secondary")
-                        delete_doc_dropdown_visible = gr.Dropdown(
-                            label="Select Document to Delete",
-                            choices=get_document_choices(),
-                            value=None,
-                            interactive=True,
-                            allow_custom_value=False
-                        )
                        delete_btn = gr.Button("🗑️ Delete Selected Document", variant="stop")

-                refresh_btn.click(
-                    fn=refresh_library,
-                    outputs=[document_list, delete_doc_dropdown_visible, doc_dropdown_sum, doc_dropdown_tag, delete_doc_dropdown]
-                )
-                delete_btn.click(
-                    delete_document_from_library,
-                    inputs=[delete_doc_dropdown_visible],
-                    outputs=[delete_output, document_list, delete_doc_dropdown_visible, doc_dropdown_sum, doc_dropdown_tag, delete_doc_dropdown]
-                )

-            # 📄 Upload Documents Tab
            with gr.Tab("📄 Upload Documents"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Add Documents to Your Library")
-                            file_types=[".pdf", ".txt", ".docx", ".png", ".jpg", ".jpeg"],
-                            type="filepath"
-                        )
-                        upload_btn = gr.Button("🚀 Process & Add to Library", variant="primary", size="lg")
                    with gr.Column():
-                        doc_id_output = gr.Textbox(
-                            label="Document ID",
-                            placeholder="Document ID will appear here after processing..."
-                        )

-                upload_btn.click(
-                    upload_and_process_file,
-                    inputs=[file_input],
-                    outputs=[upload_output, doc_id_output, document_list, delete_doc_dropdown_visible, doc_dropdown_sum, doc_dropdown_tag, delete_doc_dropdown]
-                )

-            # 🔍 Search Documents Tab
            with gr.Tab("🔍 Search Documents"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Search Your Document Library")
-                        search_top_k = gr.Slider(
-                            label="Number of Results",
-                            minimum=1,
-                            maximum=20,
-                            value=5,
-                            step=1
-                        )
-                        search_btn = gr.Button("🔍 Search Library", variant="primary", size="lg")
                    with gr.Column(scale=2):

-                search_btn.click(
-                    perform_search,
-                    inputs=[search_query, search_top_k],
-                    outputs=[search_output]
-                )

-            # 📝 Summarize Tab
            with gr.Tab("📝 Summarize"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Generate Document Summaries")
-                        summary_text = gr.Textbox(
-                            label="Or Paste Text to Summarize",
-                            placeholder="Paste any text here to summarize...",
-                            lines=8
-                        )
-                        summary_style = gr.Dropdown(
-                            label="Summary Style",
-                            choices=["concise", "detailed", "bullet_points", "executive"],
-                            value="concise",
-                            info="Choose how you want the summary formatted"
-                        )
-                        summarize_btn = gr.Button("📝 Generate Summary", variant="primary", size="lg")
                    with gr.Column():

-                summarize_btn.click(
-                    summarize_document,
-                    inputs=[doc_dropdown_sum_visible, summary_text, summary_style],
-                    outputs=[summary_output]
-                )

-            # 🏷️ Generate Tags Tab
            with gr.Tab("🏷️ Generate Tags"):
                with gr.Row():
                    with gr.Column():
-                        gr.Markdown("###
-                        tag_text = gr.Textbox(
-                            label="Or Paste Text to Generate Tags",
-                            placeholder="Paste any text here to generate tags...",
-                            lines=8
-                        )
-                        max_tags = gr.Slider(
-                            label="Number of Tags",
-                            minimum=3,
-                            maximum=15,
-                            value=5,
-                            step=1
-                        )
-                        tag_btn = gr.Button("🏷️ Generate Tags", variant="primary", size="lg")
                    with gr.Column():

-                tag_btn.click(
-                    generate_tags_for_document,
-                    inputs=[doc_dropdown_tag_visible, tag_text, max_tags],
-                    outputs=[tag_output]
-                )

-            # ❓ Ask Questions Tab
            with gr.Tab("❓ Ask Questions"):
                with gr.Row():
                    with gr.Column():
-                        gr.Markdown("""
-                        ### Ask Questions About Your Documents
                        The AI will search through all your uploaded documents to find relevant information
-                        and provide comprehensive answers with sources.
-                        """)
-                            label="Your Question",
-                            placeholder="Ask anything about your documents...",
-                            lines=3
-                        )
-                        qa_btn = gr.Button("❓ Get Answer", variant="primary", size="lg")
                    with gr.Column():

-                qa_btn.click(
-                    ask_question,
-                    inputs=[qa_question],
-                    outputs=[qa_output]
-                )

-        # Update hidden dropdowns when visible ones change
-        doc_dropdown_sum_visible.change(
-            lambda x: x,
-            inputs=[doc_dropdown_sum_visible],
-            outputs=[doc_dropdown_sum]
-        )

-            inputs=[doc_dropdown_tag_visible],
-            outputs=[doc_dropdown_tag]
-        )

-            inputs=[delete_doc_dropdown_visible],
-            outputs=[delete_doc_dropdown]
-        )

    return interface

-# Create and launch the interface
if __name__ == "__main__":
-    # Launch with proper configuration for Hugging Face Spaces
-    interface.launch(mcp_server=True)
    def __init__(self):
        # Initialize services
        logger.info("Initializing Content Organizer MCP Server...")
        self.vector_store = VectorStoreService()
        self.document_store = DocumentStoreService()
        self.embedding_service = EmbeddingService()

            llm_service=self.llm_service,
            search_tool=self.search_tool
        )
+
        # Track processing status
        self.processing_status = {}

        # Document cache for quick access
        self.document_cache = {}
        logger.info("Content Organizer MCP Server initialized successfully!")

    def run_async(self, coro):

        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            if loop.is_running():
                # If loop is already running, create a task
                import concurrent.futures
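The diff only shows part of run_async. As a hedged reconstruction, a helper of this shape typically runs the coroutine on a worker thread when an event loop is already running; this is a sketch, not the project's exact code:

```python
import asyncio
import concurrent.futures

def run_async(self, coro):
    """Run an async coroutine from synchronous Gradio callbacks (illustrative sketch)."""
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    if loop.is_running():
        # A loop is already running (e.g. inside Gradio); execute the coroutine
        # on a separate thread with its own event loop and block for the result.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()
    return loop.run_until_complete(coro)
```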
        try:
            task_id = str(uuid.uuid4())
            self.processing_status[task_id] = {"status": "processing", "progress": 0}
            result = await self.ingestion_tool.process_document(file_path, file_type, task_id)
            if result.get("success"):
                self.processing_status[task_id] = {"status": "completed", "progress": 100}
                doc_id = result.get("document_id")
                if doc_id:
                    doc = await self.document_store.get_document(doc_id)
                    if doc:
                        self.document_cache[doc_id] = doc
                return result
            else:
                self.processing_status[task_id] = {"status": "failed", "error": result.get("error")}
                return result
        except Exception as e:
            logger.error(f"Document ingestion failed: {str(e)}")
+            return {"success": False, "error": str(e), "message": "Failed to process document"}
+
    async def get_document_content_async(self, document_id: str) -> Optional[str]:
        """Get document content by ID"""
        try:

            if doc:
                self.document_cache[document_id] = doc
                return doc.content
            return None
        except Exception as e:
            logger.error(f"Error getting document content: {str(e)}")

        """MCP Tool: Perform semantic search"""
        try:
            results = await self.search_tool.search(query, top_k, filters)
+            return {"success": True, "query": query, "results": [result.to_dict() for result in results], "total_results": len(results)}
        except Exception as e:
            logger.error(f"Semantic search failed: {str(e)}")
+            return {"success": False, "error": str(e), "query": query, "results": []}

    async def summarize_content_async(self, content: str = None, document_id: str = None, style: str = "concise") -> Dict[str, Any]:
        try:
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
            if not content or not content.strip():
                return {"success": False, "error": "No content provided for summarization"}
            max_content_length = 4000
            if len(content) > max_content_length:
                content = content[:max_content_length] + "..."
            summary = await self.generative_tool.summarize(content, style)
+            return {"success": True, "summary": summary, "original_length": len(content), "summary_length": len(summary), "style": style, "document_id": document_id}
        except Exception as e:
            logger.error(f"Summarization failed: {str(e)}")
+            return {"success": False, "error": str(e)}
    async def generate_tags_async(self, content: str = None, document_id: str = None, max_tags: int = 5) -> Dict[str, Any]:
        """MCP Tool: Generate tags for content"""
        try:
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
            if not content or not content.strip():
                return {"success": False, "error": "No content provided for tag generation"}
            tags = await self.generative_tool.generate_tags(content, max_tags)
            if document_id and document_id != "none" and tags:
                await self.document_store.update_document_metadata(document_id, {"tags": tags})
+            return {"success": True, "tags": tags, "content_length": len(content), "document_id": document_id}
        except Exception as e:
            logger.error(f"Tag generation failed: {str(e)}")
+            return {"success": False, "error": str(e)}

    async def answer_question_async(self, question: str, context_filter: Optional[Dict] = None) -> Dict[str, Any]:
        try:
            search_results = await self.search_tool.search(question, top_k=5, filters=context_filter)
            if not search_results:
+                return {"success": False, "error": "No relevant context found in your documents. Please make sure you have uploaded relevant documents.", "question": question}
            answer = await self.generative_tool.answer_question(question, search_results)
+            return {"success": True, "question": question, "answer": answer, "sources": [result.to_dict() for result in search_results], "confidence": "high" if len(search_results) >= 3 else "medium"}
        except Exception as e:
            logger.error(f"Question answering failed: {str(e)}")
+            return {"success": False, "error": str(e), "question": question}

    def list_documents_sync(self, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
        try:
            documents = self.run_async(self.document_store.list_documents(limit, offset))
+            return {"success": True, "documents": [doc.to_dict() for doc in documents], "total": len(documents)}
        except Exception as e:
+            return {"success": False, "error": str(e)}

mcp_server = ContentOrganizerMCPServer()
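For readers unfamiliar with the RAG step inside answer_question_async: the retrieved chunks are handed to the generative tool as grounding context. A minimal, hypothetical sketch of that hand-off (the real prompt construction lives in mcp_tools/generative_tool.py and may differ):

```python
# Hypothetical illustration of the RAG hand-off; not the project's generative_tool code.
# Assumes each search result exposes a `content` attribute, as the tool reference suggests.
def build_rag_prompt(question: str, search_results: list) -> str:
    """Assemble a grounded prompt from retrieved chunks."""
    context_blocks = []
    for i, result in enumerate(search_results, 1):
        context_blocks.append(f"[{i}] {result.content}")
    context = "\n\n".join(context_blocks)
    return (
        "Answer the question using only the context below. "
        "Cite the bracketed source numbers you relied on.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
```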
def get_document_list():
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if result["success"]:
            if result["documents"]:
+                doc_list_str = "📚 Documents in Library:\n\n"
+                for i, doc_item in enumerate(result["documents"], 1):
+                    doc_list_str += f"{i}. {doc_item['filename']} (ID: {doc_item['id'][:8]}...)\n"
+                    doc_list_str += f"   Type: {doc_item['doc_type']}, Size: {doc_item['file_size']} bytes\n"
+                    if doc_item.get('tags'):
+                        doc_list_str += f"   Tags: {', '.join(doc_item['tags'])}\n"
+                    doc_list_str += f"   Created: {doc_item['created_at'][:10]}\n\n"
+                return doc_list_str
            else:
                return "No documents in library yet. Upload some documents to get started!"
        else:

        return f"Error: {str(e)}"

def get_document_choices():
    try:
        result = mcp_server.list_documents_sync(limit=100)
        if result["success"] and result["documents"]:
+            choices = [(f"{doc['filename']} ({doc['id'][:8]}...)", doc['id']) for doc in result["documents"]]
            logger.info(f"Generated {len(choices)} document choices")
            return choices
        return []

        logger.error(f"Error getting document choices: {str(e)}")
        return []

+def refresh_library():
+    doc_list_refreshed = get_document_list()
+    doc_choices_refreshed = get_document_choices()
+    logger.info(f"Refreshing library. Found {len(doc_choices_refreshed)} choices.")
+    return (
+        doc_list_refreshed,
+        gr.update(choices=doc_choices_refreshed),
+        gr.update(choices=doc_choices_refreshed),
+        gr.update(choices=doc_choices_refreshed)
+    )
+
|
226 |
def upload_and_process_file(file):
|
|
|
227 |
if file is None:
|
228 |
+
doc_list_initial = get_document_list()
|
229 |
+
doc_choices_initial = get_document_choices()
|
230 |
+
return (
|
231 |
+
"No file uploaded", "", doc_list_initial,
|
232 |
+
gr.update(choices=doc_choices_initial),
|
233 |
+
gr.update(choices=doc_choices_initial),
|
234 |
+
gr.update(choices=doc_choices_initial)
|
235 |
+
)
|
236 |
try:
|
|
|
237 |
file_path = file.name if hasattr(file, 'name') else str(file)
|
238 |
+
file_type = Path(file_path).suffix.lower().strip('.') # Ensure suffix is clean
|
239 |
+
logger.info(f"Processing file: {file_path}, type: {file_type}")
|
|
|
|
|
|
|
240 |
result = mcp_server.run_async(mcp_server.ingest_document_async(file_path, file_type))
|
241 |
|
242 |
+
doc_list_updated = get_document_list()
|
243 |
+
doc_choices_updated = get_document_choices()
|
244 |
+
|
245 |
if result["success"]:
|
|
|
|
|
|
|
|
|
246 |
return (
|
247 |
+
f"✅ Success: {result['message']}\nDocument ID: {result['document_id']}\nChunks created: {result['chunks_created']}",
|
248 |
result["document_id"],
|
249 |
+
doc_list_updated,
|
250 |
+
gr.update(choices=doc_choices_updated),
|
251 |
+
gr.update(choices=doc_choices_updated),
|
252 |
+
gr.update(choices=doc_choices_updated)
|
|
|
253 |
)
|
254 |
else:
|
255 |
return (
|
256 |
+
f"❌ Error: {result.get('error', 'Unknown error')}", "",
|
257 |
+
doc_list_updated,
|
258 |
+
gr.update(choices=doc_choices_updated),
|
259 |
+
gr.update(choices=doc_choices_updated),
|
260 |
+
gr.update(choices=doc_choices_updated)
|
|
|
|
|
261 |
)
|
262 |
except Exception as e:
|
263 |
logger.error(f"Error processing file: {str(e)}")
|
264 |
+
doc_list_error = get_document_list()
|
265 |
+
doc_choices_error = get_document_choices()
|
266 |
return (
|
267 |
+
f"❌ Error: {str(e)}", "",
|
268 |
+
doc_list_error,
|
269 |
+
gr.update(choices=doc_choices_error),
|
270 |
+
gr.update(choices=doc_choices_error),
|
271 |
+
gr.update(choices=doc_choices_error)
|
|
|
|
|
272 |
)
|
273 |
|
def perform_search(query, top_k):
    if not query.strip():
        return "Please enter a search query"
    try:
        result = mcp_server.run_async(mcp_server.semantic_search_async(query, int(top_k)))
        if result["success"]:
            if result["results"]:
+                output_str = f"🔍 Found {result['total_results']} results for: '{query}'\n\n"
+                for i, res_item in enumerate(result["results"], 1):
+                    output_str += f"Result {i}:\n"
+                    output_str += f"📊 Relevance Score: {res_item['score']:.3f}\n"
+                    output_str += f"📄 Content: {res_item['content'][:300]}...\n"
+                    if 'document_filename' in res_item.get('metadata', {}):
+                        output_str += f"📁 Source: {res_item['metadata']['document_filename']}\n"
+                    output_str += f"🔗 Document ID: {res_item.get('document_id', 'Unknown')}\n"
+                    output_str += "-" * 80 + "\n\n"
+                return output_str
            else:
                return f"No results found for: '{query}'\n\nMake sure you have uploaded relevant documents first."
        else:

        return f"❌ Error: {str(e)}"

def summarize_document(doc_choice, custom_text, style):
    try:
        logger.info(f"Summarize called with doc_choice: {doc_choice}, type: {type(doc_choice)}")
+        document_id = doc_choice if doc_choice and doc_choice != "none" and doc_choice != "" else None
        if custom_text and custom_text.strip():
            logger.info("Using custom text for summarization")
            result = mcp_server.run_async(mcp_server.summarize_content_async(content=custom_text, style=style))

            return "Please select a document from the dropdown or enter text to summarize"

        if result["success"]:
+            output_str = f"📝 Summary ({style} style):\n\n{result['summary']}\n\n"
+            output_str += f"📊 Statistics:\n"
+            output_str += f"- Original length: {result['original_length']} characters\n"
+            output_str += f"- Summary length: {result['summary_length']} characters\n"
+            output_str += f"- Compression ratio: {(1 - result['summary_length']/max(1, result['original_length']))*100:.1f}%\n"  # Avoid division by zero
            if result.get('document_id'):
+                output_str += f"- Document ID: {result['document_id']}\n"
+            return output_str
        else:
            return f"❌ Summarization failed: {result['error']}"
    except Exception as e:

        return f"❌ Error: {str(e)}"
def generate_tags_for_document(doc_choice, custom_text, max_tags):
    try:
        logger.info(f"Generate tags called with doc_choice: {doc_choice}, type: {type(doc_choice)}")
+        document_id = doc_choice if doc_choice and doc_choice != "none" and doc_choice != "" else None
        if custom_text and custom_text.strip():
            logger.info("Using custom text for tag generation")
            result = mcp_server.run_async(mcp_server.generate_tags_async(content=custom_text, max_tags=int(max_tags)))

        if result["success"]:
            tags_str = ", ".join(result["tags"])
+            output_str = f"🏷️ Generated Tags:\n\n{tags_str}\n\n"
+            output_str += f"📊 Statistics:\n"
+            output_str += f"- Content length: {result['content_length']} characters\n"
+            output_str += f"- Number of tags: {len(result['tags'])}\n"
            if result.get('document_id'):
+                output_str += f"- Document ID: {result['document_id']}\n"
+                output_str += f"\n✅ Tags have been saved to the document."
+            return output_str
        else:
            return f"❌ Tag generation failed: {result['error']}"
    except Exception as e:

        return f"❌ Error: {str(e)}"

def ask_question(question):
    if not question.strip():
        return "Please enter a question"
    try:
        result = mcp_server.run_async(mcp_server.answer_question_async(question))
        if result["success"]:
+            output_str = f"❓ Question: {result['question']}\n\n"
+            output_str += f"💡 Answer:\n{result['answer']}\n\n"
+            output_str += f"🎯 Confidence: {result['confidence']}\n\n"
+            output_str += f"📚 Sources Used ({len(result['sources'])}):\n"
+            for i, source_item in enumerate(result['sources'], 1):
+                filename = source_item.get('metadata', {}).get('document_filename', 'Unknown')
+                output_str += f"\n{i}. 📄 {filename}\n"
+                output_str += f"   📝 Excerpt: {source_item['content'][:150]}...\n"
+                output_str += f"   📊 Relevance: {source_item['score']:.3f}\n"
+            return output_str
        else:
            return f"❌ {result.get('error', 'Failed to answer question')}"
    except Exception as e:
        return f"❌ Error: {str(e)}"
def delete_document_from_library(document_id):
+    if not document_id:
+        doc_list_current = get_document_list()
+        doc_choices_current = get_document_choices()
+        return (
+            "No document selected to delete.",
+            doc_list_current,
+            gr.update(choices=doc_choices_current),
+            gr.update(choices=doc_choices_current),
+            gr.update(choices=doc_choices_current)
+        )
    try:
+        delete_doc_store_result = mcp_server.run_async(mcp_server.document_store.delete_document(document_id))
+        delete_vec_store_result = mcp_server.run_async(mcp_server.vector_store.delete_document(document_id))
+
+        msg = ""
+        if delete_doc_store_result:
+            msg += f"🗑️ Document {document_id[:8]}... deleted from document store. "
        else:
+            msg += f"❌ Failed to delete document {document_id[:8]}... from document store. "
+
+        if delete_vec_store_result:
+            msg += "Embeddings deleted from vector store."
+        else:
+            msg += "Failed to delete embeddings from vector store (or no embeddings existed)."

+        doc_list_updated = get_document_list()
+        doc_choices_updated = get_document_choices()
+        return (
+            msg,
+            doc_list_updated,
+            gr.update(choices=doc_choices_updated),
+            gr.update(choices=doc_choices_updated),
+            gr.update(choices=doc_choices_updated)
+        )
+    except Exception as e:
+        logger.error(f"Error deleting document: {str(e)}")
+        doc_list_error = get_document_list()
+        doc_choices_error = get_document_choices()
+        return (
+            f"❌ Error deleting document: {str(e)}",
+            doc_list_error,
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error)
+        )
+
def create_gradio_interface():
    with gr.Blocks(title="🧠 Intelligent Content Organizer MCP Agent", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🧠 Intelligent Content Organizer MCP Agent
        A powerful MCP (Model Context Protocol) server for intelligent content management with semantic search,
+        summarization, and Q&A capabilities.
        ## 🚀 Quick Start:
+        1. **Documents in Library** → View your uploaded documents in the "📚 Document Library" tab
+        2. **Upload Documents** → Go to "📄 Upload Documents" tab
+        3. **Search Your Content** → Use "🔍 Search Documents" to find information
+        4. **Get Summaries** → Select any document in "📝 Summarize" tab
+        5. **Generate Tags** → Auto-generate tags for your documents in "🏷️ Generate Tags" tab
+        6. **Ask Questions** → Get answers from your documents in "❓ Ask Questions" tab
+        7. **Delete Documents** → Remove documents from your library in "📚 Document Library" tab
+        8. **Refresh Library** → Click the 🔄 button to refresh the document list
        """)

        with gr.Tabs():
            with gr.Tab("📚 Document Library"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Your Document Collection")
+                        document_list_display = gr.Textbox(label="Documents in Library", value=get_document_list(), lines=20, interactive=False)
+                        refresh_btn_library = gr.Button("🔄 Refresh Library", variant="secondary")
+                        delete_doc_dropdown_visible = gr.Dropdown(label="Select Document to Delete", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
                        delete_btn = gr.Button("🗑️ Delete Selected Document", variant="stop")
+                        delete_output_display = gr.Textbox(label="Delete Status", visible=True)
+
            with gr.Tab("📄 Upload Documents"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Add Documents to Your Library")
+                        file_input_upload = gr.File(label="Select Document to Upload", file_types=[".pdf", ".txt", ".docx", ".png", ".jpg", ".jpeg"], type="filepath")
+                        upload_btn_process = gr.Button("🚀 Process & Add to Library", variant="primary", size="lg")
                    with gr.Column():
+                        upload_output_display = gr.Textbox(label="Processing Result", lines=6, placeholder="Upload a document to see processing results...")
+                        doc_id_output_display = gr.Textbox(label="Document ID", placeholder="Document ID will appear here after processing...")
+
            with gr.Tab("🔍 Search Documents"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Search Your Document Library")
+                        search_query_input = gr.Textbox(label="What are you looking for?", placeholder="Enter your search query...", lines=2)
+                        search_top_k_slider = gr.Slider(label="Number of Results", minimum=1, maximum=20, value=5, step=1)
+                        search_btn_action = gr.Button("🔍 Search Library", variant="primary", size="lg")
                    with gr.Column(scale=2):
+                        search_output_display = gr.Textbox(label="Search Results", lines=20, placeholder="Search results will appear here...")
+
            with gr.Tab("📝 Summarize"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Generate Document Summaries")
+                        doc_dropdown_sum_visible = gr.Dropdown(label="Select Document to Summarize", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
+                        summary_text_input = gr.Textbox(label="Or Paste Text to Summarize", placeholder="Paste any text here to summarize...", lines=8)
+                        summary_style_dropdown = gr.Dropdown(label="Summary Style", choices=["concise", "detailed", "bullet_points", "executive"], value="concise", info="Choose how you want the summary formatted")
+                        summarize_btn_action = gr.Button("📝 Generate Summary", variant="primary", size="lg")
                    with gr.Column():
+                        summary_output_display = gr.Textbox(label="Generated Summary", lines=20, placeholder="Summary will appear here...")
+
            with gr.Tab("🏷️ Generate Tags"):
                with gr.Row():
                    with gr.Column():
+                        gr.Markdown("### Generate Document Tags")
+                        doc_dropdown_tag_visible = gr.Dropdown(label="Select Document to Tag", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
+                        tag_text_input = gr.Textbox(label="Or Paste Text to Generate Tags", placeholder="Paste any text here to generate tags...", lines=8)
+                        max_tags_slider = gr.Slider(label="Number of Tags", minimum=3, maximum=15, value=5, step=1)
+                        tag_btn_action = gr.Button("🏷️ Generate Tags", variant="primary", size="lg")
                    with gr.Column():
+                        tag_output_display = gr.Textbox(label="Generated Tags", lines=10, placeholder="Tags will appear here...")
+
497 |
with gr.Tab("❓ Ask Questions"):
|
498 |
with gr.Row():
|
499 |
with gr.Column():
|
500 |
+
gr.Markdown("""### Ask Questions About Your Documents
|
|
|
|
|
501 |
The AI will search through all your uploaded documents to find relevant information
|
502 |
+
and provide comprehensive answers with sources.""")
|
503 |
+
qa_question_input = gr.Textbox(label="Your Question", placeholder="Ask anything about your documents...", lines=3)
|
504 |
+
qa_btn_action = gr.Button("❓ Get Answer", variant="primary", size="lg")
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
with gr.Column():
|
506 |
+
qa_output_display = gr.Textbox(label="AI Answer", lines=20, placeholder="Answer will appear here with sources...")
|
507 |
+
|
508 |
+
all_dropdowns_to_update = [delete_doc_dropdown_visible, doc_dropdown_sum_visible, doc_dropdown_tag_visible]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
509 |
|
510 |
+
refresh_outputs = [document_list_display] + [dd for dd in all_dropdowns_to_update]
|
511 |
+
refresh_btn_library.click(fn=refresh_library, outputs=refresh_outputs)
|
|
|
|
|
|
|
512 |
|
513 |
+
upload_outputs = [upload_output_display, doc_id_output_display, document_list_display] + [dd for dd in all_dropdowns_to_update]
|
514 |
+
upload_btn_process.click(upload_and_process_file, inputs=[file_input_upload], outputs=upload_outputs)
|
|
|
|
|
|
|
515 |
|
516 |
+
delete_outputs = [delete_output_display, document_list_display] + [dd for dd in all_dropdowns_to_update]
|
517 |
+
delete_btn.click(delete_document_from_library, inputs=[delete_doc_dropdown_visible], outputs=delete_outputs)
|
518 |
+
|
519 |
+
search_btn_action.click(perform_search, inputs=[search_query_input, search_top_k_slider], outputs=[search_output_display])
|
520 |
+
summarize_btn_action.click(summarize_document, inputs=[doc_dropdown_sum_visible, summary_text_input, summary_style_dropdown], outputs=[summary_output_display])
|
521 |
+
tag_btn_action.click(generate_tags_for_document, inputs=[doc_dropdown_tag_visible, tag_text_input, max_tags_slider], outputs=[tag_output_display])
|
522 |
+
qa_btn_action.click(ask_question, inputs=[qa_question_input], outputs=[qa_output_display])
|
523 |
|
524 |
+
interface.load(fn=refresh_library, outputs=refresh_outputs)
|
525 |
return interface
|
526 |
|
|
|
527 |
if __name__ == "__main__":
|
528 |
+
gradio_interface = create_gradio_interface()
|
529 |
+
gradio_interface.launch(mcp_server=True)
|
|
|
|
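The wiring above is what resolves the dropdown issue: every event that changes the document library (upload, delete, refresh, initial load) targets the same `refresh_outputs` list, so each document dropdown gets fresh choices in one pass. A minimal, self-contained sketch of that pattern follows; the helper names mirror the ones used above, but the bodies are illustrative stand-ins rather than the app's real implementations.

```python
import gradio as gr

def get_document_choices():
    # Illustrative stand-in; the real app reads the ingested-document library.
    return ["doc-1: report.pdf", "doc-2: notes.txt"]

def refresh_library():
    choices = get_document_choices()
    # One return value per wired output: the library text plus a fresh choice
    # list for every dropdown that must stay in sync.
    return "\n".join(choices), gr.update(choices=choices), gr.update(choices=choices)

with gr.Blocks() as demo:
    library = gr.Textbox(label="Document Library")
    dd_summarize = gr.Dropdown(label="Select Document to Summarize", choices=get_document_choices())
    dd_tag = gr.Dropdown(label="Select Document to Tag", choices=get_document_choices())
    refresh_btn = gr.Button("Refresh")

    outputs = [library, dd_summarize, dd_tag]
    refresh_btn.click(fn=refresh_library, outputs=outputs)
    demo.load(fn=refresh_library, outputs=outputs)

# demo.launch(mcp_server=True)  # as in app.py; needs Gradio's MCP extra installed
```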
config.py
CHANGED
@@ -1,5 +1,8 @@
  import os
  from typing import Optional
+ from dotenv import load_dotenv
+
+ load_dotenv()


  class Config:
@@ -7,11 +10,13 @@ class Config:
      ANTHROPIC_API_KEY: Optional[str] = os.getenv("ANTHROPIC_API_KEY")
      MISTRAL_API_KEY: Optional[str] = os.getenv("MISTRAL_API_KEY")
      HUGGINGFACE_API_KEY: Optional[str] = os.getenv("HUGGINGFACE_API_KEY", os.getenv("HF_TOKEN"))
+     OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY")

      # Model Configuration
      EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
      ANTHROPIC_MODEL: str = os.getenv("ANTHROPIC_MODEL", "claude-3-haiku-20240307")  # Using faster model
      MISTRAL_MODEL: str = os.getenv("MISTRAL_MODEL", "mistral-small-latest")  # Using smaller model
+     OPENAI_MODEL: str = os.getenv("OPENAI_MODEL", "gpt-4o-mini")

      # Vector Store Configuration
      VECTOR_STORE_PATH: str = os.getenv("VECTOR_STORE_PATH", "./data/vector_store")
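For reference, a minimal sketch of how these dotenv-backed settings are consumed elsewhere in the repo (llm_service.py reads them via `config.config`); the `.env` values in the comments are placeholders, not real keys.

```python
# .env (placeholder values):
#   OPENAI_API_KEY=sk-...
#   OPENAI_MODEL=gpt-4o-mini
import config

settings = config.config  # module-level Config instance, as used by LLMService

if settings.OPENAI_API_KEY:
    print(f"OpenAI enabled (model: {settings.OPENAI_MODEL})")
else:
    print("OPENAI_API_KEY not set; 'auto' will fall back to Mistral or Anthropic")
```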
core/chunker.py
CHANGED
@@ -1,3 +1,4 @@
+ # chunker.py
  import logging
  from typing import List, Dict, Any, Optional
  import re
mcp_server.py
CHANGED
@@ -41,7 +41,7 @@ generative_tool_instance = GenerativeTool(
      search_tool=search_tool_instance
  )

- mcp = FastMCP("
+ mcp = FastMCP("content")
  logger.info("FastMCP server initialized.")

  @mcp.tool()
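For context, a minimal sketch of the FastMCP pattern this file follows: name the server, register tools with the `@mcp.tool()` decorator, then run it. The import path assumes the standalone `fastmcp` package listed in requirements.txt, and the tool body is a placeholder rather than the real implementation.

```python
from fastmcp import FastMCP

mcp = FastMCP("content")

@mcp.tool()
def summarize_document(doc_id: str, style: str = "concise") -> str:
    """Placeholder tool body; the real tools call into the core/services modules."""
    return f"Summary of {doc_id} in {style} style"

if __name__ == "__main__":
    mcp.run()  # serve the registered tools over MCP
```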
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
  gradio
  anthropic>=0.7.0
  mistralai
  sentence-transformers>=2.2.2
  transformers>=4.30.0
  torch>=2.0.0
@@ -20,4 +20,6 @@
  nest-asyncio>=1.5.6
  httpx
  fastmcp
  mcp
+ openai
+ python-dotenv
services/llm_service.py
CHANGED
@@ -1,8 +1,10 @@
+ from mistralai import Mistral
  import logging
  import asyncio
  from typing import List, Dict, Any, Optional
+
  import anthropic
+ import openai
  import config

  logger = logging.getLogger(__name__)
@@ -11,9 +13,9 @@ class LLMService:
      def __init__(self):
          self.config = config.config

-         # Initialize clients
          self.anthropic_client = None
-         self.mistral_client = None
+         self.mistral_client = None  # Synchronous Mistral client
+         self.openai_async_client = None  # Asynchronous OpenAI client

          self._initialize_clients()
@@ -27,51 +29,110 @@
              logger.info("Anthropic client initialized")

              if self.config.MISTRAL_API_KEY:
-                 self.mistral_client = Mistral(
+                 self.mistral_client = Mistral(  # Standard sync client
                      api_key=self.config.MISTRAL_API_KEY
                  )
                  logger.info("Mistral client initialized")
+
+             if self.config.OPENAI_API_KEY:
+                 self.openai_async_client = openai.AsyncOpenAI(
+                     api_key=self.config.OPENAI_API_KEY
+                 )
+                 logger.info("OpenAI client initialized")

+             # Check if at least one client is initialized
+             if not any([self.openai_async_client, self.mistral_client, self.anthropic_client]):
+                 logger.warning("No LLM clients could be initialized based on current config. Check API keys.")
+             else:
+                 logger.info("LLM clients initialized successfully (at least one).")

          except Exception as e:
              logger.error(f"Error initializing LLM clients: {str(e)}")
              raise

      async def generate_text(self, prompt: str, model: str = "auto", max_tokens: int = 1000, temperature: float = 0.7) -> str:
-         """Generate text using the specified model"""
+         """Generate text using the specified model, with new priority for 'auto'."""
          try:
+             selected_model_name_for_call: str = ""  # Actual model name passed to the specific generator
+
              if model == "auto":
+                 # New Priority: 1. OpenAI, 2. Mistral, 3. Anthropic
+                 if self.openai_async_client and self.config.OPENAI_MODEL:
+                     selected_model_name_for_call = self.config.OPENAI_MODEL
+                     logger.debug(f"Auto-selected OpenAI model: {selected_model_name_for_call}")
+                     return await self._generate_with_openai(prompt, selected_model_name_for_call, max_tokens, temperature)
+                 elif self.mistral_client and self.config.MISTRAL_MODEL:
+                     selected_model_name_for_call = self.config.MISTRAL_MODEL
+                     logger.debug(f"Auto-selected Mistral model: {selected_model_name_for_call}")
+                     return await self._generate_with_mistral(prompt, selected_model_name_for_call, max_tokens, temperature)
+                 elif self.anthropic_client and self.config.ANTHROPIC_MODEL:
+                     selected_model_name_for_call = self.config.ANTHROPIC_MODEL
+                     logger.debug(f"Auto-selected Anthropic model: {selected_model_name_for_call}")
+                     return await self._generate_with_claude(prompt, selected_model_name_for_call, max_tokens, temperature)
                  else:
+                     logger.error("No LLM clients available for 'auto' mode or default models not configured.")
+                     raise ValueError("No LLM clients available for 'auto' mode or default models not configured.")
+
+             elif model.startswith("gpt-") or model.lower().startswith("openai/"):
+                 if not self.openai_async_client:
+                     raise ValueError("OpenAI client not available. Check API key or model prefix.")
+                 actual_model = model.split('/')[-1] if '/' in model else model
+                 return await self._generate_with_openai(prompt, actual_model, max_tokens, temperature)
+
              elif model.startswith("mistral"):
                  if not self.mistral_client:
-                     raise ValueError("Mistral client not available")
-                 return await self._generate_with_mistral(prompt, max_tokens, temperature)
+                     raise ValueError("Mistral client not available. Check API key or model prefix.")
+                 return await self._generate_with_mistral(prompt, model, max_tokens, temperature)
+
+             elif model.startswith("claude"):
+                 if not self.anthropic_client:
+                     raise ValueError("Anthropic client not available. Check API key or model prefix.")
+                 return await self._generate_with_claude(prompt, model, max_tokens, temperature)
+
              else:
-                 raise ValueError(f"Unsupported model: {model}")
+                 raise ValueError(f"Unsupported model: {model}. Must start with 'gpt-', 'openai/', 'claude', 'mistral', or be 'auto'.")
+
          except Exception as e:
-             logger.error(f"Error generating text: {str(e)}")
+             logger.error(f"Error generating text with model '{model}': {str(e)}")
              raise
+
+     async def _generate_with_openai(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
+         """Generate text using OpenAI (Async)"""
+         if not self.openai_async_client:
+             raise RuntimeError("OpenAI async client not initialized.")
+         try:
+             logger.debug(f"Generating with OpenAI model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")
+             response = await self.openai_async_client.chat.completions.create(
+                 model=model_name,
+                 messages=[{"role": "user", "content": prompt}],
+                 max_tokens=max_tokens,
+                 temperature=temperature
+             )
+             if response.choices and response.choices[0].message:
+                 content = response.choices[0].message.content
+                 if content is not None:
+                     return content.strip()
+                 else:
+                     logger.warning(f"OpenAI response message content is None for model {model_name}.")
+                     return ""
+             else:
+                 logger.warning(f"OpenAI response did not contain expected choices or message for model {model_name}.")
+                 return ""
+         except Exception as e:
+             logger.error(f"Error with OpenAI generation (model: {model_name}): {str(e)}")
+             raise
+
+     async def _generate_with_claude(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
+         """Generate text using Anthropic/Claude (Sync via run_in_executor)"""
+         if not self.anthropic_client:
+             raise RuntimeError("Anthropic client not initialized.")
          try:
+             logger.debug(f"Generating with Anthropic model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")
              loop = asyncio.get_event_loop()
              response = await loop.run_in_executor(
                  None,
                  lambda: self.anthropic_client.messages.create(
+                     model=model_name,  # Use the passed model_name
                      max_tokens=max_tokens,
                      temperature=temperature,
                      messages=[
@@ -79,66 +140,78 @@
                      ]
                  )
              )
+             if response.content and response.content[0].text:
+                 return response.content[0].text.strip()
+             else:
+                 logger.warning(f"Anthropic response did not contain expected content for model {model_name}.")
+                 return ""
          except Exception as e:
-             logger.error(f"Error with Claude generation: {str(e)}")
+             logger.error(f"Error with Anthropic (Claude) generation (model: {model_name}): {str(e)}")
              raise

-     async def _generate_with_mistral(self, prompt: str, max_tokens: int, temperature: float) -> str:
-         """Generate text using Mistral"""
+     async def _generate_with_mistral(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
+         """Generate text using Mistral (Sync via run_in_executor)"""
+         if not self.mistral_client:
+             raise RuntimeError("Mistral client not initialized.")
          try:
+             logger.debug(f"Generating with Mistral model: {model_name}, temp: {temperature}, prompt: '{prompt[:50]}...' (max_tokens: {max_tokens} - note: not directly used by MistralClient.chat)")
              loop = asyncio.get_event_loop()
              response = await loop.run_in_executor(
                  None,
                  lambda: self.mistral_client.chat(
+                     model=model_name,  # Use the passed model_name
                      messages=[{"role": "user", "content": prompt}],
+                     max_tokens=max_tokens,
                      temperature=temperature
                  )
              )
+             if response.choices and response.choices[0].message:
+                 content = response.choices[0].message.content
+                 if content is not None:
+                     return content.strip()
+                 else:
+                     logger.warning(f"Mistral response message content is None for model {model_name}.")
+                     return ""
+             else:
+                 logger.warning(f"Mistral response did not contain expected choices or message for model {model_name}.")
+                 return ""
          except Exception as e:
-             logger.error(f"Error with Mistral generation: {str(e)}")
+             logger.error(f"Error with Mistral generation (model: {model_name}): {str(e)}")
              raise

      async def summarize(self, text: str, style: str = "concise", max_length: Optional[int] = None) -> str:
-         """Generate a summary of the given text"""
          if not text.strip():
              return ""

-         # Create style-specific prompts
          style_prompts = {
              "concise": "Provide a concise summary of the following text, focusing on the main points:",
              "detailed": "Provide a detailed summary of the following text, including key details and supporting information:",
              "bullet_points": "Summarize the following text as a list of bullet points highlighting the main ideas:",
              "executive": "Provide an executive summary of the following text, focusing on key findings and actionable insights:"
          }
          prompt_template = style_prompts.get(style, style_prompts["concise"])
          if max_length:
-             prompt_template += f" Keep the summary under {max_length} words."
+             prompt_template += f" Keep the summary under approximately {max_length} words."

          prompt = f"{prompt_template}\n\nText to summarize:\n{text}\n\nSummary:"

          try:
+             summary_max_tokens = (max_length * 2) if max_length else 500
+             summary = await self.generate_text(prompt, model="auto", max_tokens=summary_max_tokens, temperature=0.3)
              return summary.strip()
          except Exception as e:
              logger.error(f"Error generating summary: {str(e)}")
              return "Error generating summary"

      async def generate_tags(self, text: str, max_tags: int = 5) -> List[str]:
-         """Generate relevant tags for the given text"""
          if not text.strip():
              return []

-         prompt = f"""Generate {max_tags} relevant tags for the following text.
-         Tags should be concise, descriptive keywords or phrases that capture the main topics
-         Return only the tags, separated by commas.
+         prompt = f"""Generate up to {max_tags} relevant tags for the following text.
+         Tags should be concise, descriptive keywords or phrases (1-3 words typically) that capture the main topics or themes.
+         Return only the tags, separated by commas. Do not include any preamble or explanation.

          Text:
          {text}
@@ -146,28 +219,22 @@
          Tags:"""

          try:
-             response = await self.generate_text(prompt, max_tokens=100, temperature=0.5)
+             response = await self.generate_text(prompt, model="auto", max_tokens=100, temperature=0.5)
+             tags = [tag.strip().lower() for tag in response.split(',') if tag.strip()]
-             tags = [tag for tag in tags if tag and len(tag) > 1]
+             tags = [tag for tag in tags if tag and len(tag) > 1 and len(tag) < 50]
-             return tags[:max_tags]
+             return list(dict.fromkeys(tags))[:max_tags]
          except Exception as e:
              logger.error(f"Error generating tags: {str(e)}")
              return []

      async def categorize(self, text: str, categories: List[str]) -> str:
-         """Categorize text into one of the provided categories"""
          if not text.strip() or not categories:
              return "Uncategorized"

-         categories_str = ", ".join(categories)
+         categories_str = ", ".join([f"'{cat}'" for cat in categories])
+         prompt = f"""Classify the following text into ONE of these categories: {categories_str}.
-         Choose the most appropriate category based on the content and main theme of the text.
-         Return only the category name, nothing else.
+         Choose the single most appropriate category based on the content and main theme of the text.
+         Return only the category name as a string, exactly as it appears in the list provided. Do not add any other text or explanation.

          Text to classify:
          {text}
@@ -175,111 +242,146 @@
          Category:"""

          try:
-             response = await self.generate_text(prompt, max_tokens=50, temperature=0.1)
+             response = await self.generate_text(prompt, model="auto", max_tokens=50, temperature=0.1)
+             category_candidate = response.strip().strip("'\"")

              for cat in categories:
-                 if cat.lower() in category_lower or category_lower in cat.lower():
-                     return cat
+                 if cat.lower() == category_candidate.lower():
+                     return cat
+
+             logger.warning(f"LLM returned category '{category_candidate}' which is not in the provided list: {categories}. Falling back.")
              return categories[0] if categories else "Uncategorized"
          except Exception as e:
              logger.error(f"Error categorizing text: {str(e)}")
              return "Uncategorized"

-         """Answer a question based on the provided context"""
+     async def answer_question(self, question: str, context: str, max_context_length: int = 3000) -> str:
          if not question.strip():
-             return "No question provided"
+             return "No question provided."
          if not context.strip():
+             return "I don't have enough context to answer this question. Please provide relevant information."

-         # Truncate context if too long
          if len(context) > max_context_length:
              context = context[:max_context_length] + "..."
+             logger.warning(f"Context truncated to {max_context_length} characters for question answering.")

+         prompt = f"""You are a helpful assistant. Answer the following question based ONLY on the provided context.
+         If the context does not contain the information to answer the question, state that the context does not provide the answer.
+         Do not make up information or use external knowledge.

+         Context:
+         ---
+         {context}
+         ---

+         Question: {question}

+         Answer:"""

          try:
+             answer = await self.generate_text(prompt, model="auto", max_tokens=300, temperature=0.2)
              return answer.strip()
          except Exception as e:
              logger.error(f"Error answering question: {str(e)}")
              return "I encountered an error while trying to answer your question."

      async def extract_key_information(self, text: str) -> Dict[str, Any]:
-         """Extract key information from text"""
          if not text.strip():
              return {}

          prompt = f"""Analyze the following text and extract key information.
+         Provide the response as a JSON object with the following keys:
+         - "main_topic": (string) The main topic or subject of the text.
+         - "key_points": (array of strings) A list of 3-5 key points or takeaways.
+         - "entities": (array of strings) Important people, places, organizations, or products mentioned.
+         - "sentiment": (string) Overall sentiment of the text (e.g., "positive", "neutral", "negative", "mixed").
+         - "content_type": (string) The perceived type of content (e.g., "article", "email", "report", "conversation", "advertisement", "other").
+
+         If a piece of information is not found or not applicable, use null or an empty array/string as appropriate for the JSON structure.

          Text to analyze:
+         ---
          {text}
+         ---

-         Analysis:"""
+         JSON Analysis:"""

          try:
+             response_str = await self.generate_text(prompt, model="auto", max_tokens=500, temperature=0.4)

+             import json
+             try:
+                 if response_str.startswith("```json"):
+                     response_str = response_str.lstrip("```json").rstrip("```").strip()
+
+                 info = json.loads(response_str)
+                 expected_keys = {"main_topic", "key_points", "entities", "sentiment", "content_type"}
+                 if not expected_keys.issubset(info.keys()):
+                     logger.warning(f"Extracted information missing some expected keys. Got: {info.keys()}")
+                 return info
+             except json.JSONDecodeError as je:
+                 logger.error(f"Failed to parse JSON from LLM response for key_information: {je}")
+                 logger.debug(f"LLM Response string was: {response_str}")
+                 info_fallback = {}
+                 lines = response_str.split('\n')
+                 for line in lines:
+                     if ':' in line:
+                         key, value = line.split(':', 1)
+                         key_clean = key.strip().lower().replace(' ', '_')
+                         value_clean = value.strip()
+                         if value_clean:
+                             if key_clean in ["key_points", "entities"] and '[' in value_clean and ']' in value_clean:
+                                 try:
+                                     info_fallback[key_clean] = [item.strip().strip("'\"") for item in value_clean.strip('[]').split(',') if item.strip()]
+                                 except: info_fallback[key_clean] = value_clean
+                             else: info_fallback[key_clean] = value_clean
+                 if info_fallback:
+                     logger.info("Successfully parsed key information using fallback line-based method.")
+                     return info_fallback
+                 return {"error": "Failed to parse LLM output", "raw_response": response_str}
          except Exception as e:
              logger.error(f"Error extracting key information: {str(e)}")
-             return {}
+             return {"error": f"General error extracting key information: {str(e)}"}

      async def check_availability(self) -> Dict[str, bool]:
-         """Check which LLM services are available"""
+         """Check which LLM services are available by making a tiny test call."""
          availability = {
-             "mistral": False
+             "openai": False,
+             "mistral": False,
+             "anthropic": False
          }
+         test_prompt = "Hello"
+         test_max_tokens = 5
+         test_temp = 0.1
+
+         logger.info("Checking LLM availability...")
+
+         if self.openai_async_client and self.config.OPENAI_MODEL:
+             try:
+                 logger.debug(f"Testing OpenAI availability with model {self.config.OPENAI_MODEL}...")
+                 test_response = await self._generate_with_openai(test_prompt, self.config.OPENAI_MODEL, test_max_tokens, test_temp)
+                 availability["openai"] = bool(test_response.strip())
+             except Exception as e:
+                 logger.warning(f"OpenAI availability check failed for model {self.config.OPENAI_MODEL}: {e}")
+             logger.info(f"OpenAI available: {availability['openai']}")

+         if self.mistral_client and self.config.MISTRAL_MODEL:
+             try:
+                 logger.debug(f"Testing Mistral availability with model {self.config.MISTRAL_MODEL}...")
+                 test_response = await self._generate_with_mistral(test_prompt, self.config.MISTRAL_MODEL, test_max_tokens, test_temp)
+                 availability["mistral"] = bool(test_response.strip())
+             except Exception as e:
+                 logger.warning(f"Mistral availability check failed for model {self.config.MISTRAL_MODEL}: {e}")
+             logger.info(f"Mistral available: {availability['mistral']}")
+
+         if self.anthropic_client and self.config.ANTHROPIC_MODEL:
+             try:
+                 logger.debug(f"Testing Anthropic availability with model {self.config.ANTHROPIC_MODEL}...")
+                 test_response = await self._generate_with_claude(test_prompt, self.config.ANTHROPIC_MODEL, test_max_tokens, test_temp)
+                 availability["anthropic"] = bool(test_response.strip())
+             except Exception as e:
+                 logger.warning(f"Anthropic availability check failed for model {self.config.ANTHROPIC_MODEL}: {e}")
+             logger.info(f"Anthropic available: {availability['anthropic']}")

+         logger.info(f"Final LLM Availability: {availability}")
          return availability