QA version personality

- .gitignore +3 -1
- README.md +142 -1
- app.py +47 -2
- app/__init__.py +8 -13
- app/api/mongodb_routes.py +3 -3
- app/api/pdf_routes.py +690 -92
- app/api/pdf_websocket.py +182 -4
- app/api/postgresql_routes.py +492 -231
- app/api/rag_routes.py +19 -10
- app/api/websocket_routes.py +61 -22
- app/database/models.py +21 -9
- app/database/mongodb.py +41 -9
- app/database/postgresql.py +10 -11
- app/models/pdf_models.py +23 -12
- app/utils/cache.py +3 -81
- app/utils/pdf_processor.py +375 -137
- app/utils/pinecone_fix.py +194 -0
.gitignore
CHANGED
@@ -59,7 +59,6 @@ out/
 tests/
 
 Admin_bot/
-
 Pix-Agent/
 
 # Hugging Face Spaces
@@ -81,3 +80,6 @@ Thumbs.db
 main.py
 
 test/
+
+/tmp
+/docs/
README.md
CHANGED
@@ -416,4 +416,145 @@ User conversation history is stored in a separate queue with
 
 ## Authors
 
 - **PIX Project Team**
+
+# PixAgent PDF Processing
+
+This README provides instructions for the PDF processing functionality in PixAgent, including uploading PDF documents, managing vector embeddings, and deleting documents.
+
+## API Endpoints
+
+### Health Check
+
+```
+GET /health
+GET /pdf/health
+```
+
+Verify that the API is running and that the connections to the databases (MongoDB, PostgreSQL, Pinecone) are established.
+
+### Upload PDF
+
+```
+POST /pdf/upload
+```
+
+**Parameters:**
+- `file`: The PDF file to upload (multipart/form-data)
+- `namespace`: The namespace to store vectors in (default: "Default")
+- `mock_mode`: Set to "true" or "false" (default: "false")
+- `vector_database_id`: The ID of the vector database to use (required for real mode)
+- `document_id`: Optional custom document ID (if not provided, a UUID will be generated)
+
+**Example Python Request:**
+```python
+import requests
+import uuid
+
+document_id = str(uuid.uuid4())
+files = {'file': open('your_document.pdf', 'rb')}
+response = requests.post(
+    'http://localhost:8000/pdf/upload',
+    files=files,
+    data={
+        'namespace': 'my-namespace',
+        'mock_mode': 'false',
+        'vector_database_id': '9',
+        'document_id': document_id
+    }
+)
+print(f'Status: {response.status_code}')
+print(f'Response: {response.json()}')
+```
+
+### List Documents
+
+```
+GET /pdf/documents
+```
+
+**Parameters:**
+- `namespace`: The namespace to retrieve documents from
+- `vector_database_id`: The ID of the vector database to use
+
+**Example Python Request:**
+```python
+import requests
+
+response = requests.get(
+    'http://localhost:8000/pdf/documents',
+    params={
+        'namespace': 'my-namespace',
+        'vector_database_id': '9'
+    }
+)
+print(f'Status: {response.status_code}')
+print(f'Documents: {response.json()}')
+```
+
+### Delete Document
+
+```
+DELETE /pdf/document
+```
+
+**Parameters:**
+- `document_id`: The ID of the document to delete
+- `namespace`: The namespace containing the document
+- `vector_database_id`: The ID of the vector database
+
+**Example Python Request:**
+```python
+import requests
+
+response = requests.delete(
+    'http://localhost:8000/pdf/document',
+    params={
+        'document_id': 'your-document-id',
+        'namespace': 'my-namespace',
+        'vector_database_id': '9'
+    }
+)
+print(f'Status: {response.status_code}')
+print(f'Result: {response.json()}')
+```
+
+### List Available Vector Databases
+
+```
+GET /postgres/vector-databases
+```
+
+**Example Python Request:**
+```python
+import requests
+
+response = requests.get('http://localhost:8000/postgres/vector-databases')
+vector_dbs = response.json()
+print(f'Available vector databases: {vector_dbs}')
+```
+
+## PDF Processing and Vector Embedding
+
+The system processes PDFs in the following steps:
+
+1. **Text Extraction**: Uses `PyPDFLoader` from LangChain to extract text from the PDF.
+2. **Text Chunking**: Splits the text into manageable chunks using `RecursiveCharacterTextSplitter` with a chunk size of 1000 characters and a 100-character overlap.
+3. **Embedding Creation**: Uses Google's Gemini embedding model (`models/embedding-001`) to create embeddings for each text chunk.
+4. **Dimension Adjustment**: Ensures the embedding dimensions match the Pinecone index requirements:
+   - If Gemini produces 768-dim embeddings and Pinecone expects 1536-dim, each value is duplicated.
+   - For other mismatches, appropriate padding or truncation is applied.
+5. **Vector Storage**: Uploads the embeddings to Pinecone in the specified namespace.
+
+## Notes
+
+- **Mock Mode**: When `mock_mode` is set to "true", the system simulates the PDF processing without actually creating or storing embeddings.
+- **Namespace Handling**: When using a vector database ID, the namespace is automatically formatted as `vdb-{vector_database_id}`.
+- **Error Handling**: The system validates vector dimensions and handles errors appropriately, with detailed logging.
+- **PDF Storage**: Processed PDFs are stored in the `pdf_storage` directory with the document ID as the filename.
+
+## Troubleshooting
+
+- **Dimension Mismatch Error**: If you receive an error about vector dimensions not matching the Pinecone index configuration, check that the embedding model and Pinecone index dimensions are compatible. The system will attempt to adjust dimensions but may encounter limits.
+- **Connection Issues**: Verify that the MongoDB, PostgreSQL, and Pinecone credentials are correctly configured in the environment variables.
+- **Processing Failures**: Check the `pdf_api_debug.log` file for detailed error messages and processing information.
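The dimension-adjustment rule in step 4 of the README above is the least obvious part of the pipeline, so here is a minimal sketch of what it describes. This is an illustrative reconstruction, not the actual code from `app/utils/pdf_processor.py`; the function name and the zero-padding strategy for "other mismatches" are assumptions.

```python
# Illustrative sketch of the dimension adjustment described in step 4.
# The real implementation lives in app/utils/pdf_processor.py and may differ.
from typing import List

def adjust_dimension(embedding: List[float], target_dim: int) -> List[float]:
    source_dim = len(embedding)
    if source_dim == target_dim:
        return embedding
    if target_dim == 2 * source_dim:
        # 768-dim -> 1536-dim: duplicate each value in place
        return [v for v in embedding for _ in range(2)]
    if source_dim < target_dim:
        # Other upward mismatches: pad with zeros (assumed strategy)
        return embedding + [0.0] * (target_dim - source_dim)
    # Downward mismatches: truncate
    return embedding[:target_dim]

assert len(adjust_dimension([0.1] * 768, 1536)) == 1536
```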
app.py
CHANGED
@@ -6,6 +6,11 @@ import os
 import sys
 import logging
 from dotenv import load_dotenv
+from fastapi.responses import JSONResponse, PlainTextResponse
+from fastapi.staticfiles import StaticFiles
+import time
+import uuid
+import traceback
 
 # Configure logging
 logging.basicConfig(
@@ -83,6 +88,7 @@ try:
     from app.api.rag_routes import router as rag_router
     from app.api.websocket_routes import router as websocket_router
    from app.api.pdf_routes import router as pdf_router
+    from app.api.pdf_websocket import router as pdf_websocket_router
 
     # Import middlewares
     from app.utils.middleware import RequestLoggingMiddleware, ErrorHandlingMiddleware, DatabaseCheckMiddleware
@@ -93,6 +99,8 @@ try:
     # Import cache
     from app.utils.cache import get_cache
 
+    logger.info("Successfully imported all routers and modules")
+
 except ImportError as e:
     logger.error(f"Error importing routes or middlewares: {e}")
     raise
@@ -129,6 +137,14 @@ app.include_router(postgresql_router)
 app.include_router(rag_router)
 app.include_router(websocket_router)
 app.include_router(pdf_router)
+app.include_router(pdf_websocket_router)
+
+# Log all registered routes
+logger.info("Registered API routes:")
+for route in app.routes:
+    if hasattr(route, "path") and hasattr(route, "methods"):
+        methods = ",".join(route.methods)
+        logger.info(f"  {methods:<10} {route.path}")
 
 # Root endpoint
 @app.get("/")
@@ -235,8 +251,37 @@ if DEBUG:
             "history_cache_ttl": os.getenv("HISTORY_CACHE_TTL", "3600"),
         }
     }
+
+    @app.get("/debug/websocket-routes")
+    def debug_websocket_routes():
+        """Display information about the WebSocket routes (debug mode only)"""
+        ws_routes = []
+        for route in app.routes:
+            if "websocket" in str(route.__class__).lower():
+                ws_routes.append({
+                    "path": route.path,
+                    "name": route.name,
+                    "endpoint": str(route.endpoint)
+                })
+        return {
+            "websocket_routes": ws_routes,
+            "total_count": len(ws_routes)
+        }
+
+    @app.get("/debug/mock-status")
+    def debug_mock_status():
+        """Display current mock mode settings"""
+        # Import was: from app.api.pdf_routes import USE_MOCK_MODE
+        # We've disabled mock mode
+
+        return {
+            "mock_mode": False,  # Disabled - using real database
+            "mock_env_variable": os.getenv("USE_MOCK_MODE", "false"),
+            "debug_mode": DEBUG
+        }
+
 
 # Run the app with uvicorn when executed directly
 if __name__ == "__main__":
-    port = int(os.environ.get("PORT",
-    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=DEBUG)
+    port = int(os.environ.get("PORT", 7860))
+    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=DEBUG)
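With the debug endpoints registered above, a quick smoke test from a client is straightforward. This assumes the app is running locally with DEBUG enabled, on the default port 7860 set in `app.py`:

```python
import requests

# Query the debug endpoints defined in app.py (server assumed at localhost:7860)
base = "http://localhost:7860"
print(requests.get(f"{base}/debug/websocket-routes").json())
print(requests.get(f"{base}/debug/mock-status").json())
```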
app/__init__.py
CHANGED
@@ -10,16 +10,11 @@ import os
 # Add the root directory to sys.path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-
-
-
-
-
-
-
-
-    os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
-    "app.py"))
-app_module = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(app_module)
-app = app_module.app
+# Use importlib to avoid a circular import
+import importlib.util
+spec = importlib.util.spec_from_file_location("app_module",
+    os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+    "app.py"))
+app_module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(app_module)
+app = app_module.app
app/api/mongodb_routes.py
CHANGED
@@ -74,8 +74,8 @@ async def create_session(session: SessionCreate, response: Response):
         created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     )
 
-    # Check whether the session needs a notification (response starts with "I
-    if session.response and session.response.strip().lower().startswith("i
+    # Check whether the session needs a notification (response starts with "I'm sorry")
+    if session.response and session.response.strip().lower().startswith("i'm sorry"):
         # Send the notification via WebSocket
         try:
             notification_data = {
@@ -93,7 +93,7 @@ async def create_session(session: SessionCreate, response: Response):
 
             # Create a task to send the notification - use asyncio.create_task so the main flow is not blocked
             asyncio.create_task(send_notification(notification_data))
-            logger.info(f"Notification queued for session {session.session_id} - response starts with 'I
+            logger.info(f"Notification queued for session {session.session_id} - response starts with 'I'm sorry'")
         except Exception as e:
             logger.error(f"Error queueing notification: {e}")
             # Do not stop the main flow if sending the notification fails
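The `asyncio.create_task` call above is what keeps session creation from waiting on the notification: the coroutine is scheduled on the running event loop and the handler returns immediately. A minimal, self-contained sketch of this fire-and-forget pattern (the names here are illustrative, not taken from the PixAgent codebase):

```python
import asyncio

async def send_notification(data: dict) -> None:
    # Stand-in for the real WebSocket broadcast; simulate slow I/O
    await asyncio.sleep(1.0)
    print(f"notified: {data}")

async def create_session() -> str:
    # Schedule the notification without awaiting it; a failure inside the
    # task is reported by asyncio rather than propagating into this handler.
    asyncio.create_task(send_notification({"session_id": "abc"}))
    return "session created"  # returns immediately

async def main() -> None:
    print(await create_session())
    await asyncio.sleep(1.5)  # keep the loop alive long enough for the task

asyncio.run(main())
```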
app/api/pdf_routes.py
CHANGED
@@ -1,12 +1,23 @@
|
|
1 |
import os
|
2 |
import shutil
|
3 |
import uuid
|
4 |
-
|
|
|
|
|
5 |
from fastapi.responses import JSONResponse
|
6 |
from typing import Optional, List, Dict, Any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
from app.utils.pdf_processor import PDFProcessor
|
9 |
from app.models.pdf_models import PDFResponse, DeleteDocumentRequest, DocumentsListResponse
|
|
|
|
|
10 |
from app.api.pdf_websocket import (
|
11 |
send_pdf_upload_started,
|
12 |
send_pdf_upload_progress,
|
@@ -17,21 +28,156 @@ from app.api.pdf_websocket import (
|
|
17 |
send_pdf_delete_failed
|
18 |
)
|
19 |
|
20 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
router = APIRouter(
|
22 |
prefix="/pdf",
|
23 |
tags=["PDF Processing"],
|
24 |
)
|
25 |
|
26 |
-
#
|
27 |
-
TEMP_UPLOAD_DIR =
|
28 |
-
STORAGE_DIR =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
#
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
33 |
|
34 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
@router.post("/upload", response_model=PDFResponse)
|
36 |
async def upload_pdf(
|
37 |
file: UploadFile = File(...),
|
@@ -40,157 +186,395 @@ async def upload_pdf(
|
|
40 |
title: Optional[str] = Form(None),
|
41 |
description: Optional[str] = Form(None),
|
42 |
user_id: Optional[str] = Form(None),
|
43 |
-
|
|
|
|
|
|
|
|
|
44 |
):
|
45 |
"""
|
46 |
-
Upload
|
47 |
|
48 |
-
- **file**:
|
49 |
-
- **namespace**: Namespace
|
50 |
-
- **index_name**:
|
51 |
-
- **title**:
|
52 |
-
- **description**:
|
53 |
-
- **user_id**: ID
|
|
|
|
|
|
|
54 |
"""
|
|
|
|
|
|
|
|
|
|
|
55 |
try:
|
56 |
-
#
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
file_id = str(uuid.uuid4())
|
62 |
-
temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{file_id}.pdf")
|
|
|
63 |
|
64 |
-
#
|
65 |
if user_id:
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
-
# Lưu file
|
69 |
with open(temp_file_path, "wb") as buffer:
|
70 |
-
|
|
|
71 |
|
72 |
-
#
|
73 |
metadata = {
|
74 |
"filename": file.filename,
|
75 |
"content_type": file.content_type
|
76 |
}
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
if title:
|
79 |
metadata["title"] = title
|
|
|
|
|
|
|
|
|
|
|
80 |
if description:
|
81 |
metadata["description"] = description
|
82 |
|
83 |
-
#
|
84 |
if user_id:
|
85 |
-
|
|
|
86 |
user_id,
|
87 |
file_id,
|
88 |
"file_preparation",
|
89 |
0.2,
|
90 |
"File saved, preparing for processing"
|
91 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
-
#
|
97 |
if user_id:
|
98 |
-
|
|
|
99 |
user_id,
|
100 |
file_id,
|
101 |
"embedding_start",
|
102 |
0.4,
|
103 |
"Starting to process PDF and create embeddings"
|
104 |
)
|
|
|
|
|
|
|
105 |
|
106 |
-
#
|
107 |
-
|
108 |
-
async def progress_callback_wrapper(step, progress, message):
|
109 |
-
if user_id:
|
110 |
-
await send_progress_update(user_id, file_id, step, progress, message)
|
111 |
-
|
112 |
-
# Xử lý PDF và tạo embeddings với callback đã được xử lý đúng cách
|
113 |
result = await processor.process_pdf(
|
114 |
file_path=temp_file_path,
|
115 |
document_id=file_id,
|
116 |
metadata=metadata,
|
117 |
-
progress_callback=
|
118 |
)
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
# Gửi thông báo hoàn thành qua WebSocket
|
126 |
-
if user_id:
|
127 |
-
await send_pdf_upload_completed(
|
128 |
-
user_id,
|
129 |
-
file_id,
|
130 |
-
file.filename,
|
131 |
-
result.get('chunks_processed', 0)
|
132 |
-
)
|
133 |
-
else:
|
134 |
-
# Gửi thông báo lỗi qua WebSocket
|
135 |
-
if user_id:
|
136 |
-
await send_pdf_upload_failed(
|
137 |
-
user_id,
|
138 |
-
file_id,
|
139 |
-
file.filename,
|
140 |
-
result.get('error', 'Unknown error')
|
141 |
-
)
|
142 |
-
|
143 |
-
# Dọn dẹp: xóa file tạm nếu vẫn còn
|
144 |
-
if os.path.exists(temp_file_path):
|
145 |
-
os.remove(temp_file_path)
|
146 |
-
|
147 |
-
return result
|
148 |
except Exception as e:
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
os.remove(temp_file_path)
|
152 |
-
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
await send_pdf_upload_failed(
|
156 |
user_id,
|
157 |
file_id,
|
158 |
-
|
159 |
str(e)
|
160 |
)
|
|
|
|
|
|
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
)
|
166 |
-
|
167 |
-
|
168 |
-
async def send_progress_update(user_id, document_id, step, progress, message):
|
169 |
-
if user_id:
|
170 |
-
await send_pdf_upload_progress(user_id, document_id, step, progress, message)
|
171 |
|
172 |
# Endpoint xóa tài liệu
|
173 |
@router.delete("/namespace", response_model=PDFResponse)
|
174 |
async def delete_namespace(
|
175 |
namespace: str = "Default",
|
176 |
index_name: str = "testbot768",
|
177 |
-
|
|
|
|
|
178 |
):
|
179 |
"""
|
180 |
Xóa toàn bộ embeddings trong một namespace từ Pinecone (tương ứng xoá namespace)
|
181 |
|
182 |
- **namespace**: Namespace trong Pinecone (mặc định: "Default")
|
183 |
- **index_name**: Tên index Pinecone (mặc định: "testbot768")
|
|
|
184 |
- **user_id**: ID của người dùng để cập nhật trạng thái qua WebSocket
|
185 |
"""
|
|
|
|
|
186 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
# Gửi thông báo bắt đầu xóa qua WebSocket
|
188 |
if user_id:
|
189 |
await send_pdf_delete_started(user_id, namespace)
|
190 |
|
191 |
-
processor = PDFProcessor(
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
result = await processor.delete_namespace()
|
193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
# Gửi thông báo kết quả qua WebSocket
|
195 |
if user_id:
|
196 |
if result.get('success'):
|
@@ -200,6 +584,8 @@ async def delete_namespace(
|
|
200 |
|
201 |
return result
|
202 |
except Exception as e:
|
|
|
|
|
203 |
# Gửi thông báo lỗi qua WebSocket
|
204 |
if user_id:
|
205 |
await send_pdf_delete_failed(user_id, namespace, str(e))
|
@@ -211,23 +597,235 @@ async def delete_namespace(
|
|
211 |
|
212 |
# Endpoint lấy danh sách tài liệu
|
213 |
@router.get("/documents", response_model=DocumentsListResponse)
|
214 |
-
async def get_documents(
|
|
|
|
|
|
|
|
|
|
|
215 |
"""
|
216 |
Lấy thông tin về tất cả tài liệu đã được embed
|
217 |
|
218 |
- **namespace**: Namespace trong Pinecone (mặc định: "Default")
|
219 |
- **index_name**: Tên index Pinecone (mặc định: "testbot768")
|
|
|
220 |
"""
|
|
|
|
|
221 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
# Khởi tạo PDF processor
|
223 |
-
processor = PDFProcessor(
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
|
225 |
-
# Lấy danh sách documents
|
226 |
-
|
227 |
|
228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
except Exception as e:
|
|
|
|
|
230 |
return DocumentsListResponse(
|
231 |
success=False,
|
232 |
error=str(e)
|
233 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import shutil
|
3 |
import uuid
|
4 |
+
import sys
|
5 |
+
import traceback
|
6 |
+
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, BackgroundTasks, Depends, Query
|
7 |
from fastapi.responses import JSONResponse
|
8 |
from typing import Optional, List, Dict, Any
|
9 |
+
from sqlalchemy.orm import Session
|
10 |
+
import os.path
|
11 |
+
import logging
|
12 |
+
import tempfile
|
13 |
+
import time
|
14 |
+
import json
|
15 |
+
from datetime import datetime
|
16 |
|
17 |
from app.utils.pdf_processor import PDFProcessor
|
18 |
from app.models.pdf_models import PDFResponse, DeleteDocumentRequest, DocumentsListResponse
|
19 |
+
from app.database.postgresql import get_db
|
20 |
+
from app.database.models import VectorDatabase, Document, VectorStatus, ApiKey, DocumentContent
|
21 |
from app.api.pdf_websocket import (
|
22 |
send_pdf_upload_started,
|
23 |
send_pdf_upload_progress,
|
|
|
28 |
send_pdf_delete_failed
|
29 |
)
|
30 |
|
31 |
+
# Setup logger
|
32 |
+
logger = logging.getLogger(__name__)
|
33 |
+
|
34 |
+
# Add a stream handler for PDF debug logging
|
35 |
+
pdf_debug_logger = logging.getLogger("pdf_debug_api")
|
36 |
+
pdf_debug_logger.setLevel(logging.DEBUG)
|
37 |
+
|
38 |
+
# Check if a stream handler already exists, add one if not
|
39 |
+
if not any(isinstance(h, logging.StreamHandler) for h in pdf_debug_logger.handlers):
|
40 |
+
stream_handler = logging.StreamHandler(sys.stdout)
|
41 |
+
stream_handler.setLevel(logging.INFO)
|
42 |
+
pdf_debug_logger.addHandler(stream_handler)
|
43 |
+
|
44 |
+
# Initialize router
|
45 |
router = APIRouter(
|
46 |
prefix="/pdf",
|
47 |
tags=["PDF Processing"],
|
48 |
)
|
49 |
|
50 |
+
# Constants - Use system temp directory instead of creating our own
|
51 |
+
TEMP_UPLOAD_DIR = tempfile.gettempdir()
|
52 |
+
STORAGE_DIR = tempfile.gettempdir() # Also use system temp for storage
|
53 |
+
|
54 |
+
USE_MOCK_MODE = False # Disabled - using real database with improved connection handling
|
55 |
+
logger.info(f"PDF API starting with USE_MOCK_MODE={USE_MOCK_MODE}")
|
56 |
+
|
57 |
+
# Helper function to log with timestamp
|
58 |
+
def log_with_timestamp(message: str, level: str = "info", error: Exception = None):
|
59 |
+
"""Add timestamps to log messages and log to the PDF debug logger if available"""
|
60 |
+
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
61 |
+
full_message = f"{timestamp} - {message}"
|
62 |
+
|
63 |
+
if level.lower() == "debug":
|
64 |
+
logger.debug(full_message)
|
65 |
+
pdf_debug_logger.debug(full_message)
|
66 |
+
elif level.lower() == "info":
|
67 |
+
logger.info(full_message)
|
68 |
+
pdf_debug_logger.info(full_message)
|
69 |
+
elif level.lower() == "warning":
|
70 |
+
logger.warning(full_message)
|
71 |
+
pdf_debug_logger.warning(full_message)
|
72 |
+
elif level.lower() == "error":
|
73 |
+
logger.error(full_message)
|
74 |
+
pdf_debug_logger.error(full_message)
|
75 |
+
if error:
|
76 |
+
logger.error(traceback.format_exc())
|
77 |
+
pdf_debug_logger.error(traceback.format_exc())
|
78 |
+
else:
|
79 |
+
logger.info(full_message)
|
80 |
+
pdf_debug_logger.info(full_message)
|
81 |
|
82 |
+
# Helper function to log debug information during upload
|
83 |
+
def log_upload_debug(correlation_id: str, message: str, error: Exception = None):
|
84 |
+
"""Log detailed debug information about PDF uploads"""
|
85 |
+
pdf_debug_logger.debug(f"[{correlation_id}] {message}")
|
86 |
+
if error:
|
87 |
+
pdf_debug_logger.error(f"[{correlation_id}] Error: {str(error)}")
|
88 |
+
pdf_debug_logger.error(traceback.format_exc())
|
89 |
|
90 |
+
# Helper function to send progress updates
|
91 |
+
async def send_progress_update(user_id, file_id, step, progress=0.0, message=""):
|
92 |
+
"""Send PDF processing progress updates via WebSocket"""
|
93 |
+
try:
|
94 |
+
await send_pdf_upload_progress(user_id, file_id, step, progress, message)
|
95 |
+
except Exception as e:
|
96 |
+
logger.error(f"Error sending progress update: {e}")
|
97 |
+
logger.error(traceback.format_exc())
|
98 |
+
|
99 |
+
# Function with fixed indentation for the troublesome parts
|
100 |
+
async def handle_pdf_processing_result(result, correlation_id, user_id, file_id, filename, document, vector_status,
|
101 |
+
vector_database_id, temp_file_path, db, is_pdf, mock_mode):
|
102 |
+
"""Fixed version of the code with proper indentation"""
|
103 |
+
# If successful, update status but don't try to permanently store files
|
104 |
+
if result.get('success'):
|
105 |
+
try:
|
106 |
+
log_upload_debug(correlation_id, f"Processed file successfully - no permanent storage in Hugging Face environment")
|
107 |
+
except Exception as move_error:
|
108 |
+
log_upload_debug(correlation_id, f"Error in storage handling: {move_error}", move_error)
|
109 |
+
|
110 |
+
# Update status in PostgreSQL
|
111 |
+
if vector_database_id and document and vector_status:
|
112 |
+
try:
|
113 |
+
log_upload_debug(correlation_id, f"Updating vector status to 'completed' for document ID {document.id}")
|
114 |
+
vector_status.status = "completed"
|
115 |
+
vector_status.embedded_at = datetime.now()
|
116 |
+
vector_status.vector_id = file_id
|
117 |
+
document.is_embedded = True
|
118 |
+
db.commit()
|
119 |
+
log_upload_debug(correlation_id, f"Database status updated successfully")
|
120 |
+
except Exception as db_error:
|
121 |
+
log_upload_debug(correlation_id, f"Error updating database status: {db_error}", db_error)
|
122 |
+
|
123 |
+
# Send completion notification via WebSocket
|
124 |
+
if user_id:
|
125 |
+
try:
|
126 |
+
await send_pdf_upload_completed(
|
127 |
+
user_id,
|
128 |
+
file_id,
|
129 |
+
filename,
|
130 |
+
result.get('chunks_processed', 0)
|
131 |
+
)
|
132 |
+
log_upload_debug(correlation_id, f"Sent upload completed notification to user {user_id}")
|
133 |
+
except Exception as ws_error:
|
134 |
+
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
135 |
+
|
136 |
+
# Add document information to the result
|
137 |
+
if document:
|
138 |
+
result["document_database_id"] = document.id
|
139 |
+
|
140 |
+
# Include mock_mode in response
|
141 |
+
result["mock_mode"] = mock_mode
|
142 |
+
else:
|
143 |
+
log_upload_debug(correlation_id, f"PDF processing failed: {result.get('error', 'Unknown error')}")
|
144 |
+
|
145 |
+
# Update error status in PostgreSQL
|
146 |
+
if vector_database_id and document and vector_status:
|
147 |
+
try:
|
148 |
+
log_upload_debug(correlation_id, f"Updating vector status to 'failed' for document ID {document.id}")
|
149 |
+
vector_status.status = "failed"
|
150 |
+
vector_status.error_message = result.get('error', 'Unknown error')
|
151 |
+
db.commit()
|
152 |
+
log_upload_debug(correlation_id, f"Database status updated for failure")
|
153 |
+
except Exception as db_error:
|
154 |
+
log_upload_debug(correlation_id, f"Error updating database status for failure: {db_error}", db_error)
|
155 |
+
|
156 |
+
# Send failure notification via WebSocket
|
157 |
+
if user_id:
|
158 |
+
try:
|
159 |
+
await send_pdf_upload_failed(
|
160 |
+
user_id,
|
161 |
+
file_id,
|
162 |
+
filename,
|
163 |
+
result.get('error', 'Unknown error')
|
164 |
+
)
|
165 |
+
log_upload_debug(correlation_id, f"Sent upload failed notification to user {user_id}")
|
166 |
+
except Exception as ws_error:
|
167 |
+
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
168 |
+
|
169 |
+
# Cleanup: delete temporary file if it still exists
|
170 |
+
if os.path.exists(temp_file_path):
|
171 |
+
try:
|
172 |
+
os.remove(temp_file_path)
|
173 |
+
log_upload_debug(correlation_id, f"Removed temporary file {temp_file_path}")
|
174 |
+
except Exception as cleanup_error:
|
175 |
+
log_upload_debug(correlation_id, f"Error removing temporary file: {cleanup_error}", cleanup_error)
|
176 |
+
|
177 |
+
log_upload_debug(correlation_id, f"Upload request completed with success={result.get('success', False)}")
|
178 |
+
return result
|
179 |
+
|
180 |
+
# Endpoint for uploading and processing PDFs
|
181 |
@router.post("/upload", response_model=PDFResponse)
|
182 |
async def upload_pdf(
|
183 |
file: UploadFile = File(...),
|
|
|
186 |
title: Optional[str] = Form(None),
|
187 |
description: Optional[str] = Form(None),
|
188 |
user_id: Optional[str] = Form(None),
|
189 |
+
vector_database_id: Optional[int] = Form(None),
|
190 |
+
content_type: Optional[str] = Form(None), # Add content_type parameter
|
191 |
+
background_tasks: BackgroundTasks = None,
|
192 |
+
mock_mode: bool = Form(False), # Set to False to use real database
|
193 |
+
db: Session = Depends(get_db)
|
194 |
):
|
195 |
"""
|
196 |
+
Upload and process PDF file to create embeddings and store in Pinecone
|
197 |
|
198 |
+
- **file**: PDF file to process
|
199 |
+
- **namespace**: Namespace in Pinecone to store embeddings (default: "Default")
|
200 |
+
- **index_name**: Name of Pinecone index (default: "testbot768")
|
201 |
+
- **title**: Document title (optional)
|
202 |
+
- **description**: Document description (optional)
|
203 |
+
- **user_id**: User ID for WebSocket status updates
|
204 |
+
- **vector_database_id**: ID of vector database in PostgreSQL (optional)
|
205 |
+
- **content_type**: Content type of the file (optional)
|
206 |
+
- **mock_mode**: Simulate Pinecone operations instead of performing real calls (default: false)
|
207 |
"""
|
208 |
+
# Generate request ID for tracking
|
209 |
+
correlation_id = str(uuid.uuid4())[:8]
|
210 |
+
logger.info(f"[{correlation_id}] PDF upload request received: ns={namespace}, index={index_name}, user={user_id}")
|
211 |
+
log_upload_debug(correlation_id, f"Upload request: vector_db_id={vector_database_id}, mock_mode={mock_mode}")
|
212 |
+
|
213 |
try:
|
214 |
+
# Check file type - accept both PDF and plaintext for testing
|
215 |
+
is_pdf = file.filename.lower().endswith('.pdf')
|
216 |
+
is_text = file.filename.lower().endswith(('.txt', '.md', '.html'))
|
217 |
+
|
218 |
+
log_upload_debug(correlation_id, f"File type check: is_pdf={is_pdf}, is_text={is_text}, filename={file.filename}")
|
219 |
+
|
220 |
+
if not (is_pdf or is_text):
|
221 |
+
if not mock_mode:
|
222 |
+
# In real mode, only accept PDFs
|
223 |
+
log_upload_debug(correlation_id, f"Rejecting non-PDF file in real mode: {file.filename}")
|
224 |
+
raise HTTPException(status_code=400, detail="Only PDF files are accepted")
|
225 |
+
else:
|
226 |
+
# In mock mode, convert any file to text for testing
|
227 |
+
logger.warning(f"[{correlation_id}] Non-PDF file uploaded in mock mode: {file.filename} - will treat as text")
|
228 |
+
|
229 |
+
# If vector_database_id provided, get info from PostgreSQL
|
230 |
+
api_key = None
|
231 |
+
vector_db = None
|
232 |
|
233 |
+
if vector_database_id:
|
234 |
+
log_upload_debug(correlation_id, f"Looking up vector database ID {vector_database_id}")
|
235 |
+
|
236 |
+
vector_db = db.query(VectorDatabase).filter(
|
237 |
+
VectorDatabase.id == vector_database_id,
|
238 |
+
VectorDatabase.status == "active"
|
239 |
+
).first()
|
240 |
+
if not vector_db:
|
241 |
+
return PDFResponse(
|
242 |
+
success=False,
|
243 |
+
error=f"Vector database with ID {vector_database_id} not found or inactive"
|
244 |
+
)
|
245 |
+
|
246 |
+
log_upload_debug(correlation_id, f"Found vector database: id={vector_db.id}, name={vector_db.name}, index={vector_db.pinecone_index}")
|
247 |
+
|
248 |
+
# Use vector database information
|
249 |
+
# Try to get API key from relationship
|
250 |
+
log_upload_debug(correlation_id, f"Trying to get API key for vector database {vector_database_id}")
|
251 |
+
|
252 |
+
# Log available attributes
|
253 |
+
vector_db_attrs = dir(vector_db)
|
254 |
+
log_upload_debug(correlation_id, f"Vector DB attributes: {vector_db_attrs}")
|
255 |
+
|
256 |
+
if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
|
257 |
+
log_upload_debug(correlation_id, f"Using API key from relationship for vector database ID {vector_database_id}")
|
258 |
+
log_upload_debug(correlation_id, f"api_key_ref type: {type(vector_db.api_key_ref)}")
|
259 |
+
log_upload_debug(correlation_id, f"api_key_ref attributes: {dir(vector_db.api_key_ref)}")
|
260 |
+
|
261 |
+
if hasattr(vector_db.api_key_ref, 'key_value'):
|
262 |
+
api_key = vector_db.api_key_ref.key_value
|
263 |
+
# Log first few chars of API key for debugging
|
264 |
+
key_prefix = api_key[:4] + "..." if api_key and len(api_key) > 4 else "invalid/empty"
|
265 |
+
log_upload_debug(correlation_id, f"API key retrieved: {key_prefix}, length: {len(api_key) if api_key else 0}")
|
266 |
+
logger.info(f"[{correlation_id}] Using API key from relationship for vector database ID {vector_database_id}")
|
267 |
+
else:
|
268 |
+
log_upload_debug(correlation_id, f"api_key_ref does not have key_value attribute")
|
269 |
+
elif hasattr(vector_db, 'api_key') and vector_db.api_key:
|
270 |
+
# Fallback to direct api_key if needed (deprecated)
|
271 |
+
api_key = vector_db.api_key
|
272 |
+
key_prefix = api_key[:4] + "..." if api_key and len(api_key) > 4 else "invalid/empty"
|
273 |
+
log_upload_debug(correlation_id, f"Using deprecated direct api_key: {key_prefix}")
|
274 |
+
logger.warning(f"[{correlation_id}] Using deprecated direct api_key for vector database ID {vector_database_id}")
|
275 |
+
else:
|
276 |
+
log_upload_debug(correlation_id, "No API key found in vector database")
|
277 |
+
|
278 |
+
# Use index from vector database
|
279 |
+
index_name = vector_db.pinecone_index
|
280 |
+
log_upload_debug(correlation_id, f"Using index name '{index_name}' from vector database")
|
281 |
+
logger.info(f"[{correlation_id}] Using index name '{index_name}' from vector database")
|
282 |
+
|
283 |
+
# Generate file_id and save temporary file
|
284 |
file_id = str(uuid.uuid4())
|
285 |
+
temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{file_id}{'.pdf' if is_pdf else '.txt'}")
|
286 |
+
log_upload_debug(correlation_id, f"Generated file_id: {file_id}, temp path: {temp_file_path}")
|
287 |
|
288 |
+
# Send notification of upload start via WebSocket if user_id provided
|
289 |
if user_id:
|
290 |
+
try:
|
291 |
+
await send_pdf_upload_started(user_id, file.filename, file_id)
|
292 |
+
log_upload_debug(correlation_id, f"Sent upload started notification to user {user_id}")
|
293 |
+
except Exception as ws_error:
|
294 |
+
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
295 |
+
|
296 |
+
# Save file
|
297 |
+
log_upload_debug(correlation_id, f"Reading file content")
|
298 |
+
file_content = await file.read()
|
299 |
+
log_upload_debug(correlation_id, f"File size: {len(file_content)} bytes")
|
300 |
|
|
|
301 |
with open(temp_file_path, "wb") as buffer:
|
302 |
+
buffer.write(file_content)
|
303 |
+
log_upload_debug(correlation_id, f"File saved to {temp_file_path}")
|
304 |
|
305 |
+
# Create metadata
|
306 |
metadata = {
|
307 |
"filename": file.filename,
|
308 |
"content_type": file.content_type
|
309 |
}
|
310 |
|
311 |
+
# Use provided content_type or fallback to file.content_type
|
312 |
+
actual_content_type = content_type or file.content_type
|
313 |
+
log_upload_debug(correlation_id, f"Using content_type: {actual_content_type}")
|
314 |
+
|
315 |
+
if not actual_content_type:
|
316 |
+
# Fallback content type based on file extension
|
317 |
+
if is_pdf:
|
318 |
+
actual_content_type = "application/pdf"
|
319 |
+
elif is_text:
|
320 |
+
actual_content_type = "text/plain"
|
321 |
+
else:
|
322 |
+
actual_content_type = "application/octet-stream"
|
323 |
+
|
324 |
+
log_upload_debug(correlation_id, f"No content_type provided, using fallback: {actual_content_type}")
|
325 |
+
|
326 |
+
metadata["content_type"] = actual_content_type
|
327 |
+
|
328 |
if title:
|
329 |
metadata["title"] = title
|
330 |
+
else:
|
331 |
+
# Use filename as title if not provided
|
332 |
+
title = file.filename
|
333 |
+
metadata["title"] = title
|
334 |
+
|
335 |
if description:
|
336 |
metadata["description"] = description
|
337 |
|
338 |
+
# Send progress update via WebSocket
|
339 |
if user_id:
|
340 |
+
try:
|
341 |
+
await send_progress_update(
|
342 |
user_id,
|
343 |
file_id,
|
344 |
"file_preparation",
|
345 |
0.2,
|
346 |
"File saved, preparing for processing"
|
347 |
)
|
348 |
+
log_upload_debug(correlation_id, f"Sent file preparation progress to user {user_id}")
|
349 |
+
except Exception as ws_error:
|
350 |
+
log_upload_debug(correlation_id, f"Error sending progress update: {ws_error}", ws_error)
|
351 |
+
|
352 |
+
# Create document record - do this regardless of mock mode
|
353 |
+
document = None
|
354 |
+
vector_status = None
|
355 |
+
|
356 |
+
if vector_database_id and vector_db:
|
357 |
+
log_upload_debug(correlation_id, f"Creating PostgreSQL records for document with vector_database_id={vector_database_id}")
|
358 |
|
359 |
+
# Create document record without file content
|
360 |
+
try:
|
361 |
+
document = Document(
|
362 |
+
name=title or file.filename,
|
363 |
+
file_type="pdf" if is_pdf else "text",
|
364 |
+
content_type=actual_content_type, # Use the actual_content_type here
|
365 |
+
size=len(file_content),
|
366 |
+
is_embedded=False,
|
367 |
+
vector_database_id=vector_database_id
|
368 |
+
)
|
369 |
+
db.add(document)
|
370 |
+
db.commit()
|
371 |
+
db.refresh(document)
|
372 |
+
log_upload_debug(correlation_id, f"Created document record: id={document.id}")
|
373 |
+
except Exception as doc_error:
|
374 |
+
log_upload_debug(correlation_id, f"Error creating document record: {doc_error}", doc_error)
|
375 |
+
raise
|
376 |
+
|
377 |
+
# Create document content record to store binary data separately
|
378 |
+
try:
|
379 |
+
document_content = DocumentContent(
|
380 |
+
document_id=document.id,
|
381 |
+
file_content=file_content
|
382 |
+
)
|
383 |
+
db.add(document_content)
|
384 |
+
db.commit()
|
385 |
+
log_upload_debug(correlation_id, f"Created document content record for document ID {document.id}")
|
386 |
+
except Exception as content_error:
|
387 |
+
log_upload_debug(correlation_id, f"Error creating document content: {content_error}", content_error)
|
388 |
+
raise
|
389 |
+
|
390 |
+
# Create vector status record
|
391 |
+
try:
|
392 |
+
vector_status = VectorStatus(
|
393 |
+
document_id=document.id,
|
394 |
+
vector_database_id=vector_database_id,
|
395 |
+
status="pending"
|
396 |
+
)
|
397 |
+
db.add(vector_status)
|
398 |
+
db.commit()
|
399 |
+
log_upload_debug(correlation_id, f"Created vector status record for document ID {document.id}")
|
400 |
+
except Exception as status_error:
|
401 |
+
log_upload_debug(correlation_id, f"Error creating vector status: {status_error}", status_error)
|
402 |
+
raise
|
403 |
+
|
404 |
+
logger.info(f"[{correlation_id}] Created document ID {document.id} and vector status in PostgreSQL")
|
405 |
+
|
406 |
+
# Initialize PDF processor with correct parameters
|
407 |
+
log_upload_debug(correlation_id, f"Initializing PDFProcessor: index={index_name}, vector_db_id={vector_database_id}, mock_mode={mock_mode}")
|
408 |
+
processor = PDFProcessor(
|
409 |
+
index_name=index_name,
|
410 |
+
namespace=namespace,
|
411 |
+
api_key=api_key,
|
412 |
+
vector_db_id=vector_database_id,
|
413 |
+
mock_mode=mock_mode,
|
414 |
+
correlation_id=correlation_id
|
415 |
+
)
|
416 |
|
417 |
+
# Send embedding start notification via WebSocket
|
418 |
if user_id:
|
419 |
+
try:
|
420 |
+
await send_progress_update(
|
421 |
user_id,
|
422 |
file_id,
|
423 |
"embedding_start",
|
424 |
0.4,
|
425 |
"Starting to process PDF and create embeddings"
|
426 |
)
|
427 |
+
log_upload_debug(correlation_id, f"Sent embedding start notification to user {user_id}")
|
428 |
+
except Exception as ws_error:
|
429 |
+
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
430 |
|
431 |
+
# Process PDF and create embeddings with progress callback
|
432 |
+
log_upload_debug(correlation_id, f"Processing PDF with file_path={temp_file_path}, document_id={file_id}")
|
|
|
|
|
|
|
|
|
|
|
433 |
result = await processor.process_pdf(
|
434 |
file_path=temp_file_path,
|
435 |
document_id=file_id,
|
436 |
metadata=metadata,
|
437 |
+
progress_callback=send_progress_update if user_id else None
|
438 |
)
|
439 |
|
440 |
+
log_upload_debug(correlation_id, f"PDF processing result: {result}")
|
441 |
+
|
442 |
+
# Handle PDF processing result
|
443 |
+
return await handle_pdf_processing_result(result, correlation_id, user_id, file_id, file.filename, document, vector_status,
|
444 |
+
vector_database_id, temp_file_path, db, is_pdf, mock_mode)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
except Exception as e:
|
446 |
+
return await handle_upload_error(e, correlation_id, temp_file_path, user_id, file_id, file.filename, vector_database_id, vector_status, db, mock_mode)
|
447 |
+
|
448 |
+
# Error handling for upload_pdf function
|
449 |
+
async def handle_upload_error(e, correlation_id, temp_file_path, user_id, file_id, filename, vector_database_id, vector_status, db, mock_mode):
|
450 |
+
"""Fixed version of the error handling part with proper indentation"""
|
451 |
+
log_upload_debug(correlation_id, f"Error in upload_pdf: {str(e)}", e)
|
452 |
+
logger.exception(f"[{correlation_id}] Error in upload_pdf: {str(e)}")
|
453 |
+
|
454 |
+
# Cleanup on error
|
455 |
+
if os.path.exists(temp_file_path):
|
456 |
+
try:
|
457 |
os.remove(temp_file_path)
|
458 |
+
log_upload_debug(correlation_id, f"Cleaned up temp file after error: {temp_file_path}")
|
459 |
+
except Exception as cleanup_error:
|
460 |
+
log_upload_debug(correlation_id, f"Error cleaning up temporary file: {cleanup_error}", cleanup_error)
|
461 |
+
|
462 |
+
# Update error status in PostgreSQL
|
463 |
+
if vector_database_id and vector_status:
|
464 |
+
try:
|
465 |
+
vector_status.status = "failed"
|
466 |
+
vector_status.error_message = str(e)
|
467 |
+
db.commit()
|
468 |
+
log_upload_debug(correlation_id, f"Updated database with error status")
|
469 |
+
except Exception as db_error:
|
470 |
+
log_upload_debug(correlation_id, f"Error updating database with error status: {db_error}", db_error)
|
471 |
+
|
472 |
+
# Send failure notification via WebSocket
|
473 |
+
if user_id and file_id:
|
474 |
+
try:
|
475 |
await send_pdf_upload_failed(
|
476 |
user_id,
|
477 |
file_id,
|
478 |
+
filename,
|
479 |
str(e)
|
480 |
)
|
481 |
+
log_upload_debug(correlation_id, f"Sent failure notification for exception")
|
482 |
+
except Exception as ws_error:
|
483 |
+
log_upload_debug(correlation_id, f"Error sending WebSocket notification for failure: {ws_error}", ws_error)
|
484 |
|
485 |
+
log_upload_debug(correlation_id, f"Upload request failed with exception: {str(e)}")
|
486 |
+
return PDFResponse(
|
487 |
+
success=False,
|
488 |
+
error=str(e),
|
489 |
+
mock_mode=mock_mode
|
490 |
+
)
|
|
|
|
|
|
|
491 |
|
492 |
# Endpoint xóa tài liệu
|
493 |
@router.delete("/namespace", response_model=PDFResponse)
|
494 |
async def delete_namespace(
|
495 |
namespace: str = "Default",
|
496 |
index_name: str = "testbot768",
|
497 |
+
vector_database_id: Optional[int] = None,
|
498 |
+
user_id: Optional[str] = None,
|
499 |
+
db: Session = Depends(get_db)
|
500 |
):
|
501 |
"""
|
502 |
Xóa toàn bộ embeddings trong một namespace từ Pinecone (tương ứng xoá namespace)
|
503 |
|
504 |
- **namespace**: Namespace trong Pinecone (mặc định: "Default")
|
505 |
- **index_name**: Tên index Pinecone (mặc định: "testbot768")
|
506 |
+
- **vector_database_id**: ID của vector database trong PostgreSQL (nếu có)
|
507 |
- **user_id**: ID của người dùng để cập nhật trạng thái qua WebSocket
|
508 |
"""
|
509 |
+
logger.info(f"Delete namespace request: namespace={namespace}, index={index_name}, vector_db_id={vector_database_id}")
|
510 |
+
|
511 |
try:
|
512 |
+
# Nếu có vector_database_id, lấy thông tin từ PostgreSQL
|
513 |
+
api_key = None
|
514 |
+
vector_db = None
|
515 |
+
mock_mode = False # Use real mode by default
|
516 |
+
|
517 |
+
if vector_database_id:
|
518 |
+
vector_db = db.query(VectorDatabase).filter(
|
519 |
+
VectorDatabase.id == vector_database_id,
|
520 |
+
VectorDatabase.status == "active"
|
521 |
+
).first()
|
522 |
+
if not vector_db:
|
523 |
+
return PDFResponse(
|
524 |
+
success=False,
|
525 |
+
error=f"Vector database with ID {vector_database_id} not found or inactive"
|
526 |
+
)
|
527 |
+
|
528 |
+
# Use index from vector database
|
529 |
+
index_name = vector_db.pinecone_index
|
530 |
+
|
531 |
+
# Get API key
|
532 |
+
if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
|
533 |
+
api_key = vector_db.api_key_ref.key_value
|
534 |
+
elif hasattr(vector_db, 'api_key') and vector_db.api_key:
|
535 |
+
api_key = vector_db.api_key
|
536 |
+
|
537 |
+
# Use namespace based on vector database ID
|
538 |
+
namespace = f"vdb-{vector_database_id}" if vector_database_id else namespace
|
539 |
+
logger.info(f"Using namespace '{namespace}' based on vector database ID")
|
540 |
+
|
541 |
# Gửi thông báo bắt đầu xóa qua WebSocket
|
542 |
if user_id:
|
543 |
await send_pdf_delete_started(user_id, namespace)
|
544 |
|
545 |
+
processor = PDFProcessor(
|
546 |
+
index_name=index_name,
|
547 |
+
namespace=namespace,
|
548 |
+
api_key=api_key,
|
549 |
+
vector_db_id=vector_database_id,
|
550 |
+
mock_mode=mock_mode
|
551 |
+
)
|
552 |
result = await processor.delete_namespace()
|
553 |
|
554 |
+
# If in mock mode, also update PostgreSQL to reflect the deletion
|
555 |
+
if mock_mode and result.get('success') and vector_database_id:
|
556 |
+
try:
|
557 |
+
# Update vector statuses for this database
|
558 |
+
affected_count = db.query(VectorStatus).filter(
|
559 |
+
VectorStatus.vector_database_id == vector_database_id,
|
560 |
+
VectorStatus.status != "deleted"
|
561 |
+
).update({"status": "deleted", "updated_at": datetime.now()})
|
562 |
+
|
563 |
+
# Update document embedding status
|
564 |
+
db.query(Document).filter(
|
565 |
+
Document.vector_database_id == vector_database_id,
|
566 |
+
Document.is_embedded == True
|
567 |
+
).update({"is_embedded": False})
|
568 |
+
|
569 |
+
db.commit()
|
570 |
+
logger.info(f"Updated {affected_count} vector statuses to 'deleted'")
|
571 |
+
|
572 |
+
# Include this info in the result
|
573 |
+
result["updated_records"] = affected_count
|
574 |
+
except Exception as db_error:
|
575 |
+
logger.error(f"Error updating PostgreSQL records after namespace deletion: {db_error}")
|
576 |
+
result["postgresql_update_error"] = str(db_error)
|
577 |
+
|
578 |
# Gửi thông báo kết quả qua WebSocket
|
579 |
if user_id:
|
580 |
if result.get('success'):
|
|
|
584 |
|
585 |
return result
|
586 |
except Exception as e:
|
587 |
+
logger.exception(f"Error in delete_namespace: {str(e)}")
|
588 |
+
|
589 |
# Gửi thông báo lỗi qua WebSocket
|
590 |
if user_id:
|
591 |
await send_pdf_delete_failed(user_id, namespace, str(e))
|
|
|
597 |
|
598 |
# Endpoint lấy danh sách tài liệu
|
599 |
@router.get("/documents", response_model=DocumentsListResponse)
|
600 |
+
async def get_documents(
|
601 |
+
namespace: str = "Default",
|
602 |
+
index_name: str = "testbot768",
|
603 |
+
vector_database_id: Optional[int] = None,
|
604 |
+
db: Session = Depends(get_db)
|
605 |
+
):
|
606 |
"""
|
607 |
Lấy thông tin về tất cả tài liệu đã được embed
|
608 |
|
609 |
- **namespace**: Namespace trong Pinecone (mặc định: "Default")
|
610 |
- **index_name**: Tên index Pinecone (mặc định: "testbot768")
|
611 |
+
- **vector_database_id**: ID của vector database trong PostgreSQL (nếu có)
|
612 |
"""
|
613 |
+
logger.info(f"Get documents request: namespace={namespace}, index={index_name}, vector_db_id={vector_database_id}")
|
614 |
+
|
615 |
try:
|
616 |
+
# Nếu có vector_database_id, lấy thông tin từ PostgreSQL
|
617 |
+
api_key = None
|
618 |
+
vector_db = None
|
619 |
+
mock_mode = False # Use real mode by default
|
620 |
+
|
621 |
+
if vector_database_id:
|
622 |
+
vector_db = db.query(VectorDatabase).filter(
|
623 |
+
VectorDatabase.id == vector_database_id,
|
624 |
+
VectorDatabase.status == "active"
|
625 |
+
).first()
|
626 |
+
|
627 |
+
if not vector_db:
|
628 |
+
return DocumentsListResponse(
|
629 |
+
success=False,
|
630 |
+
error=f"Vector database with ID {vector_database_id} not found or inactive"
|
631 |
+
)
|
632 |
+
|
633 |
+
# Use index from vector database
|
634 |
+
index_name = vector_db.pinecone_index
|
635 |
+
|
636 |
+
# Get API key
|
637 |
+
if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
|
638 |
+
api_key = vector_db.api_key_ref.key_value
|
639 |
+
elif hasattr(vector_db, 'api_key') and vector_db.api_key:
|
640 |
+
api_key = vector_db.api_key
|
641 |
+
|
642 |
+
# Use namespace based on vector database ID
|
643 |
+
namespace = f"vdb-{vector_database_id}" if vector_database_id else namespace
|
644 |
+
logger.info(f"Using namespace '{namespace}' based on vector database ID")
|
645 |
+
|
646 |
# Khởi tạo PDF processor
|
647 |
+
processor = PDFProcessor(
|
648 |
+
index_name=index_name,
|
649 |
+
namespace=namespace,
|
650 |
+
api_key=api_key,
|
651 |
+
vector_db_id=vector_database_id,
|
652 |
+
mock_mode=mock_mode
|
653 |
+
)
|
654 |
|
655 |
+
# Lấy danh sách documents từ Pinecone
|
656 |
+
pinecone_result = await processor.list_documents()
|
657 |
|
658 |
+
# If vector_database_id is provided, also fetch from PostgreSQL
|
659 |
+
if vector_database_id:
|
660 |
+
try:
|
661 |
+
# Get all successfully embedded documents for this vector database
|
662 |
+
documents = db.query(Document).join(
|
663 |
+
VectorStatus, Document.id == VectorStatus.document_id
|
664 |
+
).filter(
|
665 |
+
Document.vector_database_id == vector_database_id,
|
666 |
+
Document.is_embedded == True,
|
667 |
+
VectorStatus.status == "completed"
|
668 |
+
).all()
|
669 |
+
|
670 |
+
# Add document info to the result
|
671 |
+
if documents:
|
672 |
+
pinecone_result["postgresql_documents"] = [
|
673 |
+
{
|
674 |
+
"id": doc.id,
|
675 |
+
"name": doc.name,
|
676 |
+
"file_type": doc.file_type,
|
677 |
+
"content_type": doc.content_type,
|
678 |
+
"created_at": doc.created_at.isoformat() if doc.created_at else None
|
679 |
+
}
|
680 |
+
for doc in documents
|
681 |
+
]
|
682 |
+
pinecone_result["postgresql_document_count"] = len(documents)
|
683 |
+
+        except Exception as db_error:
+            logger.error(f"Error fetching PostgreSQL documents: {db_error}")
+            pinecone_result["postgresql_error"] = str(db_error)
+
+        return pinecone_result
     except Exception as e:
+        logger.exception(f"Error in get_documents: {str(e)}")
+
         return DocumentsListResponse(
             success=False,
             error=str(e)
+        )
+
+# Health check endpoint for PDF API
+@router.get("/health")
+async def health_check():
+    return {
+        "status": "healthy",
+        "version": "1.0.0",
+        "message": "PDF API is running"
+    }
+
+# Document deletion endpoint
+@router.delete("/document", response_model=PDFResponse)
+async def delete_document(
+    document_id: str,
+    namespace: str = "Default",
+    index_name: str = "testbot768",
+    vector_database_id: Optional[int] = None,
+    user_id: Optional[str] = None,
+    mock_mode: bool = False,
+    db: Session = Depends(get_db)
+):
+    """
+    Delete vectors for a specific document from the vector database
+
+    - **document_id**: ID of the document to delete
+    - **namespace**: Namespace in the vector database (default: "Default")
+    - **index_name**: Name of the vector index (default: "testbot768")
+    - **vector_database_id**: ID of vector database in PostgreSQL (optional)
+    - **user_id**: User ID for WebSocket status updates (optional)
+    - **mock_mode**: Simulate vector database operations (default: false)
+    """
+    logger.info(f"Delete document request: document_id={document_id}, namespace={namespace}, index={index_name}, vector_db_id={vector_database_id}, mock_mode={mock_mode}")
+
+    try:
+        # If vector_database_id is provided, get info from PostgreSQL
+        api_key = None
+        vector_db = None
+
+        if vector_database_id:
+            vector_db = db.query(VectorDatabase).filter(
+                VectorDatabase.id == vector_database_id,
+                VectorDatabase.status == "active"
+            ).first()
+            if not vector_db:
+                return PDFResponse(
+                    success=False,
+                    error=f"Vector database with ID {vector_database_id} not found or inactive"
+                )
+
+            # Use index from vector database
+            index_name = vector_db.pinecone_index
+
+            # Get API key
+            if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
+                api_key = vector_db.api_key_ref.key_value
+            elif hasattr(vector_db, 'api_key') and vector_db.api_key:
+                api_key = vector_db.api_key
+
+        # Use namespace based on vector database ID
+        namespace = f"vdb-{vector_database_id}" if vector_database_id else namespace
+        logger.info(f"Using namespace '{namespace}' based on vector database ID")
+
+        # Send notification of deletion start via WebSocket if user_id provided
+        if user_id:
+            try:
+                await send_pdf_delete_started(user_id, document_id)
+            except Exception as ws_error:
+                logger.error(f"Error sending WebSocket notification: {ws_error}")
+
+        # Initialize PDF processor
+        processor = PDFProcessor(
+            index_name=index_name,
+            namespace=namespace,
+            api_key=api_key,
+            vector_db_id=vector_database_id,
+            mock_mode=mock_mode
+        )
+
+        # Delete document vectors
+        result = await processor.delete_document(document_id)
+
+        # If successful and vector_database_id is provided, update PostgreSQL records
+        if result.get('success') and vector_database_id:
+            try:
+                # Find document by vector ID if it exists
+                document = db.query(Document).join(
+                    VectorStatus, Document.id == VectorStatus.document_id
+                ).filter(
+                    Document.vector_database_id == vector_database_id,
+                    VectorStatus.vector_id == document_id
+                ).first()
+
+                if document:
+                    # Update vector status
+                    vector_status = db.query(VectorStatus).filter(
+                        VectorStatus.document_id == document.id,
+                        VectorStatus.vector_database_id == vector_database_id
+                    ).first()
+
+                    if vector_status:
+                        vector_status.status = "deleted"
+                        db.commit()
+                        result["postgresql_updated"] = True
+                        logger.info(f"Updated vector status for document ID {document.id} to 'deleted'")
+            except Exception as db_error:
+                logger.error(f"Error updating PostgreSQL records: {db_error}")
+                result["postgresql_error"] = str(db_error)
+
+        # Send notification of deletion completion via WebSocket if user_id provided
+        if user_id:
+            try:
+                if result.get('success'):
+                    await send_pdf_delete_completed(user_id, document_id)
+                else:
+                    await send_pdf_delete_failed(user_id, document_id, result.get('error', 'Unknown error'))
+            except Exception as ws_error:
+                logger.error(f"Error sending WebSocket notification: {ws_error}")
+
+        return result
+    except Exception as e:
+        logger.exception(f"Error in delete_document: {str(e)}")
+
+        # Send notification of deletion failure via WebSocket if user_id provided
+        if user_id:
+            try:
+                await send_pdf_delete_failed(user_id, document_id, str(e))
+            except Exception as ws_error:
+                logger.error(f"Error sending WebSocket notification: {ws_error}")
+
+        return PDFResponse(
+            success=False,
+            error=str(e),
+            mock_mode=mock_mode
+        )
+
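For quick verification, here is a minimal client sketch for the deletion endpoint added above. It assumes the API runs at `http://localhost:8000` with this router mounted under `/pdf`; the document ID and vector database ID are placeholders:

```python
import requests

# Placeholder values: substitute a real document ID and vector database ID
response = requests.delete(
    'http://localhost:8000/pdf/document',
    params={
        'document_id': 'your-document-uuid',
        'namespace': 'Default',
        'vector_database_id': 9,
        'mock_mode': 'false'
    }
)
print(f'Status: {response.status_code}')
print(f'Response: {response.json()}')
```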
app/api/pdf_websocket.py
CHANGED
@@ -108,7 +108,184 @@ class ConnectionManager:
 # Create a ConnectionManager instance
 manager = ConnectionManager()
 
-@router.websocket("/ws/pdf/{user_id}")
+# Test route for manual WebSocket sending
+@router.get("/ws/test/{user_id}")
+async def test_websocket_send(user_id: str):
+    """
+    Test route to manually send a WebSocket message to a user
+    This is useful for debugging WebSocket connections
+    """
+    logger.info(f"Attempting to send test message to user: {user_id}")
+
+    # Check if user has a connection
+    status = manager.get_connection_status(user_id)
+    if not status["active"]:
+        logger.warning(f"No active WebSocket connection for user: {user_id}")
+        return {"success": False, "message": f"No active WebSocket connection for user: {user_id}"}
+
+    # Send test message
+    await manager.send_message({
+        "type": "test_message",
+        "message": "This is a test WebSocket message",
+        "timestamp": int(time.time())
+    }, user_id)
+
+    logger.info(f"Test message sent to user: {user_id}")
+    return {"success": True, "message": f"Test message sent to user: {user_id}"}
+
+@router.websocket("/ws/pdf/{user_id}")
+async def websocket_endpoint(websocket: WebSocket, user_id: str):
+    """WebSocket endpoint for PDF processing progress updates"""
+    logger.info(f"WebSocket connection request received for user: {user_id}")
+
+    try:
+        await manager.connect(websocket, user_id)
+        logger.info(f"WebSocket connection accepted for user: {user_id}")
+
+        # Send a test message to confirm connection
+        await manager.send_message({
+            "type": "connection_established",
+            "message": "WebSocket connection established successfully",
+            "user_id": user_id,
+            "timestamp": int(time.time())
+        }, user_id)
+
+        try:
+            while True:
+                # Wait for messages from the client (only to keep the connection alive)
+                data = await websocket.receive_text()
+                logger.debug(f"Received from client: {data}")
+
+                # Echo back to confirm receipt
+                if data != "heartbeat":  # Don't echo heartbeats
+                    await manager.send_message({
+                        "type": "echo",
+                        "message": f"Received: {data}",
+                        "timestamp": int(time.time())
+                    }, user_id)
+        except WebSocketDisconnect:
+            logger.info(f"WebSocket disconnected for user: {user_id}")
+            manager.disconnect(websocket, user_id)
+        except Exception as e:
+            logger.error(f"WebSocket error: {str(e)}")
+            manager.disconnect(websocket, user_id)
+    except Exception as e:
+        logger.error(f"Failed to establish WebSocket connection: {str(e)}")
+        # Ensure the connection is closed properly
+        if websocket.client_state != 4:  # 4 = CLOSED
+            await websocket.close(code=1011, reason=f"Server error: {str(e)}")
+
+import logging
+from typing import Dict, List, Optional, Any
+from fastapi import WebSocket, WebSocketDisconnect, APIRouter
+from pydantic import BaseModel
+import json
+import time
+
+# Configure logging
+logger = logging.getLogger(__name__)
+
+# Models for Swagger documentation
+class ConnectionStatus(BaseModel):
+    user_id: str
+    active: bool
+    connection_count: int
+    last_activity: Optional[float] = None
+
+class UserConnection(BaseModel):
+    user_id: str
+    connection_count: int
+
+class AllConnectionsStatus(BaseModel):
+    total_users: int
+    total_connections: int
+    users: List[UserConnection]
+
+# Initialize router
+router = APIRouter(
+    prefix="",
+    tags=["WebSockets"],
+)
+
+class ConnectionManager:
+    """Manage WebSocket connections"""
+
+    def __init__(self):
+        # Store connections by user_id
+        self.active_connections: Dict[str, List[WebSocket]] = {}
+
+    async def connect(self, websocket: WebSocket, user_id: str):
+        """Connect a new WebSocket"""
+        await websocket.accept()
+        if user_id not in self.active_connections:
+            self.active_connections[user_id] = []
+        self.active_connections[user_id].append(websocket)
+        logger.info(f"New WebSocket connection for user {user_id}. Total connections: {len(self.active_connections[user_id])}")
+
+    def disconnect(self, websocket: WebSocket, user_id: str):
+        """Disconnect a WebSocket"""
+        if user_id in self.active_connections:
+            if websocket in self.active_connections[user_id]:
+                self.active_connections[user_id].remove(websocket)
+            # Remove user_id from the dict if no connections remain
+            if not self.active_connections[user_id]:
+                del self.active_connections[user_id]
+        logger.info(f"WebSocket disconnected for user {user_id}")
+
+    async def send_message(self, message: Dict[str, Any], user_id: str):
+        """Send a message to all connections of a user"""
+        if user_id in self.active_connections:
+            disconnected_websockets = []
+            for websocket in self.active_connections[user_id]:
+                try:
+                    await websocket.send_text(json.dumps(message))
+                except Exception as e:
+                    logger.error(f"Error sending message to WebSocket: {str(e)}")
+                    disconnected_websockets.append(websocket)
+
+            # Remove the dropped connections
+            for websocket in disconnected_websockets:
+                self.disconnect(websocket, user_id)
+
+    def get_connection_status(self, user_id: str = None) -> Dict[str, Any]:
+        """Get information about WebSocket connection status"""
+        if user_id:
+            # Return connection info for a specific user
+            if user_id in self.active_connections:
+                return {
+                    "user_id": user_id,
+                    "active": True,
+                    "connection_count": len(self.active_connections[user_id]),
+                    "last_activity": time.time()
+                }
+            else:
+                return {
+                    "user_id": user_id,
+                    "active": False,
+                    "connection_count": 0,
+                    "last_activity": None
+                }
+        else:
+            # Return info for all connections
+            result = {
+                "total_users": len(self.active_connections),
+                "total_connections": sum(len(connections) for connections in self.active_connections.values()),
+                "users": []
+            }
+
+            for uid, connections in self.active_connections.items():
+                result["users"].append({
+                    "user_id": uid,
+                    "connection_count": len(connections)
+                })
+
+            return result
+
+
+# Create a ConnectionManager instance
+manager = ConnectionManager()
+
+@router.websocket("/ws/pdf/{user_id}")
 async def websocket_endpoint(websocket: WebSocket, user_id: str):
     """WebSocket endpoint for PDF processing progress updates"""
     await manager.connect(websocket, user_id)
@@ -123,7 +300,7 @@ async def websocket_endpoint(websocket: WebSocket, user_id: str):
     manager.disconnect(websocket, user_id)
 
 # API endpoints for checking WebSocket status
-@router.get("/status", response_model=AllConnectionsStatus, responses={
+@router.get("/ws/status", response_model=AllConnectionsStatus, responses={
     200: {
         "description": "Successful response",
         "content": {
@@ -151,7 +328,7 @@ async def get_all_websocket_connections():
     """
     return manager.get_connection_status()
 
-@router.get("/status/{user_id}", response_model=ConnectionStatus, responses={
+@router.get("/ws/status/{user_id}", response_model=ConnectionStatus, responses={
    200: {
         "description": "Successful response for active connection",
         "content": {
@@ -245,11 +422,12 @@ async def send_pdf_delete_started(user_id: str, namespace: str):
         "timestamp": int(time.time())
     }, user_id)
 
-async def send_pdf_delete_completed(user_id: str, namespace: str):
+async def send_pdf_delete_completed(user_id: str, namespace: str, deleted_count: int = 0):
     """Send notification that PDF deletion completed"""
     await manager.send_message({
         "type": "pdf_delete_completed",
         "namespace": namespace,
+        "deleted_count": deleted_count,
         "timestamp": int(time.time())
     }, user_id)
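A minimal listener sketch for the `/ws/pdf/{user_id}` endpoint above, assuming the server is reachable at `ws://localhost:8000` and using the third-party `websockets` package (not part of this repository):

```python
import asyncio
import json
import websockets  # pip install websockets

async def listen(user_id: str):
    # Connect to the progress WebSocket added in this commit
    async with websockets.connect(f"ws://localhost:8000/ws/pdf/{user_id}") as ws:
        # The endpoint sends a connection_established message first, then
        # progress/echo messages; "heartbeat" texts are not echoed back
        while True:
            message = json.loads(await ws.recv())
            print(message.get("type"), message)

asyncio.run(listen("test-user"))
```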
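The debug route can be exercised with a plain GET while a WebSocket client for the same user is connected (same `http://localhost:8000` assumption as above):

```python
import requests

# Returns {"success": true, ...} only if the user has an active connection
response = requests.get('http://localhost:8000/ws/test/test-user')
print(response.json())
```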
app/api/postgresql_routes.py
CHANGED
@@ -4,8 +4,10 @@ import traceback
 from datetime import datetime, timedelta, timezone
 import time
 from functools import lru_cache
+from pathlib import Path as pathlib_Path  # Import Path from pathlib with a different name
 
-from fastapi import APIRouter, HTTPException, Depends, Query,
+from fastapi import APIRouter, HTTPException, Depends, Query, Body, Response, File, UploadFile, Form, BackgroundTasks
+from fastapi.params import Path  # Import Path explicitly from fastapi.params instead
 from sqlalchemy.orm import Session
 from sqlalchemy.exc import SQLAlchemyError
 from typing import List, Optional, Dict, Any
@@ -16,9 +18,10 @@ from sqlalchemy import text, inspect, func
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy import desc, func
 from cachetools import TTLCache
+import uuid
 
 from app.database.postgresql import get_db
-from app.database.models import FAQItem, EmergencyItem, EventItem, AboutPixity, SolanaSummit, DaNangBucketList, ApiKey, VectorDatabase, Document, VectorStatus, TelegramBot, ChatEngine, BotEngine, EngineVectorDb
+from app.database.models import FAQItem, EmergencyItem, EventItem, AboutPixity, SolanaSummit, DaNangBucketList, ApiKey, VectorDatabase, Document, VectorStatus, TelegramBot, ChatEngine, BotEngine, EngineVectorDb, DocumentContent
 from pydantic import BaseModel, Field, ConfigDict
 
 # Configure logging
@@ -1713,7 +1716,8 @@ async def update_solana_summit(
 
 # --- API Key models and endpoints ---
 class ApiKeyBase(BaseModel):
-
+    key_type: str
+    key_value: str
     description: Optional[str] = None
     is_active: bool = True
@@ -1721,13 +1725,13 @@ class ApiKeyCreate(ApiKeyBase):
     pass
 
 class ApiKeyUpdate(BaseModel):
-
+    key_type: Optional[str] = None
+    key_value: Optional[str] = None
     description: Optional[str] = None
     is_active: Optional[bool] = None
 
 class ApiKeyResponse(ApiKeyBase):
     id: int
-    key: str
     created_at: datetime
     last_used: Optional[datetime] = None
 
@@ -1772,23 +1776,10 @@ async def create_api_key(
     Create a new API key.
     """
     try:
-        # Generate a secure API key
-        import secrets
-        import string
-        import time
-
-        # Create a random key with a prefix for easier identification
-        prefix = "px_"
-        random_key = ''.join(secrets.choice(string.ascii_letters + string.digits) for _ in range(32))
-        timestamp = hex(int(time.time()))[2:]
-
-        # Combine parts for the final key
-        key_value = f"{prefix}{timestamp}_{random_key}"
-
         # Create API key object
         db_api_key = ApiKey(
-
+            key_type=api_key.key_type,
+            key_value=api_key.key_value,
             description=api_key.description,
             is_active=api_key.is_active
         )
@@ -1844,8 +1835,10 @@ async def update_api_key(
             raise HTTPException(status_code=404, detail=f"API key with ID {api_key_id} not found")
 
         # Update fields if provided
-        if api_key_update.
-            db_api_key.
+        if api_key_update.key_type is not None:
+            db_api_key.key_type = api_key_update.key_type
+        if api_key_update.key_value is not None:
+            db_api_key.key_value = api_key_update.key_value
         if api_key_update.description is not None:
             db_api_key.description = api_key_update.description
         if api_key_update.is_active is not None:
@@ -1905,17 +1898,17 @@ async def validate_api_key(
     Validate an API key and update its last_used timestamp.
     """
     try:
-        db_api_key = db.query(ApiKey).filter(ApiKey.
+        db_api_key = db.query(ApiKey).filter(ApiKey.key_value == key, ApiKey.is_active == True).first()
         if not db_api_key:
             return {"valid": False, "message": "Invalid or inactive API key"}
-
-        # Update
-        db_api_key.last_used = datetime.
+
+        # Update last used timestamp
+        db_api_key.last_used = datetime.now()
         db.commit()
 
         return {
             "valid": True,
-            "
+            "key_type": db_api_key.key_type,
             "id": db_api_key.id,
             "message": "API key is valid"
         }
@@ -1929,23 +1922,30 @@ class VectorDatabaseBase(BaseModel):
     name: str
     description: Optional[str] = None
     pinecone_index: str
-
+    api_key_id: Optional[int] = None  # Make api_key_id optional to handle NULL values
     status: str = "active"
 
 class VectorDatabaseCreate(VectorDatabaseBase):
+    api_key_id: int  # Keep this required for new databases
     pass
 
 class VectorDatabaseUpdate(BaseModel):
     name: Optional[str] = None
     description: Optional[str] = None
     pinecone_index: Optional[str] = None
-
+    api_key_id: Optional[int] = None
     status: Optional[str] = None
 
-class VectorDatabaseResponse(
+class VectorDatabaseResponse(BaseModel):
+    name: str
+    description: Optional[str] = None
+    pinecone_index: str
+    api_key_id: Optional[int] = None  # Make api_key_id optional to handle NULL values
+    status: str
     id: int
     created_at: datetime
     updated_at: datetime
+    message: Optional[str] = None  # Add message field for notifications
 
     model_config = ConfigDict(from_attributes=True)
 
@@ -1960,6 +1960,7 @@ class VectorDatabaseDetailResponse(BaseModel):
     document_count: int
     embedded_count: int
     pending_count: int
+    message: Optional[str] = None  # Add message field for notifications
 
     model_config = ConfigDict(from_attributes=True)
 
@@ -1999,7 +2000,7 @@ async def create_vector_database(
     db: Session = Depends(get_db)
 ):
     """
-    Create a new vector database.
+    Create a new vector database. If the specified Pinecone index doesn't exist, it will be created automatically.
     """
     try:
         # Check if a database with the same name already exists
@@ -2007,6 +2008,71 @@ async def create_vector_database(
         if existing_db:
             raise HTTPException(status_code=400, detail=f"Vector database with name '{vector_db.name}' already exists")
 
+        # Check if the API key exists
+        api_key = db.query(ApiKey).filter(ApiKey.id == vector_db.api_key_id).first()
+        if not api_key:
+            raise HTTPException(status_code=400, detail=f"API key with ID {vector_db.api_key_id} not found")
+
+        # Initialize Pinecone client with the API key
+        from pinecone import Pinecone, ServerlessSpec
+        pc_client = Pinecone(api_key=api_key.key_value)
+
+        # Check if the index exists
+        index_list = pc_client.list_indexes()
+        index_names = index_list.names() if hasattr(index_list, 'names') else []
+
+        index_exists = vector_db.pinecone_index in index_names
+        index_created = False
+
+        if not index_exists:
+            # Index doesn't exist - try to create it
+            try:
+                logger.info(f"Pinecone index '{vector_db.pinecone_index}' does not exist. Attempting to create it automatically.")
+
+                # Create the index with standard parameters
+                pc_client.create_index(
+                    name=vector_db.pinecone_index,
+                    dimension=1536,  # Standard OpenAI embedding dimension
+                    metric="cosine",  # Most common similarity metric
+                    spec=ServerlessSpec(
+                        cloud="aws",
+                        region="us-east-1"  # Use a standard region that works with the free tier
+                    )
+                )
+
+                logger.info(f"Successfully created Pinecone index '{vector_db.pinecone_index}'")
+                index_created = True
+
+                # Allow some time for the index to initialize
+                import time
+                time.sleep(5)
+
+            except Exception as create_error:
+                logger.error(f"Failed to create Pinecone index '{vector_db.pinecone_index}': {create_error}")
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Failed to create Pinecone index '{vector_db.pinecone_index}': {str(create_error)}"
+                )
+
+        # Verify we can connect to the index (whether existing or newly created)
+        try:
+            index = pc_client.Index(vector_db.pinecone_index)
+            # Try to get stats to verify connection
+            stats = index.describe_index_stats()
+
+            # Create success message based on whether we created the index or used an existing one
+            if index_created:
+                success_message = f"Successfully created and connected to new Pinecone index '{vector_db.pinecone_index}'"
+            else:
+                success_message = f"Successfully connected to existing Pinecone index '{vector_db.pinecone_index}'"
+
+            logger.info(f"{success_message}: {stats}")
+
+        except Exception as e:
+            error_message = f"Error connecting to Pinecone index '{vector_db.pinecone_index}': {str(e)}"
+            logger.error(error_message)
+            raise HTTPException(status_code=400, detail=error_message)
+
         # Create new vector database
         db_vector_db = VectorDatabase(**vector_db.model_dump())
 
@@ -2014,7 +2080,16 @@ async def create_vector_database(
         db.commit()
         db.refresh(db_vector_db)
 
-
+        # Return response with additional info about index creation
+        response_data = VectorDatabaseResponse.model_validate(db_vector_db, from_attributes=True).model_dump()
+
+        # Add a message to the response indicating whether the index was created or existed
+        if index_created:
+            response_data["message"] = f"Created new Pinecone index '{vector_db.pinecone_index}' automatically"
+        else:
+            response_data["message"] = f"Using existing Pinecone index '{vector_db.pinecone_index}'"
+
+        return VectorDatabaseResponse.model_validate(response_data)
     except HTTPException:
         raise
     except SQLAlchemyError as e:
@@ -2068,6 +2143,12 @@ async def update_vector_database(
         if existing_db:
             raise HTTPException(status_code=400, detail=f"Vector database with name '{vector_db_update.name}' already exists")
 
+        # Check if API key exists if updating API key ID
+        if vector_db_update.api_key_id:
+            api_key = db.query(ApiKey).filter(ApiKey.id == vector_db_update.api_key_id).first()
+            if not api_key:
+                raise HTTPException(status_code=400, detail=f"API key with ID {vector_db_update.api_key_id} not found")
+
         # Update fields if provided
         update_data = vector_db_update.model_dump(exclude_unset=True)
         for key, value in update_data.items():
@@ -2149,6 +2230,7 @@ async def get_vector_database_info(
 ):
     """
     Get detailed information about a vector database including document counts.
+    Also verifies connectivity to the Pinecone index.
     """
     try:
         # Get the vector database
@@ -2173,6 +2255,40 @@ async def get_vector_database_info(
             Document.is_embedded == False
         ).scalar()
 
+        # Verify Pinecone index connectivity if API key is available
+        message = None
+        if vector_db.api_key_id:
+            try:
+                # Get the API key
+                api_key = db.query(ApiKey).filter(ApiKey.id == vector_db.api_key_id).first()
+                if api_key:
+                    # Initialize Pinecone client with the API key
+                    from pinecone import Pinecone
+                    pc_client = Pinecone(api_key=api_key.key_value)
+
+                    # Check if the index exists
+                    index_list = pc_client.list_indexes()
+                    index_names = index_list.names() if hasattr(index_list, 'names') else []
+
+                    if vector_db.pinecone_index in index_names:
+                        # Try to connect to the index
+                        index = pc_client.Index(vector_db.pinecone_index)
+                        stats = index.describe_index_stats()
+                        message = f"Pinecone index '{vector_db.pinecone_index}' is operational with {stats.get('total_vector_count', 0)} vectors"
+                        logger.info(f"Successfully connected to Pinecone index '{vector_db.pinecone_index}': {stats}")
+                    else:
+                        message = f"Pinecone index '{vector_db.pinecone_index}' does not exist. Available indexes: {', '.join(index_names)}"
+                        logger.warning(message)
+                else:
+                    message = f"API key with ID {vector_db.api_key_id} not found"
+                    logger.warning(message)
+            except Exception as e:
+                message = f"Error connecting to Pinecone: {str(e)}"
+                logger.error(message)
+        else:
+            message = "No API key associated with this vector database"
+            logger.warning(message)
+
         # Create response with added counts
         result = VectorDatabaseDetailResponse(
             id=vector_db.id,
@@ -2184,7 +2300,8 @@ async def get_vector_database_info(
             updated_at=vector_db.updated_at,
             document_count=total_docs or 0,
             embedded_count=embedded_docs or 0,
-            pending_count=pending_docs or 0
+            pending_count=pending_docs or 0,
+            message=message
         )
 
         return result
@@ -2198,29 +2315,26 @@ async def get_vector_database_info(
 # --- Document models and endpoints ---
 class DocumentBase(BaseModel):
     name: str
-    content_type: str
     vector_database_id: int
-    file_metadata: Optional[Dict[str, Any]] = None
 
-class DocumentCreate(
-
+class DocumentCreate(BaseModel):
+    name: str
+    vector_database_id: int
 
 class DocumentUpdate(BaseModel):
     name: Optional[str] = None
-    file_metadata: Optional[Dict[str, Any]] = None
 
 class DocumentResponse(BaseModel):
     id: int
     name: str
     file_type: str
+    content_type: Optional[str] = None
     size: int
-    content_type: str
     created_at: datetime
     updated_at: datetime
     vector_database_id: int
     vector_database_name: Optional[str] = None
     is_embedded: bool
-    file_metadata: Optional[Dict[str, Any]] = None
 
     model_config = ConfigDict(from_attributes=True)
 
@@ -2261,15 +2375,32 @@ async def get_documents(
         # Add vector database name
         result = []
         for doc in documents:
-
+            # Create a dictionary from the document for easier manipulation
+            doc_dict = {
+                "id": doc.id,
+                "name": doc.name,
+                "file_type": doc.file_type,
+                "content_type": doc.content_type,
+                "size": doc.size,
+                "created_at": doc.created_at,
+                "updated_at": doc.updated_at,
+                "vector_database_id": doc.vector_database_id or 0,  # Handle NULL values
+                "is_embedded": doc.is_embedded
+            }
 
             # Get vector database name if not already populated
-
+            vector_db_name = None
+            if doc.vector_database_id is not None:
                 vector_db = db.query(VectorDatabase).filter(VectorDatabase.id == doc.vector_database_id).first()
                 vector_db_name = vector_db.name if vector_db else f"db_{doc.vector_database_id}"
-
+            else:
+                vector_db_name = "No Database"
+
+            doc_dict["vector_database_name"] = vector_db_name
 
-
+            # Create Pydantic model from dictionary
+            doc_response = DocumentResponse(**doc_dict)
+            result.append(doc_response)
 
         return result
     except SQLAlchemyError as e:
@@ -2309,141 +2440,41 @@ async def get_document(
         logger.error(traceback.format_exc())
         raise HTTPException(status_code=500, detail=f"Error retrieving document: {str(e)}")
 
-@router.put("/documents/{document_id}", response_model=DocumentResponse)
-async def update_document(
-    document_id: int = Path(..., gt=0),
-    document_update: DocumentUpdate = Body(...),
-    db: Session = Depends(get_db)
-):
-    """
-    Update document details.
-    """
-    try:
-        document = db.query(Document).filter(Document.id == document_id).first()
-        if not document:
-            raise HTTPException(status_code=404, detail=f"Document with ID {document_id} not found")
-
-        # Update fields if provided
-        if document_update.name is not None:
-            document.name = document_update.name
-
-        if document_update.file_metadata is not None:
-            # Merge with existing metadata if it exists
-            if document.file_metadata:
-                document.file_metadata.update(document_update.file_metadata)
-            else:
-                document.file_metadata = document_update.file_metadata
-
-        db.commit()
-        db.refresh(document)
-
-        # Get vector database name
-        vector_db = db.query(VectorDatabase).filter(VectorDatabase.id == document.vector_database_id).first()
-        vector_db_name = vector_db.name if vector_db else f"db_{document.vector_database_id}"
-
-        # Create response with vector database name
-        result = DocumentResponse.model_validate(document, from_attributes=True)
-        result.vector_database_name = vector_db_name
-
-        return result
-    except HTTPException:
-        raise
-    except SQLAlchemyError as e:
-        db.rollback()
-        logger.error(f"Database error updating document: {e}")
-        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
-    except Exception as e:
-        db.rollback()
-        logger.error(f"Error updating document: {e}")
-        logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=f"Error updating document: {str(e)}")
-
-@router.delete("/documents/{document_id}", response_model=dict)
-async def delete_document(
+@router.get("/documents/{document_id}/content", response_class=Response)
+async def get_document_content(
     document_id: int = Path(..., gt=0),
     db: Session = Depends(get_db)
 ):
     """
-
+    Get document content (file) by document ID.
+    Returns the binary content with the appropriate Content-Type header.
     """
     try:
+        # Get document to check if it exists and get metadata
         document = db.query(Document).filter(Document.id == document_id).first()
         if not document:
             raise HTTPException(status_code=404, detail=f"Document with ID {document_id} not found")
 
-        #
-        db.query(
-
-        db.delete(document)
-        db.commit()
-
-        return {"message": f"Document with ID {document_id} deleted successfully"}
-    except HTTPException:
-        raise
-    except SQLAlchemyError as e:
-        db.rollback()
-        logger.error(f"Database error deleting document: {e}")
-        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
-    except Exception as e:
-        db.rollback()
-        logger.error(f"Error deleting document: {e}")
-        logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=f"Error deleting document: {str(e)}")
-
-@router.get("/vector-databases/{vector_db_id}/documents", response_model=List[DocumentResponse])
-async def get_documents_by_vector_db(
-    vector_db_id: int = Path(..., gt=0),
-    skip: int = 0,
-    limit: int = 100,
-    is_embedded: Optional[bool] = None,
-    file_type: Optional[str] = None,
-    db: Session = Depends(get_db)
-):
-    """
-    Get all documents for a specific vector database.
-
-    - **skip**: Number of items to skip
-    - **limit**: Maximum number of items to return
-    - **is_embedded**: Filter by embedding status
-    - **file_type**: Filter by file type
-    """
-    try:
-        # Verify vector database exists
-        vector_db = db.query(VectorDatabase).filter(VectorDatabase.id == vector_db_id).first()
-        if not vector_db:
-            raise HTTPException(status_code=404, detail=f"Vector database with ID {vector_db_id} not found")
-
-        # Build query
-        query = db.query(Document).filter(Document.vector_database_id == vector_db_id)
-
-        #
-        if is_embedded is not None:
-            query = query.filter(Document.is_embedded == is_embedded)
-
-        if file_type is not None:
-            query = query.filter(Document.file_type == file_type)
-
-        # Execute query with pagination
-        documents = query.offset(skip).limit(limit).all()
-
-        #
-
-        return result
+        # Get document content from document_content table
+        document_content = db.query(DocumentContent).filter(DocumentContent.document_id == document_id).first()
+        if not document_content or not document_content.file_content:
+            raise HTTPException(status_code=404, detail=f"Content for document with ID {document_id} not found")
+
+        # Determine content type
+        content_type = document.content_type if hasattr(document, 'content_type') and document.content_type else "application/octet-stream"
+
+        # Return binary content with correct content type
+        return Response(
+            content=document_content.file_content,
+            media_type=content_type,
+            headers={"Content-Disposition": f"attachment; filename=\"{document.name}\""}
+        )
     except HTTPException:
         raise
-    except SQLAlchemyError as e:
-        logger.error(f"Database error retrieving documents: {e}")
-        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
     except Exception as e:
-        logger.error(f"Error retrieving documents: {e}")
+        logger.error(f"Error retrieving document content: {e}")
         logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=f"Error retrieving documents: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error retrieving document content: {str(e)}")
 
 # --- Telegram Bot models and endpoints ---
 class TelegramBotBase(BaseModel):
@@ -2468,73 +2499,6 @@ class TelegramBotResponse(TelegramBotBase):
 
     model_config = ConfigDict(from_attributes=True)
 
-@router.get("/telegram-bots", response_model=List[TelegramBotResponse])
-async def get_telegram_bots(
-    skip: int = 0,
-    limit: int = 100,
-    status: Optional[str] = None,
-    db: Session = Depends(get_db)
-):
-    """
-    Get all Telegram bots.
-
-    - **skip**: Number of items to skip
-    - **limit**: Maximum number of items to return
-    - **status**: Filter by status (e.g., 'active', 'inactive')
-    """
-    try:
-        query = db.query(TelegramBot)
-
-        if status:
-            query = query.filter(TelegramBot.status == status)
-
-        bots = query.offset(skip).limit(limit).all()
-        return [TelegramBotResponse.model_validate(bot, from_attributes=True) for bot in bots]
-    except SQLAlchemyError as e:
-        logger.error(f"Database error retrieving Telegram bots: {e}")
-        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
-    except Exception as e:
-        logger.error(f"Error retrieving Telegram bots: {e}")
-        logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=f"Error retrieving Telegram bots: {str(e)}")
-
-@router.post("/telegram-bots", response_model=TelegramBotResponse)
-async def create_telegram_bot(
-    bot: TelegramBotCreate,
-    db: Session = Depends(get_db)
-):
-    """
-    Create a new Telegram bot.
-    """
-    try:
-        # Check if bot with this username already exists
-        existing_bot = db.query(TelegramBot).filter(TelegramBot.username == bot.username).first()
-        if existing_bot:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Telegram bot with username '{bot.username}' already exists"
-            )
-
-        # Create new bot
-        db_bot = TelegramBot(**bot.model_dump())
-
-        db.add(db_bot)
-        db.commit()
-        db.refresh(db_bot)
-
-        return TelegramBotResponse.model_validate(db_bot, from_attributes=True)
-    except HTTPException:
-        raise
-    except SQLAlchemyError as e:
-        db.rollback()
-        logger.error(f"Database error creating Telegram bot: {e}")
-        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
-    except Exception as e:
-        db.rollback()
-        logger.error(f"Error creating Telegram bot: {e}")
-        logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=f"Error creating Telegram bot: {str(e)}")
-
 @router.get("/telegram-bots/{bot_id}", response_model=TelegramBotResponse)
 async def get_telegram_bot(
     bot_id: int = Path(..., gt=0),
@@ -3543,4 +3507,301 @@ async def batch_delete_emergency_contacts(
         db.rollback()
         logger.error(f"Database error in batch_delete_emergency_contacts: {e}")
         logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
+
+@router.post("/documents", response_model=DocumentResponse)
+async def upload_document(
+    name: str = Form(...),
+    vector_database_id: int = Form(...),
+    file: UploadFile = File(...),
+    db: Session = Depends(get_db)
+):
+    """
+    Upload a new document and associate it with a vector database.
+
+    - **name**: Document name
+    - **vector_database_id**: ID of the vector database to associate with
+    - **file**: The file to upload
+    """
+    try:
+        # Check if vector database exists
+        vector_db = db.query(VectorDatabase).filter(VectorDatabase.id == vector_database_id).first()
+        if not vector_db:
+            raise HTTPException(status_code=404, detail=f"Vector database with ID {vector_database_id} not found")
+
+        # Read file content
+        file_content = await file.read()
+        file_size = len(file_content)
+
+        # Determine file type from extension
+        filename = file.filename
+        file_extension = pathlib_Path(filename).suffix.lower()[1:] if filename else ""
+
+        # Create document record
+        document = Document(
+            name=name,
+            vector_database_id=vector_database_id,
+            file_type=file_extension,
+            content_type=file.content_type,
+            size=file_size,
+            is_embedded=False
+        )
+
+        db.add(document)
+        db.flush()  # Get ID without committing
+
+        # Create document content record
+        document_content = DocumentContent(
+            document_id=document.id,
+            file_content=file_content
+        )
+
+        db.add(document_content)
+        db.commit()
+        db.refresh(document)
+
+        # Create vector status record for tracking embedding
+        vector_status = VectorStatus(
+            document_id=document.id,
+            vector_database_id=vector_database_id,
+            status="pending"
+        )
+
+        db.add(vector_status)
+        db.commit()
+
+        # Get vector database name for response
+        vector_db_name = vector_db.name if vector_db else f"db_{vector_database_id}"
+
+        # Create response
+        result = DocumentResponse(
+            id=document.id,
+            name=document.name,
+            file_type=document.file_type,
+            content_type=document.content_type,
+            size=document.size,
+            created_at=document.created_at,
+            updated_at=document.updated_at,
+            vector_database_id=document.vector_database_id,
+            vector_database_name=vector_db_name,
+            is_embedded=document.is_embedded
+        )
+
+        return result
+    except HTTPException:
+        raise
+    except SQLAlchemyError as e:
+        db.rollback()
+        logger.error(f"Database error uploading document: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
+    except Exception as e:
+        db.rollback()
+        logger.error(f"Error uploading document: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Error uploading document: {str(e)}")
+
+@router.put("/documents/{document_id}", response_model=DocumentResponse)
+async def update_document(
+    document_id: int,
+    name: Optional[str] = Form(None),
+    file: Optional[UploadFile] = File(None),
+    background_tasks: BackgroundTasks = None,
+    db: Session = Depends(get_db)
+):
+    """
+    Update an existing document. Can update name, file content, or both.
+
+    - **document_id**: ID of the document to update
+    - **name**: New document name (optional)
+    - **file**: New file content (optional)
+    """
+    try:
+        # Validate document_id
+        if document_id <= 0:
+            raise HTTPException(status_code=400, detail="document_id must be greater than 0")
+
+        # Check if document exists
+        document = db.query(Document).filter(Document.id == document_id).first()
+        if not document:
+            raise HTTPException(status_code=404, detail=f"Document with ID {document_id} not found")
+
+        # Get vector database information for later use
+        vector_db = None
+        if document.vector_database_id:
+            vector_db = db.query(VectorDatabase).filter(VectorDatabase.id == document.vector_database_id).first()
+
+        # Update name if provided
+        if name:
+            document.name = name
+
+        # Update file if provided
+        if file:
+            # Read new file content
+            file_content = await file.read()
+            file_size = len(file_content)
+
+            # Determine file type from extension
+            filename = file.filename
+            file_extension = pathlib_Path(filename).suffix.lower()[1:] if filename else ""
+
+            # Update document record
+            document.file_type = file_extension
+            document.content_type = file.content_type
+            document.size = file_size
+            document.is_embedded = False  # Reset embedding status
+            document.updated_at = datetime.now()
+
+            # Update document content
+            document_content = db.query(DocumentContent).filter(DocumentContent.document_id == document_id).first()
+            if document_content:
+                document_content.file_content = file_content
+            else:
+                # Create new document content if it doesn't exist
+                document_content = DocumentContent(
+                    document_id=document_id,
+                    file_content=file_content
+                )
+                db.add(document_content)
+
+            # Get vector status for Pinecone cleanup
+            vector_status = db.query(VectorStatus).filter(VectorStatus.document_id == document_id).first()
+
+            # Store old vector_id for cleanup
+            old_vector_id = None
+            if vector_status and vector_status.vector_id:
+                old_vector_id = vector_status.vector_id
+
+            # Update vector status to pending
+            if vector_status:
+                vector_status.status = "pending"
+                vector_status.vector_id = None
+                vector_status.embedded_at = None
+                vector_status.error_message = None
+            else:
+                # Create new vector status if it doesn't exist
+                vector_status = VectorStatus(
+                    document_id=document_id,
+                    vector_database_id=document.vector_database_id,
+                    status="pending"
+                )
+                db.add(vector_status)
+
+            # Schedule deletion of old vectors in Pinecone if we have all needed info
+            if old_vector_id and vector_db and document.vector_database_id and background_tasks:
+                try:
+                    # Initialize PDFProcessor for vector deletion
+                    from app.pdf.processor import PDFProcessor
+
+                    processor = PDFProcessor(
+                        index_name=vector_db.pinecone_index,
+                        namespace=f"vdb-{document.vector_database_id}",
+                        vector_db_id=document.vector_database_id
+                    )
+
+                    # Add deletion task to background tasks
+                    background_tasks.add_task(
+                        processor.delete_document_vectors,
+                        old_vector_id
+                    )
+
+                    logger.info(f"Scheduled deletion of old vectors for document {document_id}")
+                except Exception as e:
+                    logger.error(f"Error scheduling vector deletion: {str(e)}")
+                    # Continue with the update even if vector deletion scheduling fails
+
+            # Schedule document for re-embedding if possible
+            if background_tasks and document.vector_database_id:
+                try:
+                    # Import here to avoid circular imports
+                    from app.pdf.tasks import process_document_for_embedding
+
+                    # Schedule embedding
+                    background_tasks.add_task(
+                        process_document_for_embedding,
+                        document_id=document_id,
+                        vector_db_id=document.vector_database_id
+                    )
+
+                    logger.info(f"Scheduled re-embedding for document {document_id}")
+                except Exception as e:
+                    logger.error(f"Error scheduling document embedding: {str(e)}")
+                    # Continue with the update even if embedding scheduling fails
+
+        db.commit()
+        db.refresh(document)
+
+        # Get vector database name for response
+        vector_db_name = "No Database"
+        if vector_db:
+            vector_db_name = vector_db.name
+        elif document.vector_database_id:
+            vector_db_name = f"db_{document.vector_database_id}"
+
+        # Create response
+        result = DocumentResponse(
+            id=document.id,
+            name=document.name,
+            file_type=document.file_type,
+            content_type=document.content_type,
+            size=document.size,
+            created_at=document.created_at,
+            updated_at=document.updated_at,
+            vector_database_id=document.vector_database_id or 0,
+            vector_database_name=vector_db_name,
+            is_embedded=document.is_embedded
+        )
+
+        return result
+    except HTTPException:
+        raise
+    except SQLAlchemyError as e:
+        db.rollback()
+        logger.error(f"Database error updating document: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
+    except Exception as e:
+        db.rollback()
+        logger.error(f"Error updating document: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Error updating document: {str(e)}")
+
+@router.delete("/documents/{document_id}", response_model=dict)
+async def delete_document(
+    document_id: int = Path(..., gt=0),
+    db: Session = Depends(get_db)
+):
+    """
+    Delete a document and its associated content.
+
+    - **document_id**: ID of the document to delete
+    """
+    try:
+        # Check if document exists
+        document = db.query(Document).filter(Document.id == document_id).first()
+        if not document:
+            raise HTTPException(status_code=404, detail=f"Document with ID {document_id} not found")
+
+        # Delete vector status
+        db.query(VectorStatus).filter(VectorStatus.document_id == document_id).delete()
+
+        # Delete document content
+        db.query(DocumentContent).filter(DocumentContent.document_id == document_id).delete()
+
+        # Delete document
+        db.delete(document)
+        db.commit()
+
+        return {"status": "success", "message": f"Document with ID {document_id} deleted successfully"}
+    except HTTPException:
+        raise
+    except SQLAlchemyError as e:
+        db.rollback()
+        logger.error(f"Database error deleting document: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
+    except Exception as e:
+        db.rollback()
+        logger.error(f"Error deleting document: {e}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=f"Error deleting document: {str(e)}")
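A small download sketch for the new `/documents/{document_id}/content` endpoint, assuming the PostgreSQL router is mounted at the application root and document ID 1 is a placeholder:

```python
import requests

document_id = 1  # placeholder ID
response = requests.get(f'http://localhost:8000/documents/{document_id}/content')
response.raise_for_status()

# The server sets Content-Disposition with the document name; save the bytes locally
with open('downloaded_document', 'wb') as f:
    f.write(response.content)
print(response.headers.get('Content-Type'))
```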
|
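For reference, exercising the new delete endpoint from a client could look like the sketch below (the base URL and router prefix are assumptions; adjust them to how the PostgreSQL routes are mounted in your deployment):

```python
import requests

# Hypothetical base URL - adjust to your deployment and router prefix
BASE_URL = "http://localhost:8000"

# Delete document 42 along with its content and vector-status rows
response = requests.delete(f"{BASE_URL}/documents/42")
print(response.status_code)  # 200 on success, 404 if the document does not exist
print(response.json())       # {"status": "success", "message": "Document with ID 42 deleted successfully"}
```

Note that this endpoint removes the PostgreSQL rows only; Pinecone vectors are cleaned up separately (the update path above schedules `delete_document_vectors` as a background task).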
app/api/rag_routes.py
CHANGED
@@ -48,17 +48,17 @@ router = APIRouter(
 
 fix_request = PromptTemplate(
     template = """Goal:
+Your task is to extract important keywords from the user's current request, optionally using chat history if relevant.
+You will receive a conversation history and the user's current message.
+Generate a **list of concise keywords** that best represent the user's intent.
 
 Return Format:
+Only return keywords (comma-separated, no extra explanation).
+If the current message is NOT related to the chat history or if there is no chat history: Return keywords from the current message only.
+If the current message IS related to the chat history: Return a refined set of keywords based on both history and current message.
 
 Warning:
+Only use chat history if the current message is clearly related to the prior context.
 
 Conversation History:
 {chat_history}
@@ -66,15 +66,24 @@ Conversation History:
 
 User current message:
 {question}
 """,
+    input_variables=["chat_history", "question"],
 )
 
 # Create a prompt template with conversation history
 prompt = PromptTemplate(
     template = """Goal:
-You are a professional tour guide assistant that assists users in finding information about places in Da Nang, Vietnam.
+You are Pixity - a professional tour guide assistant that assists users in finding information about places in Da Nang, Vietnam.
 You can provide details on restaurants, cafes, hotels, attractions, and other local venues.
 You have to use core knowledge and conversation history to chat with users, who are Da Nang's tourists.
+Pixity’s Core Personality: Friendly & Warm: Chats like a trustworthy friend who listens and is always ready to help.
+Naturally Cute: Shows cuteness through word choice, soft emojis, and gentle care for the user.
+Playful – a little bit cheeky in a lovable way: Occasionally cracks jokes, uses light memes or throws in a surprise response that makes users smile. Think Duolingo-style humor, but less threatening.
+Smart & Proactive: Friendly, but also delivers quick, accurate info. Knows how to guide users to the right place – at the right time – with the right solution.
+Tone & Voice: Friendly – Youthful – Snappy. Uses simple words, similar to daily chat language (e.g., “Let’s find it together!” / “Need a tip?” / “Here’s something cool”). Avoids sounding robotic or overly scripted. Can joke lightly in smart ways, making Pixity feel like a travel buddy who knows how to lift the mood.
+SAMPLE DIALOGUES
+When a user opens the chatbot for the first time:
+User: Hello?
+Pixity: Hi hi 👋 I’ve been waiting for you! Ready to explore Da Nang together? I’ve got tips, tricks, and a tiny bit of magic 🎒✨
 
 Return Format:
 Respond in friendly, natural, concise and use only English like a real tour guide.
@@ -251,7 +260,7 @@ async def chat(request: ChatRequest, background_tasks: BackgroundTasks):
     # Generate the prompt using template
     prompt_text = prompt.format(
         context=context,
+        question=request.question,
         chat_history=chat_history
     )
     logger.info(f"Full prompt with history and context: {prompt_text}")
app/api/websocket_routes.py
CHANGED
@@ -92,7 +92,7 @@ def get_full_websocket_url(server_side=False):
     3. When there are new sessions requiring attention, you will receive notifications through this connection
 
     Notifications are sent when:
+    - Session response starts with "I'm sorry"
     - The system cannot answer the user's question
 
     Make sure to send a "keepalive" message every 5 minutes to maintain the connection.
@@ -114,14 +114,14 @@ async def websocket_documentation():
         "full_url": ws_url,
         "description": "Endpoint to receive notifications about new sessions requiring attention",
         "notification_format": {
+            "type": "sorry_response",
            "timestamp": "YYYY-MM-DD HH:MM:SS",
            "data": {
                "session_id": "session id",
                "factor": "user",
                "action": "action type",
                "message": "User question",
+               "response": "I'm sorry...",
                "user_id": "user id",
                "first_name": "user's first name",
                "last_name": "user's last name",
@@ -168,7 +168,7 @@ async def websocket_documentation():
     data = json.loads(message)
     print(f"Received notification: {data}")
     # Process notification, e.g.: send to Telegram Admin
+    if data.get("type") == "sorry_response":
         session_data = data.get("data", {})
         user_question = session_data.get("message", "")
         user_name = session_data.get("first_name", "Unknown User")
@@ -230,18 +230,60 @@ async def websocket_endpoint(websocket: WebSocket):
     """
     await manager.connect(websocket)
     try:
+        # Keep track of last activity time to prevent connection timeouts
+        last_activity = datetime.now()
+
+        # Set up a background ping task
+        async def send_periodic_ping():
+            try:
+                while True:
+                    # Send ping every 20 seconds if no other activity
+                    await asyncio.sleep(20)
+                    current_time = datetime.now()
+                    time_since_activity = (current_time - last_activity).total_seconds()
+
+                    # Only send ping if there's been no activity for 15+ seconds
+                    if time_since_activity > 15:
+                        logger.debug("Sending ping to client to keep connection alive")
+                        await websocket.send_json({"type": "ping", "timestamp": current_time.isoformat()})
+            except asyncio.CancelledError:
+                # Task was cancelled, just exit quietly
+                pass
+            except Exception as e:
+                logger.error(f"Error in ping task: {e}")
+
+        # Start ping task
+        ping_task = asyncio.create_task(send_periodic_ping())
+
+        # Main message loop
         while True:
+            # Update last activity time
+            last_activity = datetime.now()
+
            # Maintain WebSocket connection
            data = await websocket.receive_text()
+
            # Echo back to keep connection active
-           await websocket.send_json({
+            await websocket.send_json({
+                "status": "connected",
+                "echo": data,
+                "timestamp": last_activity.isoformat()
+            })
            logger.info(f"Received message from WebSocket: {data}")
     except WebSocketDisconnect:
         logger.info("WebSocket client disconnected")
-        manager.disconnect(websocket)
     except Exception as e:
         logger.error(f"WebSocket error: {e}")
+    finally:
+        # Always clean up properly
         manager.disconnect(websocket)
+        # Cancel ping task if it's still running
+        try:
+            ping_task.cancel()
+            await ping_task
+        except (UnboundLocalError, asyncio.CancelledError):
+            # ping_task wasn't created or already cancelled
+            pass
 
 # Function to send notifications over WebSocket
 async def send_notification(data: dict):
@@ -249,7 +291,7 @@ async def send_notification(data: dict):
     Send notification to all active WebSocket connections.
 
     This function is used to notify admin bots about new issues or questions that need attention.
+    It's triggered when the system cannot answer a user's question (response starts with "I'm sorry").
 
     Args:
         data: The data to send as notification
@@ -260,33 +302,30 @@ async def send_notification(data: dict):
     logger.info(f"Notification data: session_id={data.get('session_id')}, user_id={data.get('user_id')}")
     logger.info(f"Response: {data.get('response', '')[:50]}...")
 
+    # Check if the response starts with "I'm sorry"
     response = data.get('response', '')
     if not response or not isinstance(response, str):
        logger.warning(f"Invalid response format in notification data: {response}")
        return
 
+    if not response.strip().lower().startswith("i'm sorry"):
+        logger.info(f"Response doesn't start with 'I'm sorry', notification not needed: {response[:50]}...")
        return
 
+    logger.info(f"Response starts with 'I'm sorry', sending notification")
 
+    # Format the notification data for admin - follow the Admin_bot format
     notification_data = {
+        "type": "sorry_response",  # Changed type to sorry_response to match Admin_bot
        "timestamp": get_local_time(),
+        "user_id": data.get('user_id', 'unknown'),
+        "message": data.get('message', ''),
+        "response": response,
+        "session_id": data.get('session_id', 'unknown'),
+        "user_info": {
            "first_name": data.get('first_name', 'User'),
            "last_name": data.get('last_name', ''),
-           "username": data.get('username', '')
-           "created_at": data.get('created_at', get_local_time()),
-           "action": data.get('action', 'unknown'),
-           "factor": "user" # Always show as user for better readability
+            "username": data.get('username', '')
        }
     }
 
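A minimal client sketch for the keepalive behavior above, assuming the `websockets` package and a locally served endpoint (both the URL and the payload handling are illustrative):

```python
import asyncio
import json
import websockets  # pip install websockets

async def listen():
    # Hypothetical URL - use the "full_url" reported by the documentation endpoint
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        while True:
            data = json.loads(await ws.recv())
            if data.get("type") == "ping":
                # Server-side keepalive sent after ~20s of inactivity; no reply needed
                continue
            if data.get("type") == "sorry_response":
                print("Needs admin attention:", data)

asyncio.run(listen())
```

With the server now pinging idle connections every 20 seconds, clients behind proxies with short idle timeouts no longer need to send their own keepalive traffic as aggressively.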
app/database/models.py
CHANGED
@@ -78,7 +78,7 @@ class VectorDatabase(Base):
     name = Column(String, nullable=False, unique=True)
     description = Column(String, nullable=True)
     pinecone_index = Column(String, nullable=False)
+    api_key_id = Column(Integer, ForeignKey("api_key.id"), nullable=True)
     status = Column(String, default="active")
     created_at = Column(DateTime, server_default=func.now())
     updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
@@ -87,18 +87,17 @@ class VectorDatabase(Base):
     documents = relationship("Document", back_populates="vector_database")
     vector_statuses = relationship("VectorStatus", back_populates="vector_database")
     engine_associations = relationship("EngineVectorDb", back_populates="vector_database")
+    api_key_ref = relationship("ApiKey", foreign_keys=[api_key_id])
 
 class Document(Base):
     __tablename__ = "document"
 
     id = Column(Integer, primary_key=True, index=True)
     name = Column(String, nullable=False)
-    file_content = Column(LargeBinary, nullable=True)
     file_type = Column(String, nullable=True)
-    size = Column(Integer, nullable=True)
     content_type = Column(String, nullable=True)
+    size = Column(Integer, nullable=True)
     is_embedded = Column(Boolean, default=False)
-    file_metadata = Column(JSON, nullable=True)
     vector_database_id = Column(Integer, ForeignKey("vector_database.id"), nullable=False)
     created_at = Column(DateTime, server_default=func.now())
     updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now())
@@ -106,6 +105,18 @@ class Document(Base):
     # Relationships
     vector_database = relationship("VectorDatabase", back_populates="documents")
     vector_statuses = relationship("VectorStatus", back_populates="document")
+    file_content_ref = relationship("DocumentContent", back_populates="document", uselist=False, cascade="all, delete-orphan")
+
+class DocumentContent(Base):
+    __tablename__ = "document_content"
+
+    id = Column(Integer, primary_key=True, index=True)
+    document_id = Column(Integer, ForeignKey("document.id"), nullable=False, unique=True)
+    file_content = Column(LargeBinary, nullable=True)
+    created_at = Column(DateTime, server_default=func.now())
+
+    # Relationships
+    document = relationship("Document", back_populates="file_content_ref")
 
 class VectorStatus(Base):
     __tablename__ = "vector_status"
@@ -184,9 +195,10 @@ class ApiKey(Base):
     __tablename__ = "api_key"
 
     id = Column(Integer, primary_key=True, index=True)
-    is_active = Column(Boolean, default=True)
+    key_type = Column(String, nullable=False)
+    key_value = Column(Text, nullable=False)
+    description = Column(Text, nullable=True)
     created_at = Column(DateTime, server_default=func.now())
-    last_used = Column(DateTime, nullable=True)
+    last_used = Column(DateTime, nullable=True)
+    expires_at = Column(DateTime, nullable=True)
+    is_active = Column(Boolean, default=True)
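Moving the `LargeBinary` payload into its own `document_content` table keeps `Document` queries light: listing documents no longer drags file bytes through the ORM. A minimal sketch of writing both rows, assuming the `SessionLocal` factory from `app/database/postgresql.py` (the field values are illustrative):

```python
from app.database.postgresql import SessionLocal
from app.database.models import Document, DocumentContent

db = SessionLocal()
try:
    doc = Document(name="guide.pdf", file_type="pdf",
                   content_type="application/pdf", size=1234,
                   vector_database_id=9)  # illustrative values
    db.add(doc)
    db.flush()  # assigns doc.id before the content row references it

    # File bytes live in the one-to-one document_content table
    db.add(DocumentContent(document_id=doc.id, file_content=b"%PDF-1.4 ..."))
    db.commit()
finally:
    db.close()
```

The `cascade="all, delete-orphan"` on `file_content_ref` means deleting a `Document` through the ORM also removes its content row.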
app/database/mongodb.py
CHANGED
@@ -142,13 +142,40 @@ def get_chat_history(user_id, n = 5) -> str:
     Bot: ...
     User: ...
     Bot: ...
+
+    Only includes history after the most recent /start or /clear command.
     """
     try:
+        # Find the most recent /start or /clear session
+        reset_session = session_collection.find_one(
+            {
+                "user_id": str(user_id),
+                "$or": [
+                    {"action": "start"},
+                    {"action": "clear"}
+                ]
+            },
+            sort=[("created_at_datetime", -1)]
+        )
+
+        # If a reset session exists, only take the sessions created after it
+        if reset_session:
+            reset_time = reset_session["created_at_datetime"]
+            # Fetch the sessions created after reset_time
+            docs = list(
+                session_collection.find({
+                    "user_id": str(user_id),
+                    "created_at_datetime": {"$gt": reset_time}
+                }).sort("created_at_datetime", 1)
+            )
+            logger.info(f"Fetched {len(docs)} sessions after the {reset_session['action']} command at {reset_time}")
+        else:
+            # No reset session found, take the n most recent sessions
+            docs = list(session_collection.find({"user_id": str(user_id)}).sort("created_at", -1).limit(n))
+            # Reverse to get oldest-to-newest order
+            docs.reverse()
+            logger.info(f"No reset session found, took the {len(docs)} most recent sessions")
+
        if not docs:
            logger.info(f"No data found for user_id: {user_id}")
            return ""
@@ -161,6 +188,10 @@ def get_chat_history(user_id, n = 5) -> str:
             message = doc.get("message", "")
             response = doc.get("response", "")
 
+            # Skip the start and clear commands themselves
+            if action in ["start", "clear"]:
+                continue
+
             if factor == "user" and action == "asking_freely":
                 conversation_lines.append(f"User: {message}")
                 conversation_lines.append(f"Bot: {response}")
@@ -174,13 +205,14 @@ def get_chat_history(user_id, n = 5) -> str:
 def get_request_history(user_id, n=3):
     """Get the most recent user requests to use as context for retrieval"""
     try:
+        # Query MongoDB directly
+        history = get_chat_history(user_id, n)
 
         # Just extract the questions for context
         requests = []
+        for line in history.split('\n'):
+            if line.startswith("User: "):
+                requests.append(line[6:])  # Take the content after "User: "
 
         # Join all recent requests into a single string for context
         return " ".join(requests)
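With the reset-aware lookup in place, `get_chat_history` returns a plain transcript that starts after the latest `/start` or `/clear`. An illustrative shape of the result (the session content here is made up):

```python
history = get_chat_history(user_id="12345", n=5)
print(history)
# User: Where can I find good banh mi near the Dragon Bridge?
# Bot: Try the stalls along Tran Phu - locals line up there every morning!
# User: Are they open late?
# Bot: Most close around 9pm, so go early.
```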
app/database/postgresql.py
CHANGED
@@ -30,11 +30,11 @@ if not DATABASE_URL:
 try:
     engine = create_engine(
         DATABASE_URL,
+        pool_size=10,        # Limit max connections
+        max_overflow=5,      # Allow temporary overflow of connections
+        pool_timeout=30,     # Timeout waiting for connection from pool
+        pool_recycle=300,    # Recycle connections every 5 minutes
+        pool_pre_ping=True,  # Verify connection is still valid before using it
        connect_args={
            "connect_timeout": 5,  # Connection timeout in seconds
            "keepalives": 1,       # Enable TCP keepalives
@@ -89,18 +89,17 @@ def check_db_connection():
 
 # Dependency to get DB session with improved error handling
 def get_db():
+    """Get PostgreSQL database session"""
     db = SessionLocal()
     try:
         # Test connection
         db.execute(text("SELECT 1")).fetchone()
         yield db
-        db.rollback()
+    except Exception as e:
+        logger.error(f"DB connection error: {e}")
         raise
     finally:
+        db.close()  # Ensure connection is closed and returned to pool
 
 # Create tables in database if they don't exist
 def create_tables():
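The pooled engine pairs with `get_db` as a standard FastAPI dependency; a minimal sketch (the endpoint itself is hypothetical):

```python
from fastapi import APIRouter, Depends
from sqlalchemy import text
from sqlalchemy.orm import Session

from app.database.postgresql import get_db

router = APIRouter()

@router.get("/documents/count")  # illustrative endpoint
def count_documents(db: Session = Depends(get_db)):
    # Each request borrows a pooled connection; get_db's finally block
    # closes the session and returns the connection to the pool.
    return {"count": db.execute(text("SELECT COUNT(*) FROM document")).scalar()}
```

`pool_pre_ping=True` plus the five-minute `pool_recycle` keeps stale connections from surfacing as mid-request errors on long-lived deployments.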
app/models/pdf_models.py
CHANGED
@@ -7,14 +7,19 @@ class PDFUploadRequest(BaseModel):
     index_name: Optional[str] = Field("testbot768", description="Pinecone index name")
     title: Optional[str] = Field(None, description="Document title")
     description: Optional[str] = Field(None, description="Document description")
+    vector_database_id: Optional[int] = Field(None, description="ID of the PostgreSQL vector database to use")
 
 class PDFResponse(BaseModel):
-    document_id: Optional[str] = Field(None, description="Document ID")
+    """Response model for PDF-related endpoints."""
+    success: bool = Field(False, description="Processing result: true/false")
+    document_id: Optional[str] = Field(None, description="ID of the processed document")
+    document_database_id: Optional[int] = Field(None, description="ID of the document in PostgreSQL (if any)")
     chunks_processed: Optional[int] = Field(None, description="Number of chunks processed")
-    error: Optional[str] = Field(None, description="Error message if any")
+    total_text_length: Optional[int] = Field(None, description="Total length of the processed text")
+    error: Optional[str] = Field(None, description="Error message (if any)")
+    warning: Optional[str] = Field(None, description="Warning (if any)")
+    mock_mode: Optional[bool] = Field(None, description="Whether the request ran in mock mode")
+    message: Optional[str] = Field(None, description="Success message")
 
     class Config:
         schema_extra = {
@@ -22,7 +27,9 @@ class PDFResponse(BaseModel):
                 "success": True,
                 "document_id": "550e8400-e29b-41d4-a716-446655440000",
                 "chunks_processed": 25,
-                "total_text_length": 50000
+                "total_text_length": 50000,
+                "mock_mode": False,
+                "message": "Successfully processed document"
             }
         }
 
@@ -31,14 +38,18 @@ class DeleteDocumentRequest(BaseModel):
     document_id: str = Field(..., description="ID of the document to delete")
     namespace: Optional[str] = Field("Default", description="Namespace in Pinecone")
     index_name: Optional[str] = Field("testbot768", description="Pinecone index name")
+    vector_database_id: Optional[int] = Field(None, description="ID of the vector database in PostgreSQL")
 
 class DocumentsListResponse(BaseModel):
+    """Response model for the documents list"""
+    success: bool = Field(False, description="Processing result: true/false")
+    total_vectors: Optional[int] = Field(None, description="Total number of vectors in the namespace")
+    namespace: Optional[str] = Field(None, description="Namespace that was queried")
+    index_name: Optional[str] = Field(None, description="Index name that was queried")
+    documents: Optional[List[Dict[str, Any]]] = Field(None, description="List of documents")
+    postgresql_documents: Optional[List[Dict[str, Any]]] = Field(None, description="List of documents from PostgreSQL")
+    postgresql_document_count: Optional[int] = Field(None, description="Number of documents from PostgreSQL")
+    error: Optional[str] = Field(None, description="Error message (if any)")
 
     class Config:
         schema_extra = {
app/utils/cache.py
CHANGED
@@ -17,8 +17,6 @@ load_dotenv()
 DEFAULT_CACHE_TTL = int(os.getenv("CACHE_TTL_SECONDS", "300"))  # Default: 5 minutes
 DEFAULT_CACHE_CLEANUP_INTERVAL = int(os.getenv("CACHE_CLEANUP_INTERVAL", "60"))  # Default: 1 minute
 DEFAULT_CACHE_MAX_SIZE = int(os.getenv("CACHE_MAX_SIZE", "1000"))  # Default: 1000 items
-DEFAULT_HISTORY_QUEUE_SIZE = int(os.getenv("HISTORY_QUEUE_SIZE", "10"))  # Default queue size: 10
-DEFAULT_HISTORY_CACHE_TTL = int(os.getenv("HISTORY_CACHE_TTL", "3600"))  # Default: 1 hour
 
 # Generic type so the cache can hold many kinds of values
 T = TypeVar('T')
@@ -42,36 +40,6 @@ class CacheItem(Generic[T]):
         """Extend the item's time to live"""
         self.expire_at = time.time() + ttl
 
-
-# HistoryQueue class for storing user history
-class HistoryQueue:
-    def __init__(self, max_size: int = DEFAULT_HISTORY_QUEUE_SIZE, ttl: int = DEFAULT_HISTORY_CACHE_TTL):
-        self.items: List[Dict[str, Any]] = []
-        self.max_size = max_size
-        self.ttl = ttl
-        self.expire_at = time.time() + ttl
-
-    def add(self, item: Dict[str, Any]) -> None:
-        """Add an item to the queue; if it is full, drop the oldest item"""
-        if len(self.items) >= self.max_size:
-            self.items.pop(0)
-        self.items.append(item)
-        # Refresh the expiry time whenever a new item is added
-        self.refresh_expiry()
-
-    def get_all(self) -> List[Dict[str, Any]]:
-        """Get all items in the queue"""
-        return self.items
-
-    def is_expired(self) -> bool:
-        """Check whether the queue has expired"""
-        return time.time() > self.expire_at
-
-    def refresh_expiry(self) -> None:
-        """Refresh the expiry time"""
-        self.expire_at = time.time() + self.ttl
-
-
 # Main cache class
 class InMemoryCache:
     def __init__(
@@ -84,7 +52,6 @@ class InMemoryCache:
         self.ttl = ttl
         self.cleanup_interval = cleanup_interval
         self.max_size = max_size
-        self.user_history_queues: Dict[str, HistoryQueue] = {}
        self.lock = threading.RLock()  # Use RLock to avoid deadlocks
 
        # Start the periodic cache cleanup thread (active expiration)
@@ -170,13 +137,8 @@ class InMemoryCache:
         for key in expired_keys:
             del self.cache[key]
 
-        for user_id in expired_user_ids:
-            del self.user_history_queues[user_id]
-
-        if expired_keys or expired_user_ids:
-            logger.debug(f"Cleaned up {len(expired_keys)} expired cache items and {len(expired_user_ids)} expired history queues")
+        if expired_keys:
+            logger.debug(f"Cleaned up {len(expired_keys)} expired cache items")
 
     def _evict_lru_items(self, count: int = 1) -> None:
         """Evict the least recently used items when the cache is full"""
@@ -198,8 +160,7 @@ class InMemoryCache:
             "active_items": total_items - expired_items,
             "memory_usage_bytes": memory_usage,
             "memory_usage_mb": memory_usage / (1024 * 1024),
-            "max_size": self.max_size
-            "history_queues": len(self.user_history_queues)
+            "max_size": self.max_size
         }
 
     def _estimate_memory_usage(self) -> int:
@@ -219,46 +180,7 @@ class InMemoryCache:
             except:
                 cache_size += 100
 
-        # Estimate the size of the user history queues
-        for queue in self.user_history_queues.values():
-            try:
-                cache_size += len(json.dumps(queue.items)) + 100  # 100 bytes for metadata
-            except:
-                cache_size += 100
-
         return cache_size
-
-    # Specialized methods for managing user history
-    def add_user_history(self, user_id: str, item: Dict[str, Any], queue_size: Optional[int] = None, ttl: Optional[int] = None) -> None:
-        """Add an item to a user's history queue"""
-        with self.lock:
-            # Create the queue if it does not exist yet
-            if user_id not in self.user_history_queues:
-                queue_size_value = queue_size if queue_size is not None else DEFAULT_HISTORY_QUEUE_SIZE
-                ttl_value = ttl if ttl is not None else DEFAULT_HISTORY_CACHE_TTL
-                self.user_history_queues[user_id] = HistoryQueue(max_size=queue_size_value, ttl=ttl_value)
-
-            # Add the item to the queue
-            self.user_history_queues[user_id].add(item)
-            logger.debug(f"Added history item for user {user_id}")
-
-    def get_user_history(self, user_id: str, default: Any = None) -> List[Dict[str, Any]]:
-        """Get a user's history from the cache"""
-        with self.lock:
-            queue = self.user_history_queues.get(user_id)
-
-            # If the queue was not found or has expired
-            if queue is None or queue.is_expired():
-                if queue is not None and queue.is_expired():
-                    del self.user_history_queues[user_id]
-                    logger.debug(f"User history queue expired: {user_id}")
-                return default if default is not None else []
-
-            # Refresh the expiry time
-            queue.refresh_expiry()
-            logger.debug(f"Retrieved history for user {user_id}: {len(queue.items)} items")
-            return queue.get_all()
 
 # Singleton instance
 _cache_instance = None
app/utils/pdf_processor.py
CHANGED
@@ -1,211 +1,449 @@
|
|
1 |
import os
|
2 |
-
import
|
3 |
import uuid
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
from langchain_community.document_loaders import PyPDFLoader
|
|
|
6 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
7 |
-
import
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
# Cấu hình logging
|
12 |
logger = logging.getLogger(__name__)
|
13 |
|
14 |
-
# Khởi tạo embeddings model
|
15 |
-
embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
16 |
-
|
17 |
class PDFProcessor:
|
18 |
-
"""
|
19 |
|
20 |
-
def __init__(self, index_name="testbot768", namespace="Default"):
|
21 |
-
"""Khởi tạo với tên index và namespace Pinecone mặc định"""
|
22 |
self.index_name = index_name
|
23 |
self.namespace = namespace
|
|
|
|
|
24 |
self.pinecone_index = None
|
|
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
logger.
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
|
39 |
async def process_pdf(self, file_path, document_id=None, metadata=None, progress_callback=None):
|
40 |
-
"""
|
41 |
-
Xử lý file PDF, chia thành chunks và tạo embeddings
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
Returns:
|
50 |
-
dict: Thông tin kết quả xử lý gồm document_id và số chunks đã xử lý
|
51 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
try:
|
53 |
-
#
|
54 |
-
if
|
55 |
-
|
56 |
-
return {"success": False, "error": "Không thể kết nối đến Pinecone"}
|
57 |
|
58 |
-
#
|
59 |
-
if
|
60 |
document_id = str(uuid.uuid4())
|
61 |
|
62 |
-
#
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
if progress_callback:
|
65 |
-
await progress_callback("
|
66 |
|
67 |
loader = PyPDFLoader(file_path)
|
68 |
-
|
|
|
69 |
|
70 |
-
|
71 |
-
all_text = ""
|
72 |
-
for page in pages:
|
73 |
-
all_text += page.page_content + "\n"
|
74 |
|
|
|
75 |
if progress_callback:
|
76 |
-
await progress_callback("
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
-
logger.info(f"
|
|
|
|
|
83 |
if progress_callback:
|
84 |
-
await progress_callback("
|
85 |
-
|
86 |
-
#
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
#
|
98 |
-
|
99 |
-
"document_id": document_id,
|
100 |
-
"chunk_index": i,
|
101 |
-
"text": chunk
|
102 |
-
}
|
103 |
|
104 |
-
#
|
105 |
-
|
106 |
-
for key, value in metadata.items():
|
107 |
-
if key not in vector_metadata:
|
108 |
-
vector_metadata[key] = value
|
109 |
|
110 |
-
#
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
121 |
|
122 |
-
#
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
|
126 |
-
logger.info(f"
|
127 |
|
128 |
-
# Final progress update
|
129 |
if progress_callback:
|
130 |
-
await progress_callback("
|
131 |
|
|
|
132 |
return {
|
133 |
"success": True,
|
134 |
"document_id": document_id,
|
135 |
"chunks_processed": len(chunks),
|
136 |
-
"total_text_length":
|
|
|
|
|
|
|
137 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
|
|
|
|
|
|
|
|
139 |
except Exception as e:
|
140 |
-
logger.error(f"
|
141 |
-
if progress_callback:
|
142 |
-
await progress_callback("error", 0, f"Error processing PDF: {str(e)}")
|
143 |
return {
|
144 |
"success": False,
|
145 |
-
"error": str(e)
|
146 |
}
|
147 |
|
148 |
-
async def
|
149 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
try:
|
151 |
-
if not
|
152 |
-
|
153 |
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
)
|
|
|
158 |
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
except Exception as e:
|
162 |
-
logger.error(f"
|
163 |
-
|
|
|
|
|
|
|
|
|
164 |
|
165 |
-
async def
|
166 |
-
"""
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
try:
|
174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
result = self.pinecone_index.delete(
|
176 |
-
|
177 |
-
namespace=
|
178 |
)
|
179 |
-
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
except Exception as e:
|
182 |
-
logger.error(f"
|
183 |
-
return {
|
|
|
|
|
|
|
|
|
184 |
|
185 |
async def list_documents(self):
|
186 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
try:
|
188 |
-
# Khởi tạo kết nối Pinecone nếu chưa có
|
189 |
if not self.pinecone_index:
|
190 |
-
|
191 |
-
|
|
|
|
|
192 |
|
193 |
-
#
|
194 |
stats = self.pinecone_index.describe_index_stats()
|
|
|
|
|
195 |
|
196 |
-
#
|
197 |
-
#
|
198 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
return {
|
201 |
"success": True,
|
202 |
-
"
|
203 |
-
"
|
204 |
-
"
|
|
|
205 |
}
|
206 |
except Exception as e:
|
207 |
-
logger.error(f"
|
208 |
return {
|
209 |
"success": False,
|
210 |
-
"error": str(e)
|
211 |
-
}
|
|
|
|
1 |
import os
|
2 |
+
import logging
|
3 |
import uuid
|
4 |
+
import pinecone
|
5 |
+
from app.utils.pinecone_fix import PineconeConnectionManager, check_connection
|
6 |
+
import time
|
7 |
+
import os
|
8 |
+
from typing import List, Dict, Any, Optional
|
9 |
+
|
10 |
+
# Langchain imports for document processing
|
11 |
from langchain_community.document_loaders import PyPDFLoader
|
12 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
14 |
+
import google.generativeai as genai
|
15 |
|
16 |
+
# Configure logger
|
|
|
|
|
17 |
logger = logging.getLogger(__name__)
|
18 |
|
|
|
|
|
|
|
19 |
class PDFProcessor:
|
20 |
+
"""Process PDF files and create embeddings in Pinecone"""
|
21 |
|
22 |
+
def __init__(self, index_name="testbot768", namespace="Default", api_key=None, vector_db_id=None, mock_mode=False, correlation_id=None):
|
|
|
23 |
self.index_name = index_name
|
24 |
self.namespace = namespace
|
25 |
+
self.api_key = api_key
|
26 |
+
self.vector_db_id = vector_db_id
|
27 |
self.pinecone_index = None
|
28 |
+
self.mock_mode = mock_mode
|
29 |
+
self.correlation_id = correlation_id or str(uuid.uuid4())[:8]
|
30 |
+
self.google_api_key = os.environ.get("GOOGLE_API_KEY")
|
31 |
|
32 |
+
# Initialize Pinecone connection if not in mock mode
|
33 |
+
if not self.mock_mode and self.api_key:
|
34 |
+
try:
|
35 |
+
# Use connection manager from pinecone_fix
|
36 |
+
logger.info(f"[{self.correlation_id}] Initializing Pinecone connection to {self.index_name}")
|
37 |
+
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
|
38 |
+
logger.info(f"[{self.correlation_id}] Successfully connected to Pinecone index {self.index_name}")
|
39 |
+
except Exception as e:
|
40 |
+
logger.error(f"[{self.correlation_id}] Failed to initialize Pinecone: {str(e)}")
|
41 |
+
# Fall back to mock mode if connection fails
|
42 |
+
self.mock_mode = True
|
43 |
+
logger.warning(f"[{self.correlation_id}] Falling back to mock mode due to connection error")
|
44 |
|
45 |
async def process_pdf(self, file_path, document_id=None, metadata=None, progress_callback=None):
|
46 |
+
"""Process a PDF file and create vector embeddings
|
|
|
47 |
|
48 |
+
This method:
|
49 |
+
1. Extracts text from PDF using PyPDFLoader
|
50 |
+
2. Splits text into chunks using RecursiveCharacterTextSplitter
|
51 |
+
3. Creates embeddings using Google Gemini model
|
52 |
+
4. Stores embeddings in Pinecone
|
|
|
|
|
|
|
53 |
"""
|
54 |
+
logger.info(f"[{self.correlation_id}] Processing PDF: {file_path}")
|
55 |
+
|
56 |
+
if self.mock_mode:
|
57 |
+
logger.info(f"[{self.correlation_id}] MOCK: Processing PDF {file_path}")
|
58 |
+
# Mock implementation - just return success
|
59 |
+
if progress_callback:
|
60 |
+
await progress_callback(None, document_id, "embedding_complete", 1.0, "Mock processing completed")
|
61 |
+
return {"success": True, "message": "PDF processed successfully"}
|
62 |
+
|
63 |
try:
|
64 |
+
# Initialize metadata if not provided
|
65 |
+
if metadata is None:
|
66 |
+
metadata = {}
|
|
|
67 |
|
68 |
+
# Ensure document_id is included
|
69 |
+
if document_id is None:
|
70 |
document_id = str(uuid.uuid4())
|
71 |
|
72 |
+
# Add document_id to metadata
|
73 |
+
metadata["document_id"] = document_id
|
74 |
+
|
75 |
+
# The namespace to use might be in vdb-X format if vector_db_id provided
|
76 |
+
actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace
|
77 |
+
|
78 |
+
# 1. Extract text from PDF
|
79 |
+
logger.info(f"[{self.correlation_id}] Extracting text from PDF: {file_path}")
|
80 |
if progress_callback:
|
81 |
+
await progress_callback(None, document_id, "text_extraction", 0.2, "Extracting text from PDF")
|
82 |
|
83 |
loader = PyPDFLoader(file_path)
|
84 |
+
documents = loader.load()
|
85 |
+
total_text_length = sum(len(doc.page_content) for doc in documents)
|
86 |
|
87 |
+
logger.info(f"[{self.correlation_id}] Extracted {len(documents)} pages, total text length: {total_text_length}")
|
|
|
|
|
|
|
88 |
|
89 |
+
# 2. Split text into chunks
|
90 |
if progress_callback:
|
91 |
+
await progress_callback(None, document_id, "chunking", 0.4, "Splitting text into chunks")
|
92 |
+
|
93 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
94 |
+
chunk_size=1000,
|
95 |
+
chunk_overlap=100,
|
96 |
+
length_function=len,
|
97 |
+
separators=["\n\n", "\n", " ", ""]
|
98 |
+
)
|
99 |
+
|
100 |
+
chunks = text_splitter.split_documents(documents)
|
101 |
|
102 |
+
logger.info(f"[{self.correlation_id}] Split into {len(chunks)} chunks")
|
103 |
+
|
104 |
+
# 3. Create embeddings
|
105 |
if progress_callback:
|
106 |
+
await progress_callback(None, document_id, "embedding", 0.6, "Creating embeddings")
|
107 |
+
|
108 |
+
# Initialize Google Gemini for embeddings
|
109 |
+
if not self.google_api_key:
|
110 |
+
raise ValueError("Google API key not found in environment variables")
|
111 |
+
|
112 |
+
genai.configure(api_key=self.google_api_key)
|
113 |
+
|
114 |
+
# First, get the expected dimensions from Pinecone
|
115 |
+
logger.info(f"[{self.correlation_id}] Checking Pinecone index dimensions")
|
116 |
+
if not self.pinecone_index:
|
117 |
+
self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
|
118 |
+
|
119 |
+
stats = self.pinecone_index.describe_index_stats()
|
120 |
+
pinecone_dimension = stats.dimension
|
121 |
+
logger.info(f"[{self.correlation_id}] Pinecone index dimension: {pinecone_dimension}")
|
122 |
+
|
123 |
+
# Create embedding model
|
124 |
+
embedding_model = GoogleGenerativeAIEmbeddings(
|
125 |
+
model="models/embedding-001",
|
126 |
+
google_api_key=self.google_api_key,
|
127 |
+
task_type="retrieval_document" # Use document embedding mode for longer text
|
128 |
+
)
|
129 |
+
|
130 |
+
# Get a sample embedding to check dimensions
|
131 |
+
sample_embedding = embedding_model.embed_query("test")
|
132 |
+
embedding_dimension = len(sample_embedding)
|
133 |
+
|
134 |
+
logger.info(f"[{self.correlation_id}] Generated embeddings with dimension: {embedding_dimension}")
|
135 |
+
|
136 |
+
# Dimension handling - if mismatch, we handle it appropriately
|
137 |
+
if embedding_dimension != pinecone_dimension:
|
138 |
+
logger.warning(f"[{self.correlation_id}] Embedding dimension mismatch: got {embedding_dimension}, need {pinecone_dimension}")
|
139 |
|
140 |
+
if embedding_dimension < pinecone_dimension:
|
141 |
+
# For upscaling from 768 to 1536: duplicate each value and scale appropriately
|
142 |
+
# This is one approach to handle dimension mismatches while preserving semantic information
|
143 |
+
logger.info(f"[{self.correlation_id}] Using duplication strategy to upscale from {embedding_dimension} to {pinecone_dimension}")
|
144 |
+
|
145 |
+
if embedding_dimension * 2 == pinecone_dimension:
|
146 |
+
# Perfect doubling (768 -> 1536)
|
147 |
+
def adjust_embedding(embedding):
|
148 |
+
# Duplicate each value to double the dimension
|
149 |
+
return [val for val in embedding for _ in range(2)]
|
150 |
+
else:
|
151 |
+
# Generic padding with zeros
|
152 |
+
pad_size = pinecone_dimension - embedding_dimension
|
153 |
+
def adjust_embedding(embedding):
|
154 |
+
return embedding + [0.0] * pad_size
|
155 |
+
else:
|
156 |
+
# Truncation strategy - take first pinecone_dimension values
|
157 |
+
logger.info(f"[{self.correlation_id}] Will truncate embeddings from {embedding_dimension} to {pinecone_dimension}")
|
158 |
+
|
159 |
+
def adjust_embedding(embedding):
|
160 |
+
return embedding[:pinecone_dimension]
|
161 |
+
else:
|
162 |
+
# No adjustment needed
|
163 |
+
def adjust_embedding(embedding):
|
164 |
+
return embedding
|
165 |
+
|
166 |
+
# Process in batches to avoid memory issues
|
167 |
+
batch_size = 10
|
168 |
+
vectors_to_upsert = []
|
169 |
+
|
170 |
+
for i in range(0, len(chunks), batch_size):
|
171 |
+
batch = chunks[i:i+batch_size]
|
172 |
|
173 |
+
# Extract text content
|
174 |
+
texts = [chunk.page_content for chunk in batch]
|
|
|
|
|
|
|
|
|
175 |
|
176 |
+
# Create embeddings for batch
|
177 |
+
embeddings = embedding_model.embed_documents(texts)
|
|
|
|
|
|
|
178 |
|
179 |
+
# Prepare vectors for Pinecone
|
180 |
+
for j, (chunk, embedding) in enumerate(zip(batch, embeddings)):
|
181 |
+
# Adjust embedding dimensions if needed
|
182 |
+
adjusted_embedding = adjust_embedding(embedding)
|
183 |
+
|
184 |
+
# Verify dimensions are correct
|
185 |
+
if len(adjusted_embedding) != pinecone_dimension:
|
186 |
+
raise ValueError(f"Dimension mismatch after adjustment: got {len(adjusted_embedding)}, expected {pinecone_dimension}")
|
187 |
+
|
188 |
+
# Create metadata for this chunk
|
189 |
+
chunk_metadata = {
|
190 |
+
"document_id": document_id,
|
191 |
+
"page": chunk.metadata.get("page", 0),
|
192 |
+
"chunk_id": f"{document_id}-chunk-{i+j}",
|
193 |
+
"text": chunk.page_content[:1000], # Store first 1000 chars of text
|
194 |
+
**metadata # Include original metadata
|
195 |
+
}
|
196 |
+
|
197 |
+
# Create vector record
|
198 |
+
vector = {
|
199 |
+
"id": f"{document_id}-{i+j}",
|
200 |
+
"values": adjusted_embedding,
|
201 |
+
"metadata": chunk_metadata
|
202 |
+
}
|
203 |
+
|
204 |
+
vectors_to_upsert.append(vector)
|
205 |
|
206 |
+
logger.info(f"[{self.correlation_id}] Processed batch {i//batch_size + 1}/{(len(chunks)-1)//batch_size + 1}")
|
207 |
+
|
208 |
+
# 4. Store embeddings in Pinecone
|
209 |
+
if progress_callback:
|
210 |
+
await progress_callback(None, document_id, "storing", 0.8, f"Storing {len(vectors_to_upsert)} vectors in Pinecone")
|
211 |
+
|
212 |
+
logger.info(f"[{self.correlation_id}] Upserting {len(vectors_to_upsert)} vectors to Pinecone index {self.index_name}, namespace {actual_namespace}")
|
213 |
|
214 |
+
# Use PineconeConnectionManager for better error handling
|
215 |
+
result = PineconeConnectionManager.upsert_vectors_with_validation(
|
216 |
+
self.pinecone_index,
|
217 |
+
vectors_to_upsert,
|
218 |
+
namespace=actual_namespace
|
219 |
+
)
|
220 |
|
221 |
+
logger.info(f"[{self.correlation_id}] Successfully upserted {result.get('upserted_count', 0)} vectors to Pinecone")
|
222 |
|
|
|
223 |
if progress_callback:
|
224 |
+
await progress_callback(None, document_id, "embedding_complete", 1.0, "Processing completed")
|
225 |
|
226 |
+
# Return success with stats
|
227 |
return {
|
228 |
"success": True,
|
229 |
"document_id": document_id,
|
230 |
"chunks_processed": len(chunks),
|
231 |
+
"total_text_length": total_text_length,
|
232 |
+
"vectors_created": len(vectors_to_upsert),
|
233 |
+
"vectors_upserted": result.get('upserted_count', 0),
|
234 |
+
"message": "PDF processed successfully"
|
235 |
}
|
236 |
+
except Exception as e:
|
237 |
+
logger.error(f"[{self.correlation_id}] Error processing PDF: {str(e)}")
|
238 |
+
return {
|
239 |
+
"success": False,
|
240 |
+
"error": f"Error processing PDF: {str(e)}"
|
241 |
+
}
|
242 |
+
|
```python
    async def list_namespaces(self):
        """List all namespaces in the Pinecone index"""
        if self.mock_mode:
            logger.info(f"[{self.correlation_id}] MOCK: Listing namespaces")
            return {"success": True, "namespaces": ["test"]}

        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            # Get index stats, which include the namespaces
            stats = self.pinecone_index.describe_index_stats()
            namespaces = list(stats.get("namespaces", {}).keys())

            return {
                "success": True,
                "namespaces": namespaces
            }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error listing namespaces: {str(e)}")
            return {
                "success": False,
                "error": f"Error listing namespaces: {str(e)}"
            }

    async def delete_namespace(self):
        """Delete all vectors in a namespace"""
        if self.mock_mode:
            logger.info(f"[{self.correlation_id}] MOCK: Deleting namespace '{self.namespace}'")
            return {
                "success": True,
                "namespace": self.namespace,
                "deleted_count": 100,
                "message": f"Successfully deleted namespace '{self.namespace}'"
            }

        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            logger.info(f"[{self.correlation_id}] Deleting namespace '{self.namespace}' from index '{self.index_name}'")

            # Check if the namespace exists
            stats = self.pinecone_index.describe_index_stats()
            namespaces = stats.get("namespaces", {})

            if self.namespace in namespaces:
                vector_count = namespaces[self.namespace].get("vector_count", 0)
                # Delete all vectors in the namespace
                self.pinecone_index.delete(delete_all=True, namespace=self.namespace)
                return {
                    "success": True,
                    "namespace": self.namespace,
                    "deleted_count": vector_count,
                    "message": f"Successfully deleted namespace '{self.namespace}' with {vector_count} vectors"
                }
            else:
                return {
                    "success": True,
                    "namespace": self.namespace,
                    "deleted_count": 0,
                    "message": f"Namespace '{self.namespace}' does not exist - nothing to delete"
                }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error deleting namespace: {str(e)}")
            return {
                "success": False,
                "namespace": self.namespace,
                "error": f"Error deleting namespace: {str(e)}"
            }
```
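Both methods read the namespace map returned by `describe_index_stats()`. With recent Pinecone clients the response object supports dict-style access and looks roughly like this — a sketch with made-up counts:

```python
# Illustrative shape of index.describe_index_stats(); counts are made up.
stats = {
    "dimension": 1536,
    "total_vector_count": 1250,
    "namespaces": {
        "Default": {"vector_count": 800},
        "vdb-9": {"vector_count": 450},
    },
}
assert list(stats["namespaces"]) == ["Default", "vdb-9"]
```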
```python
    async def delete_document(self, document_id):
        """Delete vectors associated with a specific document ID"""
        logger.info(f"[{self.correlation_id}] Deleting vectors for document '{document_id}' from namespace '{self.namespace}'")

        if self.mock_mode:
            logger.info(f"[{self.correlation_id}] MOCK: Deleting document vectors for '{document_id}'")
            # In mock mode, simulate deleting 10 vectors
            return {
                "success": True,
                "document_id": document_id,
                "namespace": self.namespace,
                "deleted_count": 10,
                "message": f"Successfully deleted vectors for document '{document_id}' from namespace '{self.namespace}'"
            }

        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            # Use metadata filtering to find vectors with a matching document_id.
            # The namespace may be in vdb-X format if a vector_db_id was provided.
            actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace

            # Search for vectors with this document ID
            results = self.pinecone_index.query(
                vector=[0] * 1536,  # Dummy vector; we only care about the metadata filter
                top_k=1,
                include_metadata=True,
                filter={"document_id": document_id},
                namespace=actual_namespace
            )

            # If no vectors were found, return success with a warning
            if len(results.get("matches", [])) == 0:
                logger.warning(f"[{self.correlation_id}] No vectors found for document '{document_id}' in namespace '{actual_namespace}'")
                return {
                    "success": True,
                    "document_id": document_id,
                    "namespace": actual_namespace,
                    "deleted_count": 0,
                    "warning": f"No vectors found for document '{document_id}' in namespace '{actual_namespace}'",
                    "message": f"Successfully deleted 0 vectors for document '{document_id}' from namespace '{actual_namespace}'"
                }

            # Delete vectors by metadata filter
            result = self.pinecone_index.delete(
                filter={"document_id": document_id},
                namespace=actual_namespace
            )

            # Get the delete count from the result
            deleted_count = result.get("deleted_count", 0)

            return {
                "success": True,
                "document_id": document_id,
                "namespace": actual_namespace,
                "deleted_count": deleted_count,
                "message": f"Successfully deleted {deleted_count} vectors for document '{document_id}' from namespace '{actual_namespace}'"
            }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error deleting document vectors: {str(e)}")
            return {
                "success": False,
                "document_id": document_id,
                "error": f"Error deleting document vectors: {str(e)}"
            }
```
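One caveat: at the time of writing, Pinecone supports delete-by-metadata-filter only on pod-based indexes; serverless indexes reject it. Since the upsert path above builds vector IDs as `{document_id}-{n}`, a serverless-safe fallback is to list IDs by prefix and delete them explicitly — a sketch, assuming a pinecone client v3+ (which exposes `Index.list()` as a generator of ID batches):

```python
def delete_document_by_id_prefix(index, document_id: str, namespace: str) -> int:
    """Serverless-safe fallback: delete a document's chunks by vector-ID prefix.

    Sketch only — assumes IDs follow the '{document_id}-{n}' convention
    used in process_pdf above.
    """
    deleted = 0
    # index.list() yields batches of vector IDs matching the prefix
    for id_batch in index.list(prefix=f"{document_id}-", namespace=namespace):
        index.delete(ids=id_batch, namespace=namespace)
        deleted += len(id_batch)
    return deleted
```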
```python
    async def list_documents(self):
        """List all documents in the Pinecone index"""
        if self.mock_mode:
            logger.info(f"[{self.correlation_id}] MOCK: Listing documents in namespace '{self.namespace}'")
            return {
                "success": True,
                "namespace": self.namespace,
                "documents": [
                    {"id": "doc1", "title": "Sample Document 1"},
                    {"id": "doc2", "title": "Sample Document 2"}
                ]
            }

        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            # The namespace may be in vdb-X format if a vector_db_id was provided
            actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace

            # Get index stats
            stats = self.pinecone_index.describe_index_stats()
            namespaces = stats.get("namespaces", {})
            total_vectors = namespaces.get(actual_namespace, {}).get("vector_count", 0)

            # Collect unique document IDs via a metadata-only query: a dummy
            # dense vector with include_metadata=True avoids retrieving the
            # stored vector values themselves.
            results = self.pinecone_index.query(
                vector=[0] * 1536,  # Dummy vector for a metadata-only query
                top_k=100,  # Limit to 100 results
                include_metadata=True,
                namespace=actual_namespace
            )

            # Extract unique document IDs from the metadata
            document_map = {}
            matches = results.get("matches", [])

            for match in matches:
                metadata = match.get("metadata", {})
                doc_id = metadata.get("document_id")

                if doc_id and doc_id not in document_map:
                    document_map[doc_id] = {
                        "id": doc_id,
                        "title": metadata.get("title", "Unknown"),
                        "chunks": 1
                    }
                elif doc_id:
                    document_map[doc_id]["chunks"] += 1

            documents = list(document_map.values())

            return {
                "success": True,
                "namespace": actual_namespace,
                "index_name": self.index_name,
                "total_vectors": total_vectors,
                "documents": documents
            }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error listing documents: {str(e)}")
            return {
                "success": False,
                "error": f"Error listing documents: {str(e)}"
            }
```
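Because this query samples at most `top_k=100` matches, the per-document `chunks` counts become approximate once a namespace grows past 100 vectors. Under the same ID-prefix assumption as the deletion sketch above, an exact count can be paged out instead:

```python
def count_document_chunks(index, document_id: str, namespace: str) -> int:
    """Exact chunk count for one document via ID listing (sketch only;
    assumes pinecone client v3+ and the '{document_id}-{n}' ID convention)."""
    return sum(len(batch)
               for batch in index.list(prefix=f"{document_id}-", namespace=namespace))
```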
app/utils/pinecone_fix.py
ADDED
@@ -0,0 +1,194 @@
```python
"""
Improved Pinecone connection handling with dimension validation.
This module provides more robust connection and error handling for Pinecone operations.
"""
import logging
import time
from typing import Optional, Dict, Any, Tuple, List

from pinecone import Pinecone

logger = logging.getLogger(__name__)

# Default retry settings
DEFAULT_MAX_RETRIES = 3
DEFAULT_RETRY_DELAY = 2


class PineconeConnectionManager:
    """
    Manages Pinecone connections with enhanced error handling and dimension validation.

    This class centralizes Pinecone connection logic, providing:
    - Connection pooling/reuse
    - Automatic retries with exponential backoff
    - Dimension validation before operations
    - Detailed error logging for better debugging
    """

    # Class-level cache of Pinecone clients
    _clients = {}

    @classmethod
    def get_client(cls, api_key: str) -> Pinecone:
        """
        Return a Pinecone client for the given API key, creating one if needed.

        Args:
            api_key: Pinecone API key

        Returns:
            Initialized Pinecone client
        """
        if not api_key:
            raise ValueError("Pinecone API key cannot be empty")

        # Return the cached client if it exists
        if api_key in cls._clients:
            return cls._clients[api_key]

        # Log client creation (but hide the full API key)
        key_prefix = api_key[:4] if len(api_key) > 4 else "invalid"
        logger.info(f"Creating new Pinecone client with API key (first 4 chars: {key_prefix}...)")

        try:
            # Initialize the Pinecone client
            client = Pinecone(api_key=api_key)
            cls._clients[api_key] = client
            logger.info("Pinecone client created successfully")
            return client
        except Exception as e:
            logger.error(f"Failed to create Pinecone client: {str(e)}")
            raise RuntimeError(f"Pinecone client initialization failed: {str(e)}") from e

    @classmethod
    def get_index(cls,
                  api_key: str,
                  index_name: str,
                  max_retries: int = DEFAULT_MAX_RETRIES) -> Any:
        """
        Get a Pinecone index with retry logic.

        Args:
            api_key: Pinecone API key
            index_name: Name of the index to connect to
            max_retries: Maximum number of retry attempts

        Returns:
            Pinecone index
        """
        client = cls.get_client(api_key)

        # Retry logic for connection issues
        for attempt in range(max_retries):
            try:
                index = client.Index(index_name)
                # Test the connection
                _ = index.describe_index_stats()
                logger.info(f"Connected to Pinecone index: {index_name}")
                return index
            except Exception as e:
                if attempt < max_retries - 1:
                    wait_time = DEFAULT_RETRY_DELAY * (2 ** attempt)  # Exponential backoff
                    logger.warning(f"Pinecone connection attempt {attempt+1} failed: {e}. Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    logger.error(f"Failed to connect to Pinecone index after {max_retries} attempts: {e}")
                    raise RuntimeError(f"Pinecone index connection failed: {str(e)}") from e

    @classmethod
    def validate_dimensions(cls,
                            index: Any,
                            vector_dimensions: int) -> Tuple[bool, Optional[str]]:
        """
        Validate that the vector dimensions match the Pinecone index configuration.

        Args:
            index: Pinecone index
            vector_dimensions: Dimensions of the vectors to be uploaded

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Get index stats
            stats = index.describe_index_stats()
            index_dimensions = stats.dimension

            if index_dimensions != vector_dimensions:
                error_msg = (f"Vector dimensions mismatch: Your vectors have {vector_dimensions} dimensions, "
                             f"but the Pinecone index expects {index_dimensions} dimensions")
                logger.error(error_msg)
                return False, error_msg

            return True, None
        except Exception as e:
            error_msg = f"Failed to validate dimensions: {str(e)}"
            logger.error(error_msg)
            return False, error_msg

    @classmethod
    def upsert_vectors_with_validation(cls,
                                       index: Any,
                                       vectors: List[Dict[str, Any]],
                                       namespace: str = "",
                                       batch_size: int = 100) -> Dict[str, Any]:
        """
        Upsert vectors with dimension validation and batching.

        Args:
            index: Pinecone index
            vectors: List of vectors to upsert, each with 'id', 'values', and optional 'metadata'
            namespace: Namespace to upsert to
            batch_size: Size of batches for upserting

        Returns:
            Result of the upsert operation
        """
        if not vectors:
            return {"upserted_count": 0, "success": True}

        # Validate dimensions with the first vector
        if "values" in vectors[0] and len(vectors[0]["values"]) > 0:
            vector_dim = len(vectors[0]["values"])
            is_valid, error_msg = cls.validate_dimensions(index, vector_dim)

            if not is_valid:
                logger.error(f"Dimension validation failed: {error_msg}")
                raise ValueError(f"Vector dimensions do not match the Pinecone index configuration: {error_msg}")

        # Batch upsert
        total_upserted = 0
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i+batch_size]
            try:
                result = index.upsert(vectors=batch, namespace=namespace)
                batch_upserted = result.get("upserted_count", len(batch))
                total_upserted += batch_upserted
                logger.info(f"Upserted batch {i//batch_size + 1}: {batch_upserted} vectors")
            except Exception as e:
                logger.error(f"Failed to upsert batch {i//batch_size + 1}: {str(e)}")
                raise RuntimeError(f"Vector upsert failed: {str(e)}") from e

        return {"upserted_count": total_upserted, "success": True}
```
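Putting the manager together — a minimal usage sketch, assuming a valid `PINECONE_API_KEY` in the environment, an existing 1536-dimension index (the index name here is illustrative), and the vector format used above:

```python
import os

from app.utils.pinecone_fix import PineconeConnectionManager

# "pixagent-index" is an illustrative name; substitute your own index.
index = PineconeConnectionManager.get_index(
    api_key=os.environ["PINECONE_API_KEY"],
    index_name="pixagent-index",
)

vectors = [
    {"id": "doc-42-0", "values": [0.1] * 1536, "metadata": {"document_id": "doc-42"}},
    {"id": "doc-42-1", "values": [0.2] * 1536, "metadata": {"document_id": "doc-42"}},
]

result = PineconeConnectionManager.upsert_vectors_with_validation(
    index, vectors, namespace="vdb-9"
)
print(result)  # e.g. {"upserted_count": 2, "success": True}
```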
```python
# Simplified function to check the connection
def check_connection(api_key: str, index_name: str) -> bool:
    """
    Test the Pinecone connection and validate that the index exists.

    Args:
        api_key: Pinecone API key
        index_name: Name of the index to test

    Returns:
        True if the connection succeeded, False otherwise
    """
    try:
        index = PineconeConnectionManager.get_index(api_key, index_name)
        stats = index.describe_index_stats()
        total_vectors = stats.total_vector_count
        logger.info(f"Pinecone connection is working. Total vectors: {total_vectors}")
        return True
    except Exception as e:
        logger.error(f"Pinecone connection failed: {str(e)}")
        return False
```
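A quick startup smoke test might look like this (index name again illustrative):

```python
import os

from app.utils.pinecone_fix import check_connection

# Fail fast if Pinecone is unreachable or the index does not exist.
if not check_connection(os.environ["PINECONE_API_KEY"], "pixagent-index"):
    raise SystemExit("Pinecone health check failed")
```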