Spaces:
Sleeping
Sleeping
yonnel
commited on
Commit
·
14e32e0
1
Parent(s):
0236fb6
Add admin router and vector update functionality; enhance environment configuration
Browse files- .env.example +14 -1
- app/main.py +23 -1
- app/routers/__init__.py +3 -0
- app/routers/admin.py +319 -0
- app/services/__init__.py +3 -0
- app/services/embedding_service.py +139 -0
- app/services/tmdb_service.py +88 -0
- app/services/vector_storage.py +153 -0
- app/services/vector_updater.py +170 -0
- requirements.txt +7 -9
.env.example
CHANGED
@@ -14,4 +14,17 @@ ENV=dev
|
|
14 |
LOG_LEVEL=INFO
|
15 |
|
16 |
# Remove adult content from TMDB results
|
17 |
-
FILTER_ADULT_CONTENT=true # Set to true to filter out adult content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
LOG_LEVEL=INFO
|
15 |
|
16 |
# Remove adult content from TMDB results
|
17 |
+
FILTER_ADULT_CONTENT=true # Set to true to filter out adult content
|
18 |
+
|
19 |
+
# Hugging Face configuration
|
20 |
+
HF_TOKEN=your_hf_token_here
|
21 |
+
HF_DATASET_REPO=your-username/karl-movie-vectors
|
22 |
+
|
23 |
+
# Vector update configuration
|
24 |
+
AUTO_UPDATE_VECTORS=true
|
25 |
+
UPDATE_INTERVAL_HOURS=24
|
26 |
+
BATCH_SIZE=100
|
27 |
+
MAX_MOVIES_LIMIT=10000
|
28 |
+
|
29 |
+
# Admin configuration
|
30 |
+
ADMIN_TOKEN=your_admin_token_here
|
app/main.py
CHANGED
@@ -187,6 +187,15 @@ def compute_barycenter(liked_indices: List[int], coords: np.ndarray):
|
|
187 |
# FastAPI app setup
|
188 |
app = FastAPI(title="Karl-Movie Vector Backend", version="1.0.0")
|
189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
# CORS configuration
|
191 |
DEV_ORIGINS = [
|
192 |
"http://localhost:5173",
|
@@ -210,7 +219,20 @@ app.add_middleware(
|
|
210 |
@app.on_event("startup")
|
211 |
async def startup_event():
|
212 |
"""Load data on startup"""
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
|
215 |
@app.get("/health")
|
216 |
async def health_check():
|
|
|
187 |
# FastAPI app setup
|
188 |
app = FastAPI(title="Karl-Movie Vector Backend", version="1.0.0")
|
189 |
|
190 |
+
# Ajouter l'import du router admin
|
191 |
+
try:
|
192 |
+
from .routers import admin
|
193 |
+
except ImportError:
|
194 |
+
from app.routers import admin
|
195 |
+
|
196 |
+
# Ajouter le router admin
|
197 |
+
app.include_router(admin.router)
|
198 |
+
|
199 |
# CORS configuration
|
200 |
DEV_ORIGINS = [
|
201 |
"http://localhost:5173",
|
|
|
219 |
@app.on_event("startup")
|
220 |
async def startup_event():
|
221 |
"""Load data on startup"""
|
222 |
+
global vectors, id_map, faiss_index, movie_metadata
|
223 |
+
vectors, id_map, faiss_index, movie_metadata = load_data()
|
224 |
+
|
225 |
+
# Vérifier et mettre à jour les vecteurs si nécessaire au démarrage
|
226 |
+
if os.getenv('AUTO_UPDATE_VECTORS', 'false').lower() == 'true':
|
227 |
+
# Lancer en arrière-plan sans attendre
|
228 |
+
import asyncio
|
229 |
+
try:
|
230 |
+
from .services.vector_updater import VectorUpdater
|
231 |
+
except ImportError:
|
232 |
+
from app.services.vector_updater import VectorUpdater
|
233 |
+
|
234 |
+
vector_updater = VectorUpdater()
|
235 |
+
asyncio.create_task(vector_updater.update_vectors_if_needed())
|
236 |
|
237 |
@app.get("/health")
|
238 |
async def health_check():
|
app/routers/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Routers package for Karl Movie Vector Backend
|
3 |
+
"""
|
app/routers/admin.py
ADDED
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
|
2 |
+
from fastapi.responses import HTMLResponse
|
3 |
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
4 |
+
import os
|
5 |
+
import logging
|
6 |
+
from datetime import datetime
|
7 |
+
from typing import Optional
|
8 |
+
|
9 |
+
# Import avec gestion des erreurs pour les imports relatifs
|
10 |
+
try:
|
11 |
+
from ..services.vector_updater import VectorUpdater
|
12 |
+
except ImportError:
|
13 |
+
from app.services.vector_updater import VectorUpdater
|
14 |
+
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
router = APIRouter(prefix="/admin", tags=["admin"])
|
17 |
+
security = HTTPBearer()
|
18 |
+
|
19 |
+
# Instance globale du updater
|
20 |
+
vector_updater = VectorUpdater()
|
21 |
+
|
22 |
+
def verify_admin_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
|
23 |
+
"""Vérification du token admin"""
|
24 |
+
admin_token = os.getenv('ADMIN_TOKEN')
|
25 |
+
if not admin_token or credentials.credentials != admin_token:
|
26 |
+
raise HTTPException(status_code=403, detail="Invalid admin token")
|
27 |
+
return credentials.credentials
|
28 |
+
|
29 |
+
@router.get("/", response_class=HTMLResponse)
|
30 |
+
async def admin_dashboard():
|
31 |
+
"""Interface web d'administration"""
|
32 |
+
html_content = """
|
33 |
+
<!DOCTYPE html>
|
34 |
+
<html lang="fr">
|
35 |
+
<head>
|
36 |
+
<meta charset="UTF-8">
|
37 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
38 |
+
<title>Karl Movie Vector - Admin</title>
|
39 |
+
<style>
|
40 |
+
body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; }
|
41 |
+
.container { max-width: 1000px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
|
42 |
+
h1 { color: #333; text-align: center; margin-bottom: 30px; }
|
43 |
+
.card { background: #f8f9fa; padding: 20px; margin: 20px 0; border-radius: 5px; border-left: 4px solid #007bff; }
|
44 |
+
.status { padding: 10px; margin: 10px 0; border-radius: 4px; }
|
45 |
+
.status.success { background: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
|
46 |
+
.status.error { background: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; }
|
47 |
+
.status.warning { background: #fff3cd; color: #856404; border: 1px solid #ffeaa7; }
|
48 |
+
.status.info { background: #d1ecf1; color: #0c5460; border: 1px solid #bee5eb; }
|
49 |
+
button { background: #007bff; color: white; border: none; padding: 12px 24px; border-radius: 4px; cursor: pointer; margin: 5px; font-size: 14px; }
|
50 |
+
button:hover { background: #0056b3; }
|
51 |
+
button:disabled { background: #6c757d; cursor: not-allowed; }
|
52 |
+
.danger { background: #dc3545; }
|
53 |
+
.danger:hover { background: #c82333; }
|
54 |
+
.loading { display: none; color: #007bff; margin: 10px 0; }
|
55 |
+
.log { background: #f8f9fa; border: 1px solid #dee2e6; padding: 15px; margin: 15px 0; border-radius: 4px; font-family: monospace; font-size: 12px; max-height: 400px; overflow-y: auto; white-space: pre-line; }
|
56 |
+
.config-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; }
|
57 |
+
.config-item { background: white; padding: 10px; border-radius: 4px; border: 1px solid #dee2e6; }
|
58 |
+
</style>
|
59 |
+
</head>
|
60 |
+
<body>
|
61 |
+
<div class="container">
|
62 |
+
<h1>🎬 Karl Movie Vector - Administration</h1>
|
63 |
+
|
64 |
+
<div class="card">
|
65 |
+
<h3>📊 Statut du Système</h3>
|
66 |
+
<div id="status-info"></div>
|
67 |
+
<button onclick="loadStatus()">🔄 Actualiser le Statut</button>
|
68 |
+
</div>
|
69 |
+
|
70 |
+
<div class="card">
|
71 |
+
<h3>🚀 Mise à jour des Vecteurs</h3>
|
72 |
+
<p>Déclenchez manuellement la mise à jour des vecteurs de films depuis TMDB.</p>
|
73 |
+
<div>
|
74 |
+
<button onclick="updateVectors()" id="updateBtn">🔄 Mettre à jour (Conditionnel)</button>
|
75 |
+
<button onclick="forceUpdateVectors()" id="forceUpdateBtn" class="danger">⚡ Forcer la Mise à jour</button>
|
76 |
+
</div>
|
77 |
+
<div id="update-loading" class="loading">⏳ Mise à jour en cours...</div>
|
78 |
+
<div id="update-result"></div>
|
79 |
+
</div>
|
80 |
+
|
81 |
+
<div class="card">
|
82 |
+
<h3>📋 Logs de Mise à jour</h3>
|
83 |
+
<button onclick="loadLogs()">📄 Charger les Logs</button>
|
84 |
+
<button onclick="clearLogs()">🗑️ Effacer l'affichage</button>
|
85 |
+
<div id="logs" class="log">Cliquez sur "Charger les Logs" pour voir les logs...</div>
|
86 |
+
</div>
|
87 |
+
</div>
|
88 |
+
|
89 |
+
<script>
|
90 |
+
const API_BASE = '/admin';
|
91 |
+
let ADMIN_TOKEN = localStorage.getItem('admin_token');
|
92 |
+
|
93 |
+
if (!ADMIN_TOKEN) {
|
94 |
+
ADMIN_TOKEN = prompt('Token Admin:');
|
95 |
+
if (ADMIN_TOKEN) {
|
96 |
+
localStorage.setItem('admin_token', ADMIN_TOKEN);
|
97 |
+
}
|
98 |
+
}
|
99 |
+
|
100 |
+
const headers = {
|
101 |
+
'Authorization': `Bearer ${ADMIN_TOKEN}`,
|
102 |
+
'Content-Type': 'application/json'
|
103 |
+
};
|
104 |
+
|
105 |
+
async function apiCall(endpoint, method = 'GET') {
|
106 |
+
try {
|
107 |
+
const response = await fetch(`${API_BASE}${endpoint}`, {
|
108 |
+
method,
|
109 |
+
headers
|
110 |
+
});
|
111 |
+
|
112 |
+
if (response.status === 403) {
|
113 |
+
localStorage.removeItem('admin_token');
|
114 |
+
location.reload();
|
115 |
+
return;
|
116 |
+
}
|
117 |
+
|
118 |
+
if (!response.ok) {
|
119 |
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
120 |
+
}
|
121 |
+
|
122 |
+
return await response.json();
|
123 |
+
} catch (error) {
|
124 |
+
console.error('API Error:', error);
|
125 |
+
throw error;
|
126 |
+
}
|
127 |
+
}
|
128 |
+
|
129 |
+
async function loadStatus() {
|
130 |
+
try {
|
131 |
+
const status = await apiCall('/status');
|
132 |
+
const statusDiv = document.getElementById('status-info');
|
133 |
+
|
134 |
+
let statusClass = 'info';
|
135 |
+
if (status.is_updating) statusClass = 'warning';
|
136 |
+
else if (!status.hf_configured) statusClass = 'error';
|
137 |
+
else if (status.last_update_result && status.last_update_result.success) statusClass = 'success';
|
138 |
+
|
139 |
+
let lastUpdateInfo = 'Aucune';
|
140 |
+
if (status.last_update_result) {
|
141 |
+
const result = status.last_update_result;
|
142 |
+
if (result.success) {
|
143 |
+
lastUpdateInfo = `✅ ${result.count} films (${new Date(result.timestamp).toLocaleString()})`;
|
144 |
+
} else {
|
145 |
+
lastUpdateInfo = `❌ Erreur: ${result.error}`;
|
146 |
+
}
|
147 |
+
}
|
148 |
+
|
149 |
+
statusDiv.innerHTML = `
|
150 |
+
<div class="status ${statusClass}">
|
151 |
+
<div class="config-grid">
|
152 |
+
<div class="config-item">
|
153 |
+
<strong>Statut:</strong> ${status.is_updating ? '🔄 Mise à jour en cours...' : '✅ Prêt'}
|
154 |
+
</div>
|
155 |
+
<div class="config-item">
|
156 |
+
<strong>HF Configuré:</strong> ${status.hf_configured ? '✅ Oui' : '❌ Non'}
|
157 |
+
</div>
|
158 |
+
<div class="config-item">
|
159 |
+
<strong>Auto-update:</strong> ${status.auto_update_enabled ? '✅ Activé' : '❌ Désactivé'}
|
160 |
+
</div>
|
161 |
+
<div class="config-item">
|
162 |
+
<strong>Intervalle:</strong> ${status.update_interval_hours}h
|
163 |
+
</div>
|
164 |
+
<div class="config-item">
|
165 |
+
<strong>Taille batch:</strong> ${status.batch_size}
|
166 |
+
</div>
|
167 |
+
<div class="config-item">
|
168 |
+
<strong>Limite films:</strong> ${status.max_movies_limit}
|
169 |
+
</div>
|
170 |
+
<div class="config-item">
|
171 |
+
<strong>Logs:</strong> ${status.logs_count} entrées
|
172 |
+
</div>
|
173 |
+
<div class="config-item">
|
174 |
+
<strong>Dernière MAJ:</strong> ${lastUpdateInfo}
|
175 |
+
</div>
|
176 |
+
</div>
|
177 |
+
</div>
|
178 |
+
`;
|
179 |
+
} catch (error) {
|
180 |
+
document.getElementById('status-info').innerHTML = `
|
181 |
+
<div class="status error">❌ Erreur: ${error.message}</div>
|
182 |
+
`;
|
183 |
+
}
|
184 |
+
}
|
185 |
+
|
186 |
+
async function updateVectors() {
|
187 |
+
const btn = document.getElementById('updateBtn');
|
188 |
+
const loading = document.getElementById('update-loading');
|
189 |
+
const result = document.getElementById('update-result');
|
190 |
+
|
191 |
+
btn.disabled = true;
|
192 |
+
loading.style.display = 'block';
|
193 |
+
result.innerHTML = '';
|
194 |
+
|
195 |
+
try {
|
196 |
+
const response = await apiCall('/update-vectors', 'POST');
|
197 |
+
result.innerHTML = `
|
198 |
+
<div class="status ${response.success ? 'success' : 'warning'}">
|
199 |
+
${response.success ? '✅' : '⚠️'} ${response.message}
|
200 |
+
</div>
|
201 |
+
`;
|
202 |
+
|
203 |
+
// Actualiser le statut après quelques secondes
|
204 |
+
if (response.success) {
|
205 |
+
setTimeout(loadStatus, 2000);
|
206 |
+
}
|
207 |
+
} catch (error) {
|
208 |
+
result.innerHTML = `
|
209 |
+
<div class="status error">❌ Erreur: ${error.message}</div>
|
210 |
+
`;
|
211 |
+
} finally {
|
212 |
+
btn.disabled = false;
|
213 |
+
loading.style.display = 'none';
|
214 |
+
}
|
215 |
+
}
|
216 |
+
|
217 |
+
async function forceUpdateVectors() {
|
218 |
+
if (!confirm('Êtes-vous sûr de vouloir forcer la mise à jour ? Cela peut prendre plusieurs minutes et consommer des crédits API.')) {
|
219 |
+
return;
|
220 |
+
}
|
221 |
+
|
222 |
+
const btn = document.getElementById('forceUpdateBtn');
|
223 |
+
const loading = document.getElementById('update-loading');
|
224 |
+
const result = document.getElementById('update-result');
|
225 |
+
|
226 |
+
btn.disabled = true;
|
227 |
+
loading.style.display = 'block';
|
228 |
+
result.innerHTML = '';
|
229 |
+
|
230 |
+
try {
|
231 |
+
const response = await apiCall('/force-update-vectors', 'POST');
|
232 |
+
result.innerHTML = `
|
233 |
+
<div class="status ${response.success ? 'success' : 'error'}">
|
234 |
+
${response.success ? '✅' : '❌'} ${response.message}
|
235 |
+
</div>
|
236 |
+
`;
|
237 |
+
|
238 |
+
// Actualiser le statut après quelques secondes
|
239 |
+
if (response.success) {
|
240 |
+
setTimeout(loadStatus, 2000);
|
241 |
+
}
|
242 |
+
} catch (error) {
|
243 |
+
result.innerHTML = `
|
244 |
+
<div class="status error">❌ Erreur: ${error.message}</div>
|
245 |
+
`;
|
246 |
+
} finally {
|
247 |
+
btn.disabled = false;
|
248 |
+
loading.style.display = 'none';
|
249 |
+
}
|
250 |
+
}
|
251 |
+
|
252 |
+
async function loadLogs() {
|
253 |
+
try {
|
254 |
+
const response = await apiCall('/logs');
|
255 |
+
const logsDiv = document.getElementById('logs');
|
256 |
+
|
257 |
+
if (response.logs && response.logs.length > 0) {
|
258 |
+
logsDiv.innerHTML = response.logs.join('\\n');
|
259 |
+
logsDiv.scrollTop = logsDiv.scrollHeight;
|
260 |
+
} else {
|
261 |
+
logsDiv.innerHTML = 'Aucun log disponible';
|
262 |
+
}
|
263 |
+
} catch (error) {
|
264 |
+
document.getElementById('logs').innerHTML = `Erreur: ${error.message}`;
|
265 |
+
}
|
266 |
+
}
|
267 |
+
|
268 |
+
function clearLogs() {
|
269 |
+
document.getElementById('logs').innerHTML = 'Logs effacés (rechargez pour voir les nouveaux logs)';
|
270 |
+
}
|
271 |
+
|
272 |
+
// Charger le statut au démarrage
|
273 |
+
if (ADMIN_TOKEN) {
|
274 |
+
loadStatus();
|
275 |
+
|
276 |
+
// Auto-refresh du statut toutes les 30 secondes
|
277 |
+
setInterval(loadStatus, 30000);
|
278 |
+
}
|
279 |
+
</script>
|
280 |
+
</body>
|
281 |
+
</html>
|
282 |
+
"""
|
283 |
+
return HTMLResponse(content=html_content)
|
284 |
+
|
285 |
+
@router.get("/status")
|
286 |
+
async def get_status(token: str = Depends(verify_admin_token)):
|
287 |
+
"""Obtenir le statut du système"""
|
288 |
+
return vector_updater.get_update_status()
|
289 |
+
|
290 |
+
@router.post("/update-vectors")
|
291 |
+
async def update_vectors(background_tasks: BackgroundTasks, token: str = Depends(verify_admin_token)):
|
292 |
+
"""Déclencher une mise à jour si nécessaire"""
|
293 |
+
if vector_updater.is_updating:
|
294 |
+
return {"success": False, "message": "Une mise à jour est déjà en cours"}
|
295 |
+
|
296 |
+
# Lancer la mise à jour en arrière-plan
|
297 |
+
background_tasks.add_task(vector_updater.update_vectors_if_needed)
|
298 |
+
|
299 |
+
return {"success": True, "message": "Mise à jour programmée (vérification des conditions)"}
|
300 |
+
|
301 |
+
@router.post("/force-update-vectors")
|
302 |
+
async def force_update_vectors(background_tasks: BackgroundTasks, token: str = Depends(verify_admin_token)):
|
303 |
+
"""Forcer la mise à jour des vecteurs"""
|
304 |
+
if vector_updater.is_updating:
|
305 |
+
return {"success": False, "message": "Une mise à jour est déjà en cours"}
|
306 |
+
|
307 |
+
# Lancer la mise à jour forcée en arrière-plan
|
308 |
+
background_tasks.add_task(vector_updater.force_update_vectors)
|
309 |
+
|
310 |
+
return {"success": True, "message": "Mise à jour forcée programmée"}
|
311 |
+
|
312 |
+
@router.get("/logs")
|
313 |
+
async def get_logs(token: str = Depends(verify_admin_token)):
|
314 |
+
"""Obtenir les logs de mise à jour"""
|
315 |
+
try:
|
316 |
+
logs = vector_updater.get_logs()
|
317 |
+
return {"logs": logs}
|
318 |
+
except Exception as e:
|
319 |
+
return {"logs": [f"Erreur de lecture des logs: {e}"]}
|
app/services/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Services package for Karl Movie Vector Backend
|
3 |
+
"""
|
app/services/embedding_service.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import time
|
3 |
+
import logging
|
4 |
+
from typing import List, Optional, Dict
|
5 |
+
import os
|
6 |
+
from openai import OpenAI
|
7 |
+
|
8 |
+
logger = logging.getLogger(__name__)
|
9 |
+
|
10 |
+
class EmbeddingService:
|
11 |
+
"""Service pour générer des embeddings avec OpenAI"""
|
12 |
+
|
13 |
+
def __init__(self):
|
14 |
+
self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
15 |
+
self.model_name = "text-embedding-3-small"
|
16 |
+
|
17 |
+
def create_composite_text(self, movie_data: Dict) -> str:
|
18 |
+
"""Create composite text for embedding from movie data"""
|
19 |
+
parts = []
|
20 |
+
|
21 |
+
# Title
|
22 |
+
if movie_data.get('title'):
|
23 |
+
parts.append(f"Title: {movie_data['title']}")
|
24 |
+
|
25 |
+
# Tagline
|
26 |
+
if movie_data.get('tagline'):
|
27 |
+
parts.append(f"Tagline: {movie_data['tagline']}")
|
28 |
+
|
29 |
+
# Overview
|
30 |
+
if movie_data.get('overview'):
|
31 |
+
parts.append(f"Overview: {movie_data['overview']}")
|
32 |
+
|
33 |
+
# Release date
|
34 |
+
if movie_data.get('release_date'):
|
35 |
+
parts.append(f"Release Date: {movie_data['release_date']}")
|
36 |
+
|
37 |
+
# Original language
|
38 |
+
if movie_data.get('original_language'):
|
39 |
+
parts.append(f"Language: {movie_data['original_language']}")
|
40 |
+
|
41 |
+
# Spoken languages
|
42 |
+
if movie_data.get('spoken_languages'):
|
43 |
+
languages = [lang.get('iso_639_1', '') for lang in movie_data['spoken_languages'] if lang.get('iso_639_1')]
|
44 |
+
if languages:
|
45 |
+
parts.append(f"Spoken Languages: {', '.join(languages)}")
|
46 |
+
|
47 |
+
# Genres
|
48 |
+
if movie_data.get('genres'):
|
49 |
+
genres = [genre['name'] for genre in movie_data['genres']]
|
50 |
+
parts.append(f"Genres: {', '.join(genres)}")
|
51 |
+
|
52 |
+
# Production companies
|
53 |
+
if movie_data.get('production_companies'):
|
54 |
+
companies = [company['name'] for company in movie_data['production_companies']]
|
55 |
+
if companies:
|
56 |
+
parts.append(f"Production Companies: {', '.join(companies)}")
|
57 |
+
|
58 |
+
# Production countries
|
59 |
+
if movie_data.get('production_countries'):
|
60 |
+
countries = [country['name'] for country in movie_data['production_countries']]
|
61 |
+
if countries:
|
62 |
+
parts.append(f"Production Countries: {', '.join(countries)}")
|
63 |
+
|
64 |
+
# Budget (only if > 0)
|
65 |
+
if movie_data.get('budget') and movie_data['budget'] > 0:
|
66 |
+
parts.append(f"Budget: ${movie_data['budget']:,}")
|
67 |
+
|
68 |
+
# Popularity
|
69 |
+
if movie_data.get('popularity'):
|
70 |
+
parts.append(f"Popularity: {movie_data['popularity']}")
|
71 |
+
|
72 |
+
# Vote average
|
73 |
+
if movie_data.get('vote_average'):
|
74 |
+
parts.append(f"Vote Average: {movie_data['vote_average']}")
|
75 |
+
|
76 |
+
# Vote count
|
77 |
+
if movie_data.get('vote_count'):
|
78 |
+
parts.append(f"Vote Count: {movie_data['vote_count']}")
|
79 |
+
|
80 |
+
# Director(s)
|
81 |
+
if movie_data.get('credits', {}).get('crew'):
|
82 |
+
directors = [person['name'] for person in movie_data['credits']['crew'] if person['job'] == 'Director']
|
83 |
+
if directors:
|
84 |
+
parts.append(f"Director: {', '.join(directors)}")
|
85 |
+
|
86 |
+
# Top 5 cast
|
87 |
+
if movie_data.get('credits', {}).get('cast'):
|
88 |
+
top_cast = [person['name'] for person in movie_data['credits']['cast'][:5]]
|
89 |
+
if top_cast:
|
90 |
+
parts.append(f"Cast: {', '.join(top_cast)}")
|
91 |
+
|
92 |
+
return " / ".join(parts)
|
93 |
+
|
94 |
+
def get_embeddings_batch(self, texts: List[str], max_retries: int = 3) -> Optional[List[List[float]]]:
|
95 |
+
"""Get embeddings for a batch of texts with retry"""
|
96 |
+
for attempt in range(max_retries):
|
97 |
+
try:
|
98 |
+
response = self.client.embeddings.create(
|
99 |
+
model=self.model_name,
|
100 |
+
input=texts
|
101 |
+
)
|
102 |
+
return [embedding.embedding for embedding in response.data]
|
103 |
+
except Exception as e:
|
104 |
+
logger.error(f"OpenAI API error (attempt {attempt + 1}): {e}")
|
105 |
+
if attempt < max_retries - 1:
|
106 |
+
time.sleep(2 ** attempt)
|
107 |
+
return None
|
108 |
+
|
109 |
+
async def generate_batch_embeddings(self, movies: List[Dict], batch_size: int = 100) -> Optional[List[List[float]]]:
|
110 |
+
"""Generate embeddings for a batch of movies"""
|
111 |
+
try:
|
112 |
+
# Create composite texts
|
113 |
+
texts = []
|
114 |
+
for movie in movies:
|
115 |
+
composite_text = self.create_composite_text(movie)
|
116 |
+
texts.append(composite_text)
|
117 |
+
|
118 |
+
# Generate embeddings in smaller batches to avoid API limits
|
119 |
+
all_embeddings = []
|
120 |
+
|
121 |
+
for i in range(0, len(texts), batch_size):
|
122 |
+
batch_texts = texts[i:i + batch_size]
|
123 |
+
logger.debug(f"Generating embeddings for batch {i//batch_size + 1}")
|
124 |
+
|
125 |
+
batch_embeddings = self.get_embeddings_batch(batch_texts)
|
126 |
+
if batch_embeddings is None:
|
127 |
+
logger.error(f"Failed to generate embeddings for batch starting at {i}")
|
128 |
+
return None
|
129 |
+
|
130 |
+
all_embeddings.extend(batch_embeddings)
|
131 |
+
|
132 |
+
# Small delay between batches to respect rate limits
|
133 |
+
await asyncio.sleep(0.1)
|
134 |
+
|
135 |
+
return all_embeddings
|
136 |
+
|
137 |
+
except Exception as e:
|
138 |
+
logger.error(f"Error generating batch embeddings: {e}")
|
139 |
+
return None
|
app/services/tmdb_service.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import time
|
3 |
+
import logging
|
4 |
+
from typing import List, Optional, Dict
|
5 |
+
import os
|
6 |
+
|
7 |
+
logger = logging.getLogger(__name__)
|
8 |
+
|
9 |
+
class TMDBService:
|
10 |
+
"""Service pour interagir avec l'API TMDB"""
|
11 |
+
|
12 |
+
def __init__(self):
|
13 |
+
self.api_key = os.getenv('TMDB_API_KEY')
|
14 |
+
self.base_url = "https://api.themoviedb.org/3"
|
15 |
+
|
16 |
+
def _make_request(self, endpoint: str, params: dict = None, max_retries: int = 3) -> Optional[dict]:
|
17 |
+
"""Make API request with retry and backoff"""
|
18 |
+
if params is None:
|
19 |
+
params = {}
|
20 |
+
params['api_key'] = self.api_key
|
21 |
+
|
22 |
+
for attempt in range(max_retries):
|
23 |
+
try:
|
24 |
+
response = requests.get(f"{self.base_url}{endpoint}", params=params, timeout=10)
|
25 |
+
if response.status_code == 200:
|
26 |
+
return response.json()
|
27 |
+
elif response.status_code == 429:
|
28 |
+
# Rate limited, wait longer
|
29 |
+
wait_time = 2 ** attempt
|
30 |
+
logger.warning(f"Rate limited, waiting {wait_time}s before retry {attempt + 1}")
|
31 |
+
time.sleep(wait_time)
|
32 |
+
continue
|
33 |
+
else:
|
34 |
+
logger.error(f"TMDB API returned status {response.status_code}")
|
35 |
+
return None
|
36 |
+
except Exception as e:
|
37 |
+
logger.error(f"Request failed (attempt {attempt + 1}): {e}")
|
38 |
+
if attempt < max_retries - 1:
|
39 |
+
time.sleep(2 ** attempt)
|
40 |
+
return None
|
41 |
+
|
42 |
+
async def get_popular_movies(self, limit: int = 10000) -> List[Dict]:
|
43 |
+
"""Get popular movies from TMDB"""
|
44 |
+
movies = []
|
45 |
+
page = 1
|
46 |
+
filter_adult = os.getenv('FILTER_ADULT_CONTENT', 'true').lower() == 'true'
|
47 |
+
|
48 |
+
while len(movies) < limit:
|
49 |
+
logger.info(f"Fetching popular movies page {page}")
|
50 |
+
|
51 |
+
data = self._make_request("/movie/popular", {"page": page})
|
52 |
+
if not data or not data.get('results'):
|
53 |
+
break
|
54 |
+
|
55 |
+
for movie in data.get('results', []):
|
56 |
+
if len(movies) >= limit:
|
57 |
+
break
|
58 |
+
|
59 |
+
# Skip adult movies if filtering is enabled
|
60 |
+
if filter_adult and movie.get('adult', False):
|
61 |
+
continue
|
62 |
+
|
63 |
+
# Get detailed movie data
|
64 |
+
movie_details = self.get_movie_details(movie['id'])
|
65 |
+
if movie_details:
|
66 |
+
# Get credits
|
67 |
+
credits = self.get_movie_credits(movie['id'])
|
68 |
+
if credits:
|
69 |
+
movie_details['credits'] = credits
|
70 |
+
movies.append(movie_details)
|
71 |
+
|
72 |
+
# Check if we've reached the last page
|
73 |
+
if page >= data.get('total_pages', 0):
|
74 |
+
break
|
75 |
+
|
76 |
+
page += 1
|
77 |
+
time.sleep(0.25) # Rate limiting
|
78 |
+
|
79 |
+
logger.info(f"Collected {len(movies)} movies from TMDB")
|
80 |
+
return movies[:limit]
|
81 |
+
|
82 |
+
def get_movie_details(self, movie_id: int) -> Optional[dict]:
|
83 |
+
"""Get detailed movie information"""
|
84 |
+
return self._make_request(f"/movie/{movie_id}")
|
85 |
+
|
86 |
+
def get_movie_credits(self, movie_id: int) -> Optional[dict]:
|
87 |
+
"""Get movie cast and crew"""
|
88 |
+
return self._make_request(f"/movie/{movie_id}/credits")
|
app/services/vector_storage.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import Dataset, load_dataset
|
2 |
+
from huggingface_hub import HfApi, create_repo
|
3 |
+
import numpy as np
|
4 |
+
import json
|
5 |
+
import logging
|
6 |
+
from typing import Dict, List, Tuple, Optional
|
7 |
+
import os
|
8 |
+
from datetime import datetime
|
9 |
+
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
class HFVectorStorage:
|
13 |
+
def __init__(self):
|
14 |
+
self.hf_token = os.getenv('HF_TOKEN')
|
15 |
+
self.repo_name = os.getenv('HF_DATASET_REPO')
|
16 |
+
self.api = HfApi(token=self.hf_token)
|
17 |
+
|
18 |
+
# Créer le repo s'il n'existe pas
|
19 |
+
if self.hf_token and self.repo_name:
|
20 |
+
try:
|
21 |
+
create_repo(
|
22 |
+
repo_id=self.repo_name,
|
23 |
+
repo_type="dataset",
|
24 |
+
token=self.hf_token,
|
25 |
+
private=True,
|
26 |
+
exist_ok=True
|
27 |
+
)
|
28 |
+
except Exception as e:
|
29 |
+
logger.warning(f"Repo creation warning: {e}")
|
30 |
+
|
31 |
+
def save_vectors(self, embeddings: np.ndarray, movies_data: List[Dict],
|
32 |
+
id_map: Dict, metadata: Dict) -> bool:
|
33 |
+
"""Sauvegarde les vecteurs sur HF Dataset Hub"""
|
34 |
+
try:
|
35 |
+
if not self.hf_token or not self.repo_name:
|
36 |
+
logger.error("HF_TOKEN or HF_DATASET_REPO not configured")
|
37 |
+
return False
|
38 |
+
|
39 |
+
# Préparer les données pour le dataset
|
40 |
+
dataset_dict = {
|
41 |
+
'movie_id': [movie['id'] for movie in movies_data],
|
42 |
+
'title': [movie['title'] for movie in movies_data],
|
43 |
+
'overview': [movie.get('overview', '') for movie in movies_data],
|
44 |
+
'genres': [movie.get('genres', []) for movie in movies_data],
|
45 |
+
'release_date': [movie.get('release_date', '') for movie in movies_data],
|
46 |
+
'embedding': embeddings.tolist(),
|
47 |
+
'tmdb_data': [json.dumps(movie) for movie in movies_data]
|
48 |
+
}
|
49 |
+
|
50 |
+
# Créer le dataset
|
51 |
+
dataset = Dataset.from_dict(dataset_dict)
|
52 |
+
|
53 |
+
# Upload vers HF Hub
|
54 |
+
dataset.push_to_hub(
|
55 |
+
self.repo_name,
|
56 |
+
token=self.hf_token,
|
57 |
+
commit_message=f"Update vectors - {datetime.now().isoformat()}"
|
58 |
+
)
|
59 |
+
|
60 |
+
# Sauvegarder les métadonnées
|
61 |
+
metadata_with_timestamp = {
|
62 |
+
**metadata,
|
63 |
+
'last_updated': datetime.now().isoformat(),
|
64 |
+
'total_movies': len(movies_data)
|
65 |
+
}
|
66 |
+
|
67 |
+
with open('temp_metadata.json', 'w') as f:
|
68 |
+
json.dump(metadata_with_timestamp, f, indent=2)
|
69 |
+
|
70 |
+
self.api.upload_file(
|
71 |
+
path_or_fileobj='temp_metadata.json',
|
72 |
+
path_in_repo='metadata.json',
|
73 |
+
repo_id=self.repo_name,
|
74 |
+
repo_type='dataset',
|
75 |
+
token=self.hf_token,
|
76 |
+
commit_message=f"Update metadata - {datetime.now().isoformat()}"
|
77 |
+
)
|
78 |
+
|
79 |
+
# Nettoyer le fichier temporaire
|
80 |
+
if os.path.exists('temp_metadata.json'):
|
81 |
+
os.remove('temp_metadata.json')
|
82 |
+
|
83 |
+
logger.info(f"Successfully saved {len(movies_data)} movie vectors to HF Hub")
|
84 |
+
return True
|
85 |
+
|
86 |
+
except Exception as e:
|
87 |
+
logger.error(f"Error saving vectors to HF Hub: {e}")
|
88 |
+
return False
|
89 |
+
|
90 |
+
def load_vectors(self) -> Optional[Tuple[np.ndarray, List[Dict], Dict, Dict]]:
    """Load movie vectors, raw movie data, an id->row map and metadata
    from the configured Hugging Face dataset repo.

    Returns:
        Tuple ``(embeddings, movies_data, id_map, metadata)`` on success:
        ``embeddings`` is the stacked embedding matrix, ``movies_data`` is
        the list of raw TMDB movie dicts (decoded from the ``tmdb_data``
        JSON column), ``id_map`` maps TMDB movie id -> row index, and
        ``metadata`` is the repo's ``metadata.json`` content (or a stub
        with ``last_updated=None`` when that file is missing).
        Returns ``None`` when HF credentials are not configured or the
        dataset cannot be loaded.
    """
    try:
        if not self.hf_token or not self.repo_name:
            logger.error("HF_TOKEN or HF_DATASET_REPO not configured")
            return None

        # Load the dataset; all rows live in the default 'train' split.
        dataset = load_dataset(self.repo_name, token=self.hf_token)['train']

        # Extract the embedding matrix.
        embeddings = np.array(dataset['embedding'])

        # Rebuild the full movie dicts and the id -> row-index map from
        # the serialized TMDB payloads.
        movies_data = []
        id_map = {}
        for i, movie_id in enumerate(dataset['movie_id']):
            movie_data = json.loads(dataset['tmdb_data'][i])
            movies_data.append(movie_data)
            id_map[movie_id] = i

        # Best-effort: load metadata.json, falling back to a stub so a
        # repo without metadata is still usable.  This was a bare
        # ``except:``; narrowed so KeyboardInterrupt/SystemExit are not
        # swallowed, and the failure cause is logged instead of hidden.
        try:
            metadata_file = self.api.hf_hub_download(
                repo_id=self.repo_name,
                filename='metadata.json',
                repo_type='dataset',
                token=self.hf_token
            )
            with open(metadata_file, 'r') as f:
                metadata = json.load(f)
        except Exception as meta_err:
            logger.warning(f"Could not load metadata.json, using stub: {meta_err}")
            metadata = {'last_updated': None}

        logger.info(f"Successfully loaded {len(movies_data)} movie vectors from HF Hub")
        return embeddings, movies_data, id_map, metadata

    except Exception as e:
        logger.error(f"Error loading vectors from HF Hub: {e}")
        return None
|
130 |
+
|
131 |
+
def check_update_needed(self) -> bool:
    """Return True when the stored vectors are stale and should be rebuilt.

    "Stale" means any of: the dataset cannot be loaded at all, no
    'last_updated' timestamp is recorded in its metadata, or that
    timestamp is older than UPDATE_INTERVAL_HOURS (default 24).
    On any unexpected error we err on the side of rebuilding and
    return True.
    """
    try:
        interval_hours = int(os.getenv('UPDATE_INTERVAL_HOURS', 24))

        # Pull the current dataset; an unreadable/missing dataset
        # definitely needs a (re)build.
        loaded = self.load_vectors()
        if not loaded:
            return True

        metadata = loaded[3]
        last_updated = metadata.get('last_updated')
        if not last_updated:
            return True

        age = datetime.now() - datetime.fromisoformat(last_updated)
        return age.total_seconds() / 3600 >= interval_hours

    except Exception as e:
        logger.error(f"Error checking update status: {e}")
        return True
|
app/services/vector_updater.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import logging
|
3 |
+
from datetime import datetime
|
4 |
+
from typing import Optional, List
|
5 |
+
import os
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
from .vector_storage import HFVectorStorage
|
9 |
+
from .tmdb_service import TMDBService
|
10 |
+
from .embedding_service import EmbeddingService
|
11 |
+
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
+
|
14 |
+
class VectorUpdater:
    """Orchestrates rebuilding movie embeddings from TMDB and persisting
    them to the Hugging Face Hub via HFVectorStorage.

    Keeps an in-memory ring buffer of log lines (max 100) and the result
    of the last run so the admin API can report progress and status.
    """

    def __init__(self):
        self.storage = HFVectorStorage()
        self.tmdb_service = TMDBService()
        self.embedding_service = EmbeddingService()
        self.is_updating = False        # guards against concurrent updates
        self.last_update_result = None  # dict describing the last run, or None
        self.update_logs = []           # newest-last log lines, capped at 100

    def add_log(self, message: str, level: str = "INFO"):
        """Record a timestamped log line and mirror it to the std logger.

        Args:
            message: Human-readable log text.
            level: Logging level name ("INFO", "WARNING", "ERROR", ...);
                unknown names fall back to INFO for the mirrored record.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.update_logs.append(f"[{timestamp}] {level}: {message}")

        # Keep only the 100 most recent entries.
        if len(self.update_logs) > 100:
            self.update_logs = self.update_logs[-100:]

        # Mirror into the standard logging system.
        log_level = getattr(logging, level.upper(), logging.INFO)
        logger.log(log_level, message)

    async def update_vectors_if_needed(self) -> bool:
        """Run an update when auto-update is enabled and vectors are stale.

        Returns:
            True when an update ran and succeeded, False otherwise
            (already updating, auto-update disabled, vectors fresh,
            or the update failed).
        """
        if self.is_updating:
            self.add_log("Update already in progress, skipping...", "WARNING")
            return False

        # Check the cheap env flag BEFORE the expensive staleness check:
        # check_update_needed() loads the whole dataset from the Hub, which
        # is pointless work when auto-update is disabled anyway.
        if not os.getenv('AUTO_UPDATE_VECTORS', 'false').lower() == 'true':
            self.add_log("Auto update disabled", "INFO")
            return False

        if not self.storage.check_update_needed():
            self.add_log("Vectors are up to date, no update needed", "INFO")
            return False

        self.add_log("Auto update conditions met, starting update...", "INFO")
        return await self.force_update_vectors()

    async def force_update_vectors(self) -> bool:
        """Fetch movies from TMDB, embed them in batches, save to the Hub.

        Configuration comes from env vars BATCH_SIZE (default 100) and
        MAX_MOVIES_LIMIT (default 10000).  A failed batch is logged and
        skipped; the run fails only when nothing could be embedded or the
        final save fails.  The outcome is stored in last_update_result.

        Returns:
            True on success, False otherwise.
        """
        if self.is_updating:
            self.add_log("Update already in progress", "WARNING")
            return False

        self.is_updating = True
        self.add_log("Starting vector update process...", "INFO")

        try:
            # Runtime configuration.
            batch_size = int(os.getenv('BATCH_SIZE', 100))
            max_movies = int(os.getenv('MAX_MOVIES_LIMIT', 10000))

            # Fetch the candidate movies from TMDB.
            self.add_log("Fetching movies from TMDB...", "INFO")
            movies = await self.tmdb_service.get_popular_movies(limit=max_movies)

            if not movies:
                self.add_log("No movies fetched from TMDB", "ERROR")
                self.last_update_result = {"success": False, "error": "No movies fetched"}
                return False

            self.add_log(f"Processing {len(movies)} movies in batches of {batch_size}", "INFO")

            all_embeddings = []
            processed_movies = []
            id_map = {}

            # Process in batches to avoid timeouts.
            for i in range(0, len(movies), batch_size):
                batch = movies[i:i + batch_size]
                batch_num = i // batch_size + 1
                total_batches = (len(movies) - 1) // batch_size + 1

                self.add_log(f"Processing batch {batch_num}/{total_batches}", "INFO")

                # Generate embeddings for this batch.
                batch_embeddings = await self.embedding_service.generate_batch_embeddings(batch)

                if batch_embeddings is not None:
                    all_embeddings.extend(batch_embeddings)
                    # Only successfully embedded movies enter the index,
                    # so embeddings and movie rows stay aligned.
                    for movie in batch:
                        processed_movies.append(movie)
                        id_map[movie['id']] = len(processed_movies) - 1
                else:
                    self.add_log(f"Failed to generate embeddings for batch {batch_num}", "WARNING")

                # Pause between batches to avoid rate limiting.
                await asyncio.sleep(1)

            if not all_embeddings:
                self.add_log("No embeddings generated", "ERROR")
                self.last_update_result = {"success": False, "error": "No embeddings generated"}
                return False

            embeddings_array = np.array(all_embeddings)
            self.add_log(f"Generated {len(all_embeddings)} embeddings", "INFO")

            # Describe this run for the dataset's metadata.json.
            metadata = {
                'update_timestamp': datetime.now().isoformat(),
                'total_movies': len(processed_movies),
                'embedding_model': getattr(self.embedding_service, 'model_name', 'unknown'),
                'tmdb_api_version': '3',
                'batch_size': batch_size,
                'max_movies_limit': max_movies
            }

            self.add_log("Saving vectors to HF Hub...", "INFO")
            success = self.storage.save_vectors(
                embeddings_array,
                processed_movies,
                id_map,
                metadata
            )

            if success:
                self.add_log(f"Successfully updated {len(processed_movies)} movie vectors", "INFO")
                self.last_update_result = {
                    "success": True,
                    "count": len(processed_movies),
                    "timestamp": datetime.now().isoformat()
                }
                return True
            else:
                self.add_log("Failed to save vectors to storage", "ERROR")
                self.last_update_result = {"success": False, "error": "Failed to save to storage"}
                return False

        except Exception as e:
            self.add_log(f"Error during vector update: {e}", "ERROR")
            self.last_update_result = {"success": False, "error": str(e)}
            return False

        finally:
            # Always release the lock, even on failure.
            self.is_updating = False
            self.add_log("Vector update process completed", "INFO")

    def get_update_status(self) -> dict:
        """Return a snapshot of updater state and configuration for the admin API."""
        return {
            'is_updating': self.is_updating,
            'auto_update_enabled': os.getenv('AUTO_UPDATE_VECTORS', 'false').lower() == 'true',
            'update_interval_hours': int(os.getenv('UPDATE_INTERVAL_HOURS', 24)),
            'batch_size': int(os.getenv('BATCH_SIZE', 100)),
            'max_movies_limit': int(os.getenv('MAX_MOVIES_LIMIT', 10000)),
            'last_update_result': self.last_update_result,
            'logs_count': len(self.update_logs),
            'hf_configured': bool(os.getenv('HF_TOKEN') and os.getenv('HF_DATASET_REPO'))
        }

    def get_logs(self) -> List[str]:
        """Return a copy of the recorded log lines (oldest first)."""
        return self.update_logs.copy()
|
requirements.txt
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
fastapi==0.104.1
|
2 |
-
uvicorn
|
3 |
-
|
|
|
4 |
faiss-cpu==1.7.4
|
5 |
-
openai==1.
|
6 |
-
pydantic==2.11.5
|
7 |
-
pydantic-settings==2.9.1
|
8 |
-
python-multipart==0.0.6
|
9 |
-
requests==2.31.0
|
10 |
-
scikit-learn==1.3.2
|
11 |
python-dotenv==1.0.0
|
12 |
-
|
|
|
|
|
|
1 |
fastapi==0.104.1
|
2 |
+
uvicorn==0.24.0
|
3 |
+
pydantic==2.5.0
|
4 |
+
numpy==1.24.3
|
5 |
faiss-cpu==1.7.4
|
6 |
+
openai==1.3.5
|
|
|
|
|
|
|
|
|
|
|
7 |
python-dotenv==1.0.0
|
8 |
+
requests==2.31.0
|
9 |
+
datasets>=2.14.0
|
10 |
+
huggingface_hub>=0.17.0
|