yonnel committed
Commit 14e32e0 · 1 Parent(s): 0236fb6

Add admin router and vector update functionality; enhance environment configuration

.env.example CHANGED
@@ -14,4 +14,17 @@ ENV=dev
 LOG_LEVEL=INFO
 
 # Remove adult content from TMDB results
-FILTER_ADULT_CONTENT=true # Set to true to filter out adult content
+FILTER_ADULT_CONTENT=true # Set to true to filter out adult content
+
+# Hugging Face configuration
+HF_TOKEN=your_hf_token_here
+HF_DATASET_REPO=your-username/karl-movie-vectors
+
+# Vector update configuration
+AUTO_UPDATE_VECTORS=true
+UPDATE_INTERVAL_HOURS=24
+BATCH_SIZE=100
+MAX_MOVIES_LIMIT=10000
+
+# Admin configuration
+ADMIN_TOKEN=your_admin_token_here
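Note: the new variables are read directly from the environment by the services added below (VectorUpdater, HFVectorStorage, the admin router). A minimal sketch of that pattern, using the same names and defaults that appear later in this commit:

# Sketch (not part of the commit): how these settings are consumed at runtime.
import os

auto_update = os.getenv('AUTO_UPDATE_VECTORS', 'false').lower() == 'true'
update_interval_hours = int(os.getenv('UPDATE_INTERVAL_HOURS', 24))
batch_size = int(os.getenv('BATCH_SIZE', 100))
max_movies = int(os.getenv('MAX_MOVIES_LIMIT', 10000))
hf_configured = bool(os.getenv('HF_TOKEN') and os.getenv('HF_DATASET_REPO'))
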
app/main.py CHANGED
@@ -187,6 +187,15 @@ def compute_barycenter(liked_indices: List[int], coords: np.ndarray):
 # FastAPI app setup
 app = FastAPI(title="Karl-Movie Vector Backend", version="1.0.0")
 
+# Add the admin router import
+try:
+    from .routers import admin
+except ImportError:
+    from app.routers import admin
+
+# Register the admin router
+app.include_router(admin.router)
+
 # CORS configuration
 DEV_ORIGINS = [
     "http://localhost:5173",
@@ -210,7 +219,20 @@ app.add_middleware(
 @app.on_event("startup")
 async def startup_event():
     """Load data on startup"""
-    load_data()
+    global vectors, id_map, faiss_index, movie_metadata
+    vectors, id_map, faiss_index, movie_metadata = load_data()
+
+    # Check and update the vectors if needed at startup
+    if os.getenv('AUTO_UPDATE_VECTORS', 'false').lower() == 'true':
+        # Launch in the background without awaiting
+        import asyncio
+        try:
+            from .services.vector_updater import VectorUpdater
+        except ImportError:
+            from app.services.vector_updater import VectorUpdater
+
+        vector_updater = VectorUpdater()
+        asyncio.create_task(vector_updater.update_vectors_if_needed())
 
 @app.get("/health")
 async def health_check():
app/routers/__init__.py ADDED
@@ -0,0 +1,3 @@
+"""
+Routers package for Karl Movie Vector Backend
+"""
app/routers/admin.py ADDED
@@ -0,0 +1,319 @@
+from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
+from fastapi.responses import HTMLResponse
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+import os
+import logging
+from datetime import datetime
+from typing import Optional
+
+# Import with error handling for relative imports
+try:
+    from ..services.vector_updater import VectorUpdater
+except ImportError:
+    from app.services.vector_updater import VectorUpdater
+
+logger = logging.getLogger(__name__)
+router = APIRouter(prefix="/admin", tags=["admin"])
+security = HTTPBearer()
+
+# Global updater instance
+vector_updater = VectorUpdater()
+
+def verify_admin_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """Verify the admin token"""
+    admin_token = os.getenv('ADMIN_TOKEN')
+    if not admin_token or credentials.credentials != admin_token:
+        raise HTTPException(status_code=403, detail="Invalid admin token")
+    return credentials.credentials
+
+@router.get("/", response_class=HTMLResponse)
+async def admin_dashboard():
+    """Web administration interface"""
+    html_content = """
+    <!DOCTYPE html>
+    <html lang="fr">
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>Karl Movie Vector - Admin</title>
+        <style>
+            body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; }
+            .container { max-width: 1000px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
+            h1 { color: #333; text-align: center; margin-bottom: 30px; }
+            .card { background: #f8f9fa; padding: 20px; margin: 20px 0; border-radius: 5px; border-left: 4px solid #007bff; }
+            .status { padding: 10px; margin: 10px 0; border-radius: 4px; }
+            .status.success { background: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
+            .status.error { background: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; }
+            .status.warning { background: #fff3cd; color: #856404; border: 1px solid #ffeaa7; }
+            .status.info { background: #d1ecf1; color: #0c5460; border: 1px solid #bee5eb; }
+            button { background: #007bff; color: white; border: none; padding: 12px 24px; border-radius: 4px; cursor: pointer; margin: 5px; font-size: 14px; }
+            button:hover { background: #0056b3; }
+            button:disabled { background: #6c757d; cursor: not-allowed; }
+            .danger { background: #dc3545; }
+            .danger:hover { background: #c82333; }
+            .loading { display: none; color: #007bff; margin: 10px 0; }
+            .log { background: #f8f9fa; border: 1px solid #dee2e6; padding: 15px; margin: 15px 0; border-radius: 4px; font-family: monospace; font-size: 12px; max-height: 400px; overflow-y: auto; white-space: pre-line; }
+            .config-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; }
+            .config-item { background: white; padding: 10px; border-radius: 4px; border: 1px solid #dee2e6; }
+        </style>
+    </head>
+    <body>
+        <div class="container">
+            <h1>🎬 Karl Movie Vector - Administration</h1>
+
+            <div class="card">
+                <h3>📊 Statut du Système</h3>
+                <div id="status-info"></div>
+                <button onclick="loadStatus()">🔄 Actualiser le Statut</button>
+            </div>
+
+            <div class="card">
+                <h3>🚀 Mise à jour des Vecteurs</h3>
+                <p>Déclenchez manuellement la mise à jour des vecteurs de films depuis TMDB.</p>
+                <div>
+                    <button onclick="updateVectors()" id="updateBtn">🔄 Mettre à jour (Conditionnel)</button>
+                    <button onclick="forceUpdateVectors()" id="forceUpdateBtn" class="danger">⚡ Forcer la Mise à jour</button>
+                </div>
+                <div id="update-loading" class="loading">⏳ Mise à jour en cours...</div>
+                <div id="update-result"></div>
+            </div>
+
+            <div class="card">
+                <h3>📋 Logs de Mise à jour</h3>
+                <button onclick="loadLogs()">📄 Charger les Logs</button>
+                <button onclick="clearLogs()">🗑️ Effacer l'affichage</button>
+                <div id="logs" class="log">Cliquez sur "Charger les Logs" pour voir les logs...</div>
+            </div>
+        </div>
+
+        <script>
+            const API_BASE = '/admin';
+            let ADMIN_TOKEN = localStorage.getItem('admin_token');
+
+            if (!ADMIN_TOKEN) {
+                ADMIN_TOKEN = prompt('Token Admin:');
+                if (ADMIN_TOKEN) {
+                    localStorage.setItem('admin_token', ADMIN_TOKEN);
+                }
+            }
+
+            const headers = {
+                'Authorization': `Bearer ${ADMIN_TOKEN}`,
+                'Content-Type': 'application/json'
+            };
+
+            async function apiCall(endpoint, method = 'GET') {
+                try {
+                    const response = await fetch(`${API_BASE}${endpoint}`, {
+                        method,
+                        headers
+                    });
+
+                    if (response.status === 403) {
+                        localStorage.removeItem('admin_token');
+                        location.reload();
+                        return;
+                    }
+
+                    if (!response.ok) {
+                        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+                    }
+
+                    return await response.json();
+                } catch (error) {
+                    console.error('API Error:', error);
+                    throw error;
+                }
+            }
+
+            async function loadStatus() {
+                try {
+                    const status = await apiCall('/status');
+                    const statusDiv = document.getElementById('status-info');
+
+                    let statusClass = 'info';
+                    if (status.is_updating) statusClass = 'warning';
+                    else if (!status.hf_configured) statusClass = 'error';
+                    else if (status.last_update_result && status.last_update_result.success) statusClass = 'success';
+
+                    let lastUpdateInfo = 'Aucune';
+                    if (status.last_update_result) {
+                        const result = status.last_update_result;
+                        if (result.success) {
+                            lastUpdateInfo = `✅ ${result.count} films (${new Date(result.timestamp).toLocaleString()})`;
+                        } else {
+                            lastUpdateInfo = `❌ Erreur: ${result.error}`;
+                        }
+                    }
+
+                    statusDiv.innerHTML = `
+                        <div class="status ${statusClass}">
+                            <div class="config-grid">
+                                <div class="config-item">
+                                    <strong>Statut:</strong> ${status.is_updating ? '🔄 Mise à jour en cours...' : '✅ Prêt'}
+                                </div>
+                                <div class="config-item">
+                                    <strong>HF Configuré:</strong> ${status.hf_configured ? '✅ Oui' : '❌ Non'}
+                                </div>
+                                <div class="config-item">
+                                    <strong>Auto-update:</strong> ${status.auto_update_enabled ? '✅ Activé' : '❌ Désactivé'}
+                                </div>
+                                <div class="config-item">
+                                    <strong>Intervalle:</strong> ${status.update_interval_hours}h
+                                </div>
+                                <div class="config-item">
+                                    <strong>Taille batch:</strong> ${status.batch_size}
+                                </div>
+                                <div class="config-item">
+                                    <strong>Limite films:</strong> ${status.max_movies_limit}
+                                </div>
+                                <div class="config-item">
+                                    <strong>Logs:</strong> ${status.logs_count} entrées
+                                </div>
+                                <div class="config-item">
+                                    <strong>Dernière MAJ:</strong> ${lastUpdateInfo}
+                                </div>
+                            </div>
+                        </div>
+                    `;
+                } catch (error) {
+                    document.getElementById('status-info').innerHTML = `
+                        <div class="status error">❌ Erreur: ${error.message}</div>
+                    `;
+                }
+            }
+
+            async function updateVectors() {
+                const btn = document.getElementById('updateBtn');
+                const loading = document.getElementById('update-loading');
+                const result = document.getElementById('update-result');
+
+                btn.disabled = true;
+                loading.style.display = 'block';
+                result.innerHTML = '';
+
+                try {
+                    const response = await apiCall('/update-vectors', 'POST');
+                    result.innerHTML = `
+                        <div class="status ${response.success ? 'success' : 'warning'}">
+                            ${response.success ? '✅' : '⚠️'} ${response.message}
+                        </div>
+                    `;
+
+                    // Refresh the status after a couple of seconds
+                    if (response.success) {
+                        setTimeout(loadStatus, 2000);
+                    }
+                } catch (error) {
+                    result.innerHTML = `
+                        <div class="status error">❌ Erreur: ${error.message}</div>
+                    `;
+                } finally {
+                    btn.disabled = false;
+                    loading.style.display = 'none';
+                }
+            }
+
+            async function forceUpdateVectors() {
+                if (!confirm('Êtes-vous sûr de vouloir forcer la mise à jour ? Cela peut prendre plusieurs minutes et consommer des crédits API.')) {
+                    return;
+                }
+
+                const btn = document.getElementById('forceUpdateBtn');
+                const loading = document.getElementById('update-loading');
+                const result = document.getElementById('update-result');
+
+                btn.disabled = true;
+                loading.style.display = 'block';
+                result.innerHTML = '';
+
+                try {
+                    const response = await apiCall('/force-update-vectors', 'POST');
+                    result.innerHTML = `
+                        <div class="status ${response.success ? 'success' : 'error'}">
+                            ${response.success ? '✅' : '❌'} ${response.message}
+                        </div>
+                    `;
+
+                    // Refresh the status after a couple of seconds
+                    if (response.success) {
+                        setTimeout(loadStatus, 2000);
+                    }
+                } catch (error) {
+                    result.innerHTML = `
+                        <div class="status error">❌ Erreur: ${error.message}</div>
+                    `;
+                } finally {
+                    btn.disabled = false;
+                    loading.style.display = 'none';
+                }
+            }
+
+            async function loadLogs() {
+                try {
+                    const response = await apiCall('/logs');
+                    const logsDiv = document.getElementById('logs');
+
+                    if (response.logs && response.logs.length > 0) {
+                        logsDiv.innerHTML = response.logs.join('\\n');
+                        logsDiv.scrollTop = logsDiv.scrollHeight;
+                    } else {
+                        logsDiv.innerHTML = 'Aucun log disponible';
+                    }
+                } catch (error) {
+                    document.getElementById('logs').innerHTML = `Erreur: ${error.message}`;
+                }
+            }
+
+            function clearLogs() {
+                document.getElementById('logs').innerHTML = 'Logs effacés (rechargez pour voir les nouveaux logs)';
+            }
+
+            // Load the status on startup
+            if (ADMIN_TOKEN) {
+                loadStatus();
+
+                // Auto-refresh the status every 30 seconds
+                setInterval(loadStatus, 30000);
+            }
+        </script>
+    </body>
+    </html>
+    """
+    return HTMLResponse(content=html_content)
+
+@router.get("/status")
+async def get_status(token: str = Depends(verify_admin_token)):
+    """Get the system status"""
+    return vector_updater.get_update_status()
+
+@router.post("/update-vectors")
+async def update_vectors(background_tasks: BackgroundTasks, token: str = Depends(verify_admin_token)):
+    """Trigger an update if needed"""
+    if vector_updater.is_updating:
+        return {"success": False, "message": "Une mise à jour est déjà en cours"}
+
+    # Run the update in the background
+    background_tasks.add_task(vector_updater.update_vectors_if_needed)
+
+    return {"success": True, "message": "Mise à jour programmée (vérification des conditions)"}
+
+@router.post("/force-update-vectors")
+async def force_update_vectors(background_tasks: BackgroundTasks, token: str = Depends(verify_admin_token)):
+    """Force a vector update"""
+    if vector_updater.is_updating:
+        return {"success": False, "message": "Une mise à jour est déjà en cours"}
+
+    # Run the forced update in the background
+    background_tasks.add_task(vector_updater.force_update_vectors)
+
+    return {"success": True, "message": "Mise à jour forcée programmée"}
+
+@router.get("/logs")
+async def get_logs(token: str = Depends(verify_admin_token)):
+    """Get the update logs"""
+    try:
+        logs = vector_updater.get_logs()
+        return {"logs": logs}
+    except Exception as e:
+        return {"logs": [f"Erreur de lecture des logs: {e}"]}
app/services/__init__.py ADDED
@@ -0,0 +1,3 @@
+"""
+Services package for Karl Movie Vector Backend
+"""
app/services/embedding_service.py ADDED
@@ -0,0 +1,139 @@
+import asyncio
+import time
+import logging
+from typing import List, Optional, Dict
+import os
+from openai import OpenAI
+
+logger = logging.getLogger(__name__)
+
+class EmbeddingService:
+    """Service for generating embeddings with OpenAI"""
+
+    def __init__(self):
+        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+        self.model_name = "text-embedding-3-small"
+
+    def create_composite_text(self, movie_data: Dict) -> str:
+        """Create composite text for embedding from movie data"""
+        parts = []
+
+        # Title
+        if movie_data.get('title'):
+            parts.append(f"Title: {movie_data['title']}")
+
+        # Tagline
+        if movie_data.get('tagline'):
+            parts.append(f"Tagline: {movie_data['tagline']}")
+
+        # Overview
+        if movie_data.get('overview'):
+            parts.append(f"Overview: {movie_data['overview']}")
+
+        # Release date
+        if movie_data.get('release_date'):
+            parts.append(f"Release Date: {movie_data['release_date']}")
+
+        # Original language
+        if movie_data.get('original_language'):
+            parts.append(f"Language: {movie_data['original_language']}")
+
+        # Spoken languages
+        if movie_data.get('spoken_languages'):
+            languages = [lang.get('iso_639_1', '') for lang in movie_data['spoken_languages'] if lang.get('iso_639_1')]
+            if languages:
+                parts.append(f"Spoken Languages: {', '.join(languages)}")
+
+        # Genres
+        if movie_data.get('genres'):
+            genres = [genre['name'] for genre in movie_data['genres']]
+            parts.append(f"Genres: {', '.join(genres)}")
+
+        # Production companies
+        if movie_data.get('production_companies'):
+            companies = [company['name'] for company in movie_data['production_companies']]
+            if companies:
+                parts.append(f"Production Companies: {', '.join(companies)}")
+
+        # Production countries
+        if movie_data.get('production_countries'):
+            countries = [country['name'] for country in movie_data['production_countries']]
+            if countries:
+                parts.append(f"Production Countries: {', '.join(countries)}")
+
+        # Budget (only if > 0)
+        if movie_data.get('budget') and movie_data['budget'] > 0:
+            parts.append(f"Budget: ${movie_data['budget']:,}")
+
+        # Popularity
+        if movie_data.get('popularity'):
+            parts.append(f"Popularity: {movie_data['popularity']}")
+
+        # Vote average
+        if movie_data.get('vote_average'):
+            parts.append(f"Vote Average: {movie_data['vote_average']}")
+
+        # Vote count
+        if movie_data.get('vote_count'):
+            parts.append(f"Vote Count: {movie_data['vote_count']}")
+
+        # Director(s)
+        if movie_data.get('credits', {}).get('crew'):
+            directors = [person['name'] for person in movie_data['credits']['crew'] if person['job'] == 'Director']
+            if directors:
+                parts.append(f"Director: {', '.join(directors)}")
+
+        # Top 5 cast
+        if movie_data.get('credits', {}).get('cast'):
+            top_cast = [person['name'] for person in movie_data['credits']['cast'][:5]]
+            if top_cast:
+                parts.append(f"Cast: {', '.join(top_cast)}")
+
+        return " / ".join(parts)
+
+    def get_embeddings_batch(self, texts: List[str], max_retries: int = 3) -> Optional[List[List[float]]]:
+        """Get embeddings for a batch of texts with retry"""
+        for attempt in range(max_retries):
+            try:
+                response = self.client.embeddings.create(
+                    model=self.model_name,
+                    input=texts
+                )
+                return [embedding.embedding for embedding in response.data]
+            except Exception as e:
+                logger.error(f"OpenAI API error (attempt {attempt + 1}): {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(2 ** attempt)
+        return None
+
+    async def generate_batch_embeddings(self, movies: List[Dict], batch_size: int = 100) -> Optional[List[List[float]]]:
+        """Generate embeddings for a batch of movies"""
+        try:
+            # Create composite texts
+            texts = []
+            for movie in movies:
+                composite_text = self.create_composite_text(movie)
+                texts.append(composite_text)
+
+            # Generate embeddings in smaller batches to avoid API limits
+            all_embeddings = []
+
+            for i in range(0, len(texts), batch_size):
+                batch_texts = texts[i:i + batch_size]
+                logger.debug(f"Generating embeddings for batch {i//batch_size + 1}")
+
+                batch_embeddings = self.get_embeddings_batch(batch_texts)
+                if batch_embeddings is None:
+                    logger.error(f"Failed to generate embeddings for batch starting at {i}")
+                    return None
+
+                all_embeddings.extend(batch_embeddings)
+
+                # Small delay between batches to respect rate limits
+                await asyncio.sleep(0.1)
+
+            return all_embeddings
+
+        except Exception as e:
+            logger.error(f"Error generating batch embeddings: {e}")
+            return None
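To illustrate what create_composite_text produces, a small sketch with a fabricated movie payload shaped like the TMDB responses used above (the sample data is made up, only the output format is real):

# Sketch: composite text for a hypothetical movie record.
# Assumes OPENAI_API_KEY is set, since the constructor builds the OpenAI client.
from app.services.embedding_service import EmbeddingService

service = EmbeddingService()
sample = {
    "title": "Example Movie",
    "overview": "A placeholder plot.",
    "genres": [{"name": "Drama"}],
    "vote_average": 7.1,
    "credits": {"crew": [{"name": "Jane Doe", "job": "Director"}], "cast": [{"name": "John Roe"}]},
}
print(service.create_composite_text(sample))
# -> Title: Example Movie / Overview: A placeholder plot. / Genres: Drama / Vote Average: 7.1 / Director: Jane Doe / Cast: John Roe
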
app/services/tmdb_service.py ADDED
@@ -0,0 +1,88 @@
+import requests
+import time
+import logging
+from typing import List, Optional, Dict
+import os
+
+logger = logging.getLogger(__name__)
+
+class TMDBService:
+    """Service for interacting with the TMDB API"""
+
+    def __init__(self):
+        self.api_key = os.getenv('TMDB_API_KEY')
+        self.base_url = "https://api.themoviedb.org/3"
+
+    def _make_request(self, endpoint: str, params: dict = None, max_retries: int = 3) -> Optional[dict]:
+        """Make API request with retry and backoff"""
+        if params is None:
+            params = {}
+        params['api_key'] = self.api_key
+
+        for attempt in range(max_retries):
+            try:
+                response = requests.get(f"{self.base_url}{endpoint}", params=params, timeout=10)
+                if response.status_code == 200:
+                    return response.json()
+                elif response.status_code == 429:
+                    # Rate limited, wait longer
+                    wait_time = 2 ** attempt
+                    logger.warning(f"Rate limited, waiting {wait_time}s before retry {attempt + 1}")
+                    time.sleep(wait_time)
+                    continue
+                else:
+                    logger.error(f"TMDB API returned status {response.status_code}")
+                    return None
+            except Exception as e:
+                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(2 ** attempt)
+        return None
+
+    async def get_popular_movies(self, limit: int = 10000) -> List[Dict]:
+        """Get popular movies from TMDB"""
+        movies = []
+        page = 1
+        filter_adult = os.getenv('FILTER_ADULT_CONTENT', 'true').lower() == 'true'
+
+        while len(movies) < limit:
+            logger.info(f"Fetching popular movies page {page}")
+
+            data = self._make_request("/movie/popular", {"page": page})
+            if not data or not data.get('results'):
+                break
+
+            for movie in data.get('results', []):
+                if len(movies) >= limit:
+                    break
+
+                # Skip adult movies if filtering is enabled
+                if filter_adult and movie.get('adult', False):
+                    continue
+
+                # Get detailed movie data
+                movie_details = self.get_movie_details(movie['id'])
+                if movie_details:
+                    # Get credits
+                    credits = self.get_movie_credits(movie['id'])
+                    if credits:
+                        movie_details['credits'] = credits
+                    movies.append(movie_details)
+
+            # Check if we've reached the last page
+            if page >= data.get('total_pages', 0):
+                break
+
+            page += 1
+            time.sleep(0.25)  # Rate limiting
+
+        logger.info(f"Collected {len(movies)} movies from TMDB")
+        return movies[:limit]
+
+    def get_movie_details(self, movie_id: int) -> Optional[dict]:
+        """Get detailed movie information"""
+        return self._make_request(f"/movie/{movie_id}")
+
+    def get_movie_credits(self, movie_id: int) -> Optional[dict]:
+        """Get movie cast and crew"""
+        return self._make_request(f"/movie/{movie_id}/credits")
app/services/vector_storage.py ADDED
@@ -0,0 +1,153 @@
+from datasets import Dataset, load_dataset
+from huggingface_hub import HfApi, create_repo
+import numpy as np
+import json
+import logging
+from typing import Dict, List, Tuple, Optional
+import os
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+class HFVectorStorage:
+    def __init__(self):
+        self.hf_token = os.getenv('HF_TOKEN')
+        self.repo_name = os.getenv('HF_DATASET_REPO')
+        self.api = HfApi(token=self.hf_token)
+
+        # Create the repo if it does not exist
+        if self.hf_token and self.repo_name:
+            try:
+                create_repo(
+                    repo_id=self.repo_name,
+                    repo_type="dataset",
+                    token=self.hf_token,
+                    private=True,
+                    exist_ok=True
+                )
+            except Exception as e:
+                logger.warning(f"Repo creation warning: {e}")
+
+    def save_vectors(self, embeddings: np.ndarray, movies_data: List[Dict],
+                     id_map: Dict, metadata: Dict) -> bool:
+        """Save the vectors to the HF Dataset Hub"""
+        try:
+            if not self.hf_token or not self.repo_name:
+                logger.error("HF_TOKEN or HF_DATASET_REPO not configured")
+                return False
+
+            # Prepare the data for the dataset
+            dataset_dict = {
+                'movie_id': [movie['id'] for movie in movies_data],
+                'title': [movie['title'] for movie in movies_data],
+                'overview': [movie.get('overview', '') for movie in movies_data],
+                'genres': [movie.get('genres', []) for movie in movies_data],
+                'release_date': [movie.get('release_date', '') for movie in movies_data],
+                'embedding': embeddings.tolist(),
+                'tmdb_data': [json.dumps(movie) for movie in movies_data]
+            }
+
+            # Create the dataset
+            dataset = Dataset.from_dict(dataset_dict)
+
+            # Upload to the HF Hub
+            dataset.push_to_hub(
+                self.repo_name,
+                token=self.hf_token,
+                commit_message=f"Update vectors - {datetime.now().isoformat()}"
+            )
+
+            # Save the metadata
+            metadata_with_timestamp = {
+                **metadata,
+                'last_updated': datetime.now().isoformat(),
+                'total_movies': len(movies_data)
+            }
+
+            with open('temp_metadata.json', 'w') as f:
+                json.dump(metadata_with_timestamp, f, indent=2)
+
+            self.api.upload_file(
+                path_or_fileobj='temp_metadata.json',
+                path_in_repo='metadata.json',
+                repo_id=self.repo_name,
+                repo_type='dataset',
+                token=self.hf_token,
+                commit_message=f"Update metadata - {datetime.now().isoformat()}"
+            )
+
+            # Clean up the temporary file
+            if os.path.exists('temp_metadata.json'):
+                os.remove('temp_metadata.json')
+
+            logger.info(f"Successfully saved {len(movies_data)} movie vectors to HF Hub")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error saving vectors to HF Hub: {e}")
+            return False
+
+    def load_vectors(self) -> Optional[Tuple[np.ndarray, List[Dict], Dict, Dict]]:
+        """Load the vectors from the HF Dataset Hub"""
+        try:
+            if not self.hf_token or not self.repo_name:
+                logger.error("HF_TOKEN or HF_DATASET_REPO not configured")
+                return None
+
+            # Load the dataset
+            dataset = load_dataset(self.repo_name, token=self.hf_token)['train']
+
+            # Extract the data
+            embeddings = np.array(dataset['embedding'])
+
+            movies_data = []
+            id_map = {}
+
+            for i, movie_id in enumerate(dataset['movie_id']):
+                movie_data = json.loads(dataset['tmdb_data'][i])
+                movies_data.append(movie_data)
+                id_map[movie_id] = i
+
+            # Load the metadata
+            try:
+                metadata_file = self.api.hf_hub_download(
+                    repo_id=self.repo_name,
+                    filename='metadata.json',
+                    repo_type='dataset',
+                    token=self.hf_token
+                )
+                with open(metadata_file, 'r') as f:
+                    metadata = json.load(f)
+            except Exception:
+                metadata = {'last_updated': None}
+
+            logger.info(f"Successfully loaded {len(movies_data)} movie vectors from HF Hub")
+            return embeddings, movies_data, id_map, metadata
+
+        except Exception as e:
+            logger.error(f"Error loading vectors from HF Hub: {e}")
+            return None
+
+    def check_update_needed(self) -> bool:
+        """Check whether an update is needed"""
+        try:
+            update_interval = int(os.getenv('UPDATE_INTERVAL_HOURS', 24))
+
+            # Load the current metadata
+            result = self.load_vectors()
+            if not result:
+                return True
+
+            _, _, _, metadata = result
+
+            if not metadata.get('last_updated'):
+                return True
+
+            last_update = datetime.fromisoformat(metadata['last_updated'])
+            hours_since_update = (datetime.now() - last_update).total_seconds() / 3600
+
+            return hours_since_update >= update_interval
+
+        except Exception as e:
+            logger.error(f"Error checking update status: {e}")
+            return True
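The backend keeps a faiss_index global (see the app/main.py hunk above), so one plausible way to turn the stored embeddings back into a searchable index is sketched below. The choice of IndexFlatIP and the L2 normalization are assumptions, not something this commit specifies:

# Sketch, assuming HF_TOKEN/HF_DATASET_REPO are set and cosine-style search is wanted.
import faiss
import numpy as np
from app.services.vector_storage import HFVectorStorage

loaded = HFVectorStorage().load_vectors()
if loaded is not None:
    embeddings, movies_data, id_map, metadata = loaded
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    faiss.normalize_L2(vectors)              # assumption: cosine similarity via inner product
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    print(index.ntotal, "vectors indexed, last updated:", metadata.get("last_updated"))
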
app/services/vector_updater.py ADDED
@@ -0,0 +1,170 @@
+import asyncio
+import logging
+from datetime import datetime
+from typing import Optional, List
+import os
+import numpy as np
+
+from .vector_storage import HFVectorStorage
+from .tmdb_service import TMDBService
+from .embedding_service import EmbeddingService
+
+logger = logging.getLogger(__name__)
+
+class VectorUpdater:
+    def __init__(self):
+        self.storage = HFVectorStorage()
+        self.tmdb_service = TMDBService()
+        self.embedding_service = EmbeddingService()
+        self.is_updating = False
+        self.last_update_result = None
+        self.update_logs = []
+
+    def add_log(self, message: str, level: str = "INFO"):
+        """Add a timestamped log entry"""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        log_entry = f"[{timestamp}] {level}: {message}"
+        self.update_logs.append(log_entry)
+
+        # Keep only the last 100 logs
+        if len(self.update_logs) > 100:
+            self.update_logs = self.update_logs[-100:]
+
+        # Also log through the logging system
+        log_level = getattr(logging, level.upper(), logging.INFO)
+        logger.log(log_level, message)
+
+    async def update_vectors_if_needed(self) -> bool:
+        """Update the vectors if needed"""
+        if self.is_updating:
+            self.add_log("Update already in progress, skipping...", "WARNING")
+            return False
+
+        if not self.storage.check_update_needed():
+            self.add_log("Vectors are up to date, no update needed", "INFO")
+            return False
+
+        if not os.getenv('AUTO_UPDATE_VECTORS', 'false').lower() == 'true':
+            self.add_log("Auto update disabled", "INFO")
+            return False
+
+        self.add_log("Auto update conditions met, starting update...", "INFO")
+        return await self.force_update_vectors()
+
+    async def force_update_vectors(self) -> bool:
+        """Force a vector update"""
+        if self.is_updating:
+            self.add_log("Update already in progress", "WARNING")
+            return False
+
+        self.is_updating = True
+        self.add_log("Starting vector update process...", "INFO")
+
+        try:
+            # Configuration parameters
+            batch_size = int(os.getenv('BATCH_SIZE', 100))
+            max_movies = int(os.getenv('MAX_MOVIES_LIMIT', 10000))
+
+            # Fetch popular movies from TMDB
+            self.add_log("Fetching movies from TMDB...", "INFO")
+            movies = await self.tmdb_service.get_popular_movies(limit=max_movies)
+
+            if not movies:
+                self.add_log("No movies fetched from TMDB", "ERROR")
+                self.last_update_result = {"success": False, "error": "No movies fetched"}
+                return False
+
+            self.add_log(f"Processing {len(movies)} movies in batches of {batch_size}", "INFO")
+
+            all_embeddings = []
+            processed_movies = []
+            id_map = {}
+
+            # Process in batches to avoid timeouts
+            for i in range(0, len(movies), batch_size):
+                batch = movies[i:i + batch_size]
+                batch_num = i//batch_size + 1
+                total_batches = (len(movies)-1)//batch_size + 1
+
+                self.add_log(f"Processing batch {batch_num}/{total_batches}", "INFO")
+
+                # Generate the embeddings for the batch
+                batch_embeddings = await self.embedding_service.generate_batch_embeddings(batch)
+
+                if batch_embeddings is not None:
+                    all_embeddings.extend(batch_embeddings)
+
+                    for j, movie in enumerate(batch):
+                        processed_movies.append(movie)
+                        id_map[movie['id']] = len(processed_movies) - 1
+                else:
+                    self.add_log(f"Failed to generate embeddings for batch {batch_num}", "WARNING")
+
+                # Pause between batches to avoid rate limiting
+                await asyncio.sleep(1)
+
+            if not all_embeddings:
+                self.add_log("No embeddings generated", "ERROR")
+                self.last_update_result = {"success": False, "error": "No embeddings generated"}
+                return False
+
+            # Convert to a numpy array
+            embeddings_array = np.array(all_embeddings)
+            self.add_log(f"Generated {len(all_embeddings)} embeddings", "INFO")
+
+            # Save to the HF Hub
+            metadata = {
+                'update_timestamp': datetime.now().isoformat(),
+                'total_movies': len(processed_movies),
+                'embedding_model': getattr(self.embedding_service, 'model_name', 'unknown'),
+                'tmdb_api_version': '3',
+                'batch_size': batch_size,
+                'max_movies_limit': max_movies
+            }
+
+            self.add_log("Saving vectors to HF Hub...", "INFO")
+            success = self.storage.save_vectors(
+                embeddings_array,
+                processed_movies,
+                id_map,
+                metadata
+            )
+
+            if success:
+                self.add_log(f"Successfully updated {len(processed_movies)} movie vectors", "INFO")
+                self.last_update_result = {
+                    "success": True,
+                    "count": len(processed_movies),
+                    "timestamp": datetime.now().isoformat()
+                }
+                return True
+            else:
+                self.add_log("Failed to save vectors to storage", "ERROR")
+                self.last_update_result = {"success": False, "error": "Failed to save to storage"}
+                return False
+
+        except Exception as e:
+            self.add_log(f"Error during vector update: {e}", "ERROR")
+            self.last_update_result = {"success": False, "error": str(e)}
+            return False
+
+        finally:
+            self.is_updating = False
+            self.add_log("Vector update process completed", "INFO")
+
+    def get_update_status(self) -> dict:
+        """Return the update status"""
+        return {
+            'is_updating': self.is_updating,
+            'auto_update_enabled': os.getenv('AUTO_UPDATE_VECTORS', 'false').lower() == 'true',
+            'update_interval_hours': int(os.getenv('UPDATE_INTERVAL_HOURS', 24)),
+            'batch_size': int(os.getenv('BATCH_SIZE', 100)),
+            'max_movies_limit': int(os.getenv('MAX_MOVIES_LIMIT', 10000)),
+            'last_update_result': self.last_update_result,
+            'logs_count': len(self.update_logs),
+            'hf_configured': bool(os.getenv('HF_TOKEN') and os.getenv('HF_DATASET_REPO'))
+        }
+
+    def get_logs(self) -> List[str]:
+        """Return the update logs"""
+        return self.update_logs.copy()
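Outside the FastAPI lifecycle, the updater can also be driven from a one-off script. A minimal sketch, assuming the same environment variables introduced in .env.example are set:

# Sketch: run a forced update from the command line.
import asyncio
from app.services.vector_updater import VectorUpdater

async def main():
    updater = VectorUpdater()
    ok = await updater.force_update_vectors()
    print("update succeeded:", ok)
    print("\n".join(updater.get_logs()))

if __name__ == "__main__":
    asyncio.run(main())
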
requirements.txt CHANGED
@@ -1,12 +1,10 @@
 fastapi==0.104.1
-uvicorn[standard]==0.24.0
-numpy==1.24.4
+uvicorn==0.24.0
+pydantic==2.5.0
+numpy==1.24.3
 faiss-cpu==1.7.4
-openai==1.51.0
-pydantic==2.11.5
-pydantic-settings==2.9.1
-python-multipart==0.0.6
-requests==2.31.0
-scikit-learn==1.3.2
+openai==1.3.5
 python-dotenv==1.0.0
-httpx==0.27.0
+requests==2.31.0
+datasets>=2.14.0
+huggingface_hub>=0.17.0