yonnel committed
Commit 66fef64 · 0 Parent(s)

Initial clean commit - FastAPI movie backend without large data files

Files changed (13)
  1. .env.example +14 -0
  2. .gitattributes +36 -0
  3. .gitignore +57 -0
  4. Dockerfile +32 -0
  5. README.md +42 -0
  6. README_HF.md +42 -0
  7. app/__init__.py +6 -0
  8. app/build_index.py +485 -0
  9. app/main.py +303 -0
  10. app/settings.py +35 -0
  11. app/test_api.py +80 -0
  12. app/test_setup.py +121 -0
  13. requirements.txt +12 -0
.env.example ADDED
@@ -0,0 +1,14 @@
+ # OpenAI API key for embeddings
+ OPENAI_API_KEY=your_openai_api_key_here
+
+ # TMDB API key for movie data
+ TMDB_API_KEY=your_tmdb_api_key_here
+
+ # API authentication token
+ API_TOKEN=your_api_token_here
+
+ # Environment (dev/prod)
+ ENV=dev
+
+ # Logging level
+ LOG_LEVEL=INFO
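
For local use, this template is typically copied to `.env` and filled in. As a minimal sketch of checking the result (assuming a populated `.env` in the working directory; `python-dotenv` is pinned in `requirements.txt`, though the app itself reads these values through `app/settings.py` below):

```python
# Sketch: load .env and check the variables declared in .env.example.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

for name in ("OPENAI_API_KEY", "TMDB_API_KEY", "API_TOKEN", "ENV", "LOG_LEVEL"):
    print(name, "set" if os.getenv(name) else "missing")
```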
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.index filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,57 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Environment
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Data files (these are large and will be generated on deployment)
+ app/data/*.npy
+ app/data/*.index
+ app/data/movie_metadata.json
+ app/data/id_map.json
+ app/data/checkpoints/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Logs
+ *.log
+
+ # Temporary files
+ tmp/
+ temp/
Dockerfile ADDED
@@ -0,0 +1,33 @@
+ FROM python:3.9-slim
+
+ # Install system dependencies (curl is needed by the HEALTHCHECK below;
+ # python:3.9-slim does not ship it)
+ RUN apt-get update && apt-get install -y \
+     gcc \
+     g++ \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY app/ ./app/
+
+ # Create data directory
+ RUN mkdir -p app/data
+
+ # Expose port
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD curl -f http://localhost:7860/health || exit 1
+
+ # Run the application
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,42 @@
+ ---
+ title: Karl Movie Vector Backend
+ emoji: 🎬
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: mit
+ ---
+
+ # Karl Movie Vector Backend
+
+ FastAPI backend for semantic movie recommendations using FAISS and OpenAI embeddings. Powers intelligent movie discovery with geometric subspace algorithms.
+
+ ## Features
+
+ - Semantic movie search using OpenAI embeddings
+ - FAISS-powered vector similarity search
+ - Geometric subspace algorithms for multi-movie preferences
+ - ~150ms response time on CPU
+ - RESTful API with Bearer token authentication
+
+ ## API Usage
+
+ ```bash
+ curl -X POST "https://yonnel-karl-movie-vector-backend.hf.space/explore" \
+   -H "Authorization: Bearer YOUR_TOKEN" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "liked_ids": [550, 680],
+     "disliked_ids": [],
+     "top_k": 100
+   }'
+ ```
+
+ ## Environment Variables
+
+ Set these in your Space settings:
+ - `OPENAI_API_KEY`: Your OpenAI API key
+ - `TMDB_API_KEY`: Your TMDB API key
+ - `API_TOKEN`: Authentication token for API access
+ - `ENV`: Set to "prod" for production
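
For reference, the same call as a short Python sketch mirroring the README's curl example (`requests` is pinned in `requirements.txt`; the URL and token are placeholders):

```python
# Sketch: POST /explore with the same payload as the curl example.
import requests

resp = requests.post(
    "https://yonnel-karl-movie-vector-backend.hf.space/explore",
    headers={"Authorization": "Bearer YOUR_TOKEN"},
    json={"liked_ids": [550, 680], "disliked_ids": [], "top_k": 100},
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["movies"][:3])  # first few recommendations
```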
README_HF.md ADDED
@@ -0,0 +1,42 @@
+ ---
+ title: Karl Movie Vector Backend
+ emoji: 🎬
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: mit
+ ---
+
+ # Karl Movie Vector Backend
+
+ FastAPI backend for semantic movie recommendations using FAISS and OpenAI embeddings. Powers intelligent movie discovery with geometric subspace algorithms.
+
+ ## Features
+
+ - Semantic movie search using OpenAI embeddings
+ - FAISS-powered vector similarity search
+ - Geometric subspace algorithms for multi-movie preferences
+ - ~150ms response time on CPU
+ - RESTful API with Bearer token authentication
+
+ ## API Usage
+
+ ```bash
+ curl -X POST "https://yonnel-karl-movie-vector-backend.hf.space/explore" \
+   -H "Authorization: Bearer YOUR_TOKEN" \
+   -H "Content-Type: application/json" \
+   -d '{
+     "liked_ids": [550, 680],
+     "disliked_ids": [],
+     "top_k": 100
+   }'
+ ```
+
+ ## Environment Variables
+
+ Set these in your Space settings:
+ - `OPENAI_API_KEY`: Your OpenAI API key
+ - `TMDB_API_KEY`: Your TMDB API key
+ - `API_TOKEN`: Authentication token for API access
+ - `ENV`: Set to "prod" for production
app/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ Karl-Movie Vector Backend
+ A FastAPI service for semantic movie recommendations using FAISS and OpenAI embeddings
+ """
+
+ __version__ = "1.0.0"
app/build_index.py ADDED
@@ -0,0 +1,485 @@
+ """
+ Build FAISS index from movie embeddings
+ This script should be run once to create the data files needed by the API
+ """
+ import os
+ import json
+ import numpy as np
+ import faiss
+ from openai import OpenAI
+ import requests
+ from typing import Dict, List, Optional
+ import time
+ import argparse
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import logging
+ from settings import get_settings
+ import pickle
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Checkpoint file paths
+ CHECKPOINT_DIR = "app/data/checkpoints"
+ MOVIE_DATA_CHECKPOINT = f"{CHECKPOINT_DIR}/movie_data.pkl"
+ EMBEDDINGS_CHECKPOINT = f"{CHECKPOINT_DIR}/embeddings_progress.pkl"
+ METADATA_CHECKPOINT = f"{CHECKPOINT_DIR}/metadata_progress.pkl"
+
+ def save_checkpoint(data, filepath: str):
+     """Save checkpoint data to file"""
+     os.makedirs(os.path.dirname(filepath), exist_ok=True)
+     with open(filepath, 'wb') as f:
+         pickle.dump(data, f)
+     logger.info(f"Checkpoint saved: {filepath}")
+
+ def load_checkpoint(filepath: str):
+     """Load checkpoint data from file"""
+     if os.path.exists(filepath):
+         with open(filepath, 'rb') as f:
+             data = pickle.load(f)
+         logger.info(f"Checkpoint loaded: {filepath}")
+         return data
+     return None
+
+ def cleanup_checkpoints():
+     """Remove checkpoint files after successful completion"""
+     import shutil
+     if os.path.exists(CHECKPOINT_DIR):
+         shutil.rmtree(CHECKPOINT_DIR)
+         logger.info("Checkpoint files cleaned up")
+
+ class TMDBClient:
+     """Client for TMDB API with retry and backoff"""
+
+     def __init__(self, api_key: str):
+         self.api_key = api_key
+         self.base_url = "https://api.themoviedb.org/3"
+         self.session = requests.Session()
+
+     def _make_request(self, endpoint: str, params: dict = None, max_retries: int = 3) -> Optional[dict]:
+         """Make API request with retry and backoff"""
+         if params is None:
+             params = {}
+         params['api_key'] = self.api_key
+
+         url = f"{self.base_url}{endpoint}"
+
+         for attempt in range(max_retries):
+             try:
+                 response = self.session.get(url, params=params, timeout=10)
+
+                 if response.status_code == 200:
+                     return response.json()
+                 elif response.status_code == 429:
+                     # Rate limit - wait and retry
+                     wait_time = 2 ** attempt
+                     logger.warning(f"Rate limited, waiting {wait_time}s before retry...")
+                     time.sleep(wait_time)
+                     continue
+                 elif response.status_code == 404:
+                     logger.warning(f"Resource not found: {url}")
+                     return None
+                 else:
+                     logger.error(f"API error {response.status_code}: {response.text}")
+
+             except requests.exceptions.RequestException as e:
+                 logger.error(f"Request failed (attempt {attempt + 1}): {e}")
+                 if attempt < max_retries - 1:
+                     time.sleep(2 ** attempt)
+
+         return None
+
+     def get_popular_movies(self, max_pages: int = 100) -> List[int]:
+         """Get movie IDs from popular movies pagination"""
+         movie_ids = []
+
+         for page in range(1, max_pages + 1):
+             logger.info(f"Fetching popular movies page {page}/{max_pages}")
+
+             data = self._make_request("/movie/popular", {"page": page})
+             if not data:
+                 logger.error(f"Failed to fetch page {page}")
+                 break
+
+             # Check if we've exceeded total pages
+             if page > data.get('total_pages', 0):
+                 logger.info(f"Reached last page ({data.get('total_pages')})")
+                 break
+
+             # Extract movie IDs
+             for movie in data.get('results', []):
+                 movie_ids.append(movie['id'])
+
+             # Rate limiting
+             time.sleep(0.25)  # 4 requests per second max
+
+         logger.info(f"Collected {len(movie_ids)} movie IDs from {page} pages")
+         return movie_ids
+
+     def get_movie_details(self, movie_id: int) -> Optional[dict]:
+         """Get detailed movie information"""
+         return self._make_request(f"/movie/{movie_id}")
+
+     def get_movie_credits(self, movie_id: int) -> Optional[dict]:
+         """Get movie cast and crew"""
+         return self._make_request(f"/movie/{movie_id}/credits")
+
+ def fetch_movie_data(tmdb_client: TMDBClient, movie_ids: List[int], max_workers: int = 5) -> Dict[int, dict]:
+     """Fetch detailed data for all movies with controlled parallelization"""
+     movies_data = {}
+
+     def fetch_single_movie(movie_id: int) -> tuple:
+         """Fetch details and credits for a single movie"""
+         try:
+             # Get basic details
+             details = tmdb_client.get_movie_details(movie_id)
+             if not details:
+                 return movie_id, None
+
+             # Get credits
+             credits = tmdb_client.get_movie_credits(movie_id)
+             if credits:
+                 details['credits'] = credits
+
+             return movie_id, details
+
+         except Exception as e:
+             logger.error(f"Error fetching movie {movie_id}: {e}")
+             return movie_id, None
+
+     # Process movies in batches with controlled parallelization
+     batch_size = 50
+     total_movies = len(movie_ids)
+
+     for i in range(0, total_movies, batch_size):
+         batch = movie_ids[i:i + batch_size]
+         logger.info(f"Processing batch {i//batch_size + 1}/{(total_movies-1)//batch_size + 1} ({len(batch)} movies)")
+
+         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+             futures = {executor.submit(fetch_single_movie, movie_id): movie_id for movie_id in batch}
+
+             for future in as_completed(futures):
+                 movie_id, movie_data = future.result()
+                 if movie_data:
+                     movies_data[movie_id] = movie_data
+
+         # Sleep between batches to be respectful to API
+         time.sleep(1)
+
+     logger.info(f"Successfully fetched data for {len(movies_data)}/{total_movies} movies")
+     return movies_data
+
+ def create_composite_text(movie_data: Dict) -> str:
+     """Create composite text for embedding from movie data"""
+     parts = []
+
+     # Title
+     if movie_data.get('title'):
+         parts.append(f"Title: {movie_data['title']}")
+
+     # Tagline
+     if movie_data.get('tagline'):
+         parts.append(f"Tagline: {movie_data['tagline']}")
+
+     # Overview
+     if movie_data.get('overview'):
+         parts.append(f"Overview: {movie_data['overview']}")
+
+     # Release date
+     if movie_data.get('release_date'):
+         parts.append(f"Release Date: {movie_data['release_date']}")
+
+     # Original language
+     if movie_data.get('original_language'):
+         parts.append(f"Language: {movie_data['original_language']}")
+
+     # Spoken languages
+     if movie_data.get('spoken_languages'):
+         languages = [lang.get('iso_639_1', '') for lang in movie_data['spoken_languages'] if lang.get('iso_639_1')]
+         if languages:
+             parts.append(f"Spoken Languages: {', '.join(languages)}")
+
+     # Genres
+     if movie_data.get('genres'):
+         genres = [genre['name'] for genre in movie_data['genres']]
+         parts.append(f"Genres: {', '.join(genres)}")
+
+     # Production companies
+     if movie_data.get('production_companies'):
+         companies = [company['name'] for company in movie_data['production_companies']]
+         if companies:
+             parts.append(f"Production Companies: {', '.join(companies)}")
+
+     # Production countries
+     if movie_data.get('production_countries'):
+         countries = [country['name'] for country in movie_data['production_countries']]
+         if countries:
+             parts.append(f"Production Countries: {', '.join(countries)}")
+
+     # Budget (only if > 0)
+     if movie_data.get('budget') and movie_data['budget'] > 0:
+         parts.append(f"Budget: ${movie_data['budget']:,}")
+
+     # Popularity
+     if movie_data.get('popularity'):
+         parts.append(f"Popularity: {movie_data['popularity']}")
+
+     # Vote average
+     if movie_data.get('vote_average'):
+         parts.append(f"Vote Average: {movie_data['vote_average']}")
+
+     # Vote count
+     if movie_data.get('vote_count'):
+         parts.append(f"Vote Count: {movie_data['vote_count']}")
+
+     # Director(s)
+     if movie_data.get('credits', {}).get('crew'):
+         directors = [person['name'] for person in movie_data['credits']['crew'] if person['job'] == 'Director']
+         if directors:
+             parts.append(f"Director: {', '.join(directors)}")
+
+     # Top 5 cast
+     if movie_data.get('credits', {}).get('cast'):
+         top_cast = [person['name'] for person in movie_data['credits']['cast'][:5]]
+         if top_cast:
+             parts.append(f"Cast: {', '.join(top_cast)}")
+
+     return " / ".join(parts)
+
+ def get_embeddings_batch(texts: List[str], client: OpenAI, model: str = "text-embedding-3-small") -> List[List[float]]:
+     """Get embeddings for a batch of texts with retry"""
+     max_retries = 3
+
+     for attempt in range(max_retries):
+         try:
+             response = client.embeddings.create(
+                 input=texts,
+                 model=model
+             )
+             return [item.embedding for item in response.data]
+         except Exception as e:
+             logger.error(f"Error getting embeddings (attempt {attempt + 1}): {e}")
+             if attempt < max_retries - 1:
+                 time.sleep(2 ** attempt)
+             else:
+                 raise
+
+ def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_faiss: bool = True):
+     """Main function to build the FAISS index and data files"""
+     settings = get_settings()
+
+     # Initialize clients
+     tmdb_client = TMDBClient(settings.tmdb_api_key)
+     openai_client = OpenAI(api_key=settings.openai_api_key)
+
+     # Create data directory
+     os.makedirs("app/data", exist_ok=True)
+
+     # Check for existing movie data checkpoint
+     movies_data = load_checkpoint(MOVIE_DATA_CHECKPOINT)
+
+     if movies_data is not None:
+         logger.info(f"🔄 Resuming from checkpoint: {len(movies_data)} movies data found")
+     else:
+         # Step 1: Get movie IDs
+         logger.info(f"Fetching movie IDs from TMDB (max {max_pages} pages)...")
+         movie_ids = tmdb_client.get_popular_movies(max_pages=max_pages)
+
+         if not movie_ids:
+             logger.error("❌ No movie IDs retrieved from TMDB")
+             return
+
+         # Step 2: Fetch detailed movie data
+         logger.info(f"Fetching detailed data for {len(movie_ids)} movies...")
+         movies_data = fetch_movie_data(tmdb_client, movie_ids)
+
+         if not movies_data:
+             logger.error("❌ No movie data retrieved")
+             return
+
+         # Save movie data checkpoint
+         save_checkpoint(movies_data, MOVIE_DATA_CHECKPOINT)
+
+     # Step 3: Create composite texts and process embeddings in batches
+     logger.info("Creating embeddings...")
+     embeddings = []
+     id_map = {}
+     movie_metadata = {}
+     processed_movie_ids = set()
+
+     batch_size = 20  # Process 20 texts at a time
+
+     # Check for existing embedding progress
+     embedding_checkpoint = load_checkpoint(EMBEDDINGS_CHECKPOINT)
+     metadata_checkpoint = load_checkpoint(METADATA_CHECKPOINT)
+
+     if embedding_checkpoint is not None and metadata_checkpoint is not None:
+         embeddings = embedding_checkpoint['embeddings']
+         id_map = embedding_checkpoint['id_map']
+         processed_movie_ids = set(embedding_checkpoint['processed_movie_ids'])
+         movie_metadata = metadata_checkpoint
+         logger.info(f"🔄 Resuming embeddings from checkpoint: {len(embeddings)} embeddings found")
+     else:
+         logger.info("Starting embeddings from scratch")
+
+     # Process remaining movies
+     remaining_movies = {k: v for k, v in movies_data.items() if k not in processed_movie_ids}
+     logger.info(f"Processing {len(remaining_movies)} remaining movies")
+
+     composite_texts = []
+     current_movie_ids = []
+
+     for movie_id, movie_data in remaining_movies.items():
+         # Create composite text
+         composite_text = create_composite_text(movie_data)
+         composite_texts.append(composite_text)
+         current_movie_ids.append(movie_id)
+
+         # Store metadata
+         release_year = 0
+         if movie_data.get("release_date"):
+             try:
+                 release_year = int(movie_data["release_date"][:4])
+             except (ValueError, IndexError):
+                 release_year = 0
+
+         movie_metadata[str(movie_id)] = {
+             "id": movie_id,
+             "title": movie_data.get("title", ""),
+             "year": release_year,
+             "poster_path": movie_data.get("poster_path"),
+             "release_date": movie_data.get("release_date"),
+             "genres": [g["name"] for g in movie_data.get("genres", [])]
+         }
+
+         # Process batch when full
+         if len(composite_texts) >= batch_size:
+             logger.info(f"Processing embedding batch ({len(embeddings)} done, {len(composite_texts)} in batch)")
+
+             try:
+                 batch_embeddings = get_embeddings_batch(composite_texts, openai_client, model)
+                 embeddings.extend(batch_embeddings)
+
+                 # Update ID mapping and processed set
+                 for i, mid in enumerate(current_movie_ids):
+                     id_map[str(mid)] = len(id_map)
+                     processed_movie_ids.add(mid)
+
+                 # Save progress checkpoints
+                 embedding_data = {
+                     'embeddings': embeddings,
+                     'id_map': id_map,
+                     'processed_movie_ids': list(processed_movie_ids)
+                 }
+                 save_checkpoint(embedding_data, EMBEDDINGS_CHECKPOINT)
+                 save_checkpoint(movie_metadata, METADATA_CHECKPOINT)
+
+                 # Clear batch
+                 composite_texts = []
+                 current_movie_ids = []
+
+                 # Sleep between batches
+                 time.sleep(0.5)
+
+             except Exception as e:
+                 logger.error(f"Failed to process batch: {e}")
+                 logger.info("Progress has been saved, you can restart the script to resume")
+                 return
+
+     # Process remaining texts
+     if composite_texts:
+         logger.info(f"Processing final embedding batch ({len(composite_texts)} texts)")
+         try:
+             batch_embeddings = get_embeddings_batch(composite_texts, openai_client, model)
+             embeddings.extend(batch_embeddings)
+
+             for i, mid in enumerate(current_movie_ids):
+                 id_map[str(mid)] = len(id_map)
+                 processed_movie_ids.add(mid)
+
+             # Save final progress
+             embedding_data = {
+                 'embeddings': embeddings,
+                 'id_map': id_map,
+                 'processed_movie_ids': list(processed_movie_ids)
+             }
+             save_checkpoint(embedding_data, EMBEDDINGS_CHECKPOINT)
+             save_checkpoint(movie_metadata, METADATA_CHECKPOINT)
+
+         except Exception as e:
+             logger.error(f"Failed to process final batch: {e}")
+             logger.info("Progress has been saved, you can restart the script to resume")
+             return
+
+     if not embeddings:
+         logger.error("❌ No embeddings generated")
+         return
+
+     logger.info(f"Generated {len(embeddings)} embeddings")
+
+     # Step 4: Save embeddings as numpy array
+     embeddings_array = np.array(embeddings, dtype=np.float32)
+     np.save("app/data/movies.npy", embeddings_array)
+     logger.info(f"Saved embeddings matrix: {embeddings_array.shape}")
+
+     # Step 5: Build and save FAISS index
+     if use_faiss:
+         logger.info("Building FAISS index...")
+         dimension = embeddings_array.shape[1]
+
+         # Choose index type based on size
+         if len(embeddings) < 10000:
+             # For smaller datasets, use flat index
+             index = faiss.IndexFlatL2(dimension)
+         else:
+             # For larger datasets, use IVF index
+             nlist = min(int(np.sqrt(len(embeddings))), 1000)
+             quantizer = faiss.IndexFlatL2(dimension)
+             index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
+             # Train the index
+             index.train(embeddings_array)
+
+         index.add(embeddings_array)
+         faiss.write_index(index, "app/data/faiss.index")
+         logger.info(f"FAISS index saved (type: {type(index).__name__}, dimension: {dimension})")
+
+     # Step 6: Save metadata files
+     with open("app/data/id_map.json", "w") as f:
+         json.dump(id_map, f)
+
+     with open("app/data/movie_metadata.json", "w") as f:
+         json.dump(movie_metadata, f)
+
+     logger.info("✅ Index built successfully!")
+     logger.info(f" - {len(embeddings)} movies indexed")
+     logger.info(f" - Embedding model: {model}")
+     logger.info(" - Files saved in app/data/")
+     logger.info(" * movies.npy: embeddings matrix")
+     logger.info(" * id_map.json: TMDB ID to matrix position mapping")
+     logger.info(" * movie_metadata.json: movie metadata")
+     if use_faiss:
+         logger.info(" * faiss.index: FAISS search index")
+
+     # Cleanup checkpoints
+     cleanup_checkpoints()
+
+ # Note: create_movie_embedding and load_movie_data from earlier versions
+ # are replaced by the implementation above
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Build movie embeddings index from TMDB data")
+     parser.add_argument("--max-pages", type=int, default=10,
+                         help="Maximum pages to fetch from TMDB popular movies (default: 10)")
+     parser.add_argument("--model", type=str, default="text-embedding-3-small",
+                         help="OpenAI embedding model to use (default: text-embedding-3-small)")
+     parser.add_argument("--no-faiss", action="store_true",
+                         help="Skip building FAISS index")
+
+     args = parser.parse_args()
+
+     build_index(
+         max_pages=args.max_pages,
+         model=args.model,
+         use_faiss=not args.no_faiss
+     )
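
The script is meant to be run from the repository root, e.g. `python app/build_index.py --max-pages 3` (the invocation `app/test_setup.py` suggests). To show what actually gets embedded, here is a hedged sketch of `create_composite_text` on a hand-written TMDB-style dict; the field values are invented for illustration, but the " / "-joined output format follows the function above:

```python
# Illustration only: a minimal TMDB-style dict (values are made up).
# Run from within app/, matching the plain `from settings import ...`
# style imports these scripts use.
from build_index import create_composite_text

movie = {
    "title": "Fight Club",
    "overview": "An insomniac office worker crosses paths with a soap maker.",
    "release_date": "1999-10-15",
    "genres": [{"name": "Drama"}],
    "credits": {
        "crew": [{"name": "David Fincher", "job": "Director"}],
        "cast": [{"name": "Edward Norton"}, {"name": "Brad Pitt"}],
    },
}

print(create_composite_text(movie))
# Title: Fight Club / Overview: An insomniac office worker crosses paths
# with a soap maker. / Release Date: 1999-10-15 / Genres: Drama /
# Director: David Fincher / Cast: Edward Norton, Brad Pitt
```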
app/main.py ADDED
@@ -0,0 +1,306 @@
+ import os
+ import json
+ import numpy as np
+ import faiss
+ from fastapi import FastAPI, HTTPException, Depends, status
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+ from pydantic import BaseModel
+ from typing import List, Optional
+ import logging
+ import time
+
+ # Configure logging
+ logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO").upper())
+ logger = logging.getLogger(__name__)
+
+ # Security
+ security = HTTPBearer()
+
+ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+     expected_token = os.getenv("API_TOKEN")
+     if not expected_token:
+         raise HTTPException(status_code=500, detail="API token not configured")
+     if credentials.credentials != expected_token:
+         raise HTTPException(status_code=401, detail="Invalid token")
+     return credentials.credentials
+
+ # Pydantic models
+ class ExploreRequest(BaseModel):
+     liked_ids: List[int]
+     disliked_ids: List[int] = []
+     top_k: int = 400
+
+ class MovieResult(BaseModel):
+     id: int
+     title: str
+     year: int
+     poster_path: Optional[str]
+     genres: List[str]
+     coords: List[float]
+
+ class ExploreResponse(BaseModel):
+     movies: List[MovieResult]
+     bary: List[float]
+     center: List[float]
+
+ # Global variables for loaded data
+ vectors = None
+ id_map = None
+ faiss_index = None
+ movie_metadata = None
+
+ def load_data():
+     """Load FAISS index, vectors, and metadata on startup"""
+     global vectors, id_map, faiss_index, movie_metadata
+
+     try:
+         # Load vectors
+         vectors = np.load("app/data/movies.npy")
+         logger.info(f"Loaded {vectors.shape[0]} movie vectors of dimension {vectors.shape[1]}")
+
+         # Load ID mapping
+         with open("app/data/id_map.json", "r") as f:
+             id_map = json.load(f)
+         logger.info(f"Loaded ID mapping for {len(id_map)} movies")
+
+         # Load FAISS index
+         faiss_index = faiss.read_index("app/data/faiss.index")
+         logger.info(f"Loaded FAISS index with {faiss_index.ntotal} vectors")
+
+         # Load movie metadata
+         with open("app/data/movie_metadata.json", "r") as f:
+             movie_metadata = json.load(f)
+         logger.info(f"Loaded metadata for {len(movie_metadata)} movies")
+
+     except Exception as e:
+         logger.error(f"Failed to load data: {e}")
+         raise
+
+ def build_plane(likes: np.ndarray, dislikes: np.ndarray = None, dim: int = 2):
+     """
+     Build user subspace from liked/disliked movies
+     Returns (axes, center) where axes is a 2 x D orthonormal matrix
+     """
+     n_likes = likes.shape[0] if likes is not None else 0
+     d = vectors.shape[1]
+
+     # Compute composite vector: +liked - 0.5*disliked
+     if n_likes == 0:
+         # Cold start: use global average
+         center = vectors.mean(0)
+         # Create random orthonormal basis (Gram-Schmidt)
+         axes = np.random.randn(dim, d)
+         axes[0] /= np.linalg.norm(axes[0])
+         for i in range(1, dim):
+             for j in range(i):
+                 axes[i] -= np.dot(axes[i], axes[j]) * axes[j]
+             axes[i] /= np.linalg.norm(axes[i])
+     else:
+         # Compute composite from likes and dislikes
+         composite = likes.mean(0)
+         if dislikes is not None and dislikes.shape[0] > 0:
+             composite -= 0.5 * dislikes.mean(0)
+
+         if n_likes == 1:
+             # One like: use as center, random orthogonal axes
+             center = composite
+             axis1 = np.random.randn(d)
+             axis1 /= np.linalg.norm(axis1)
+             axis2 = np.random.randn(d)
+             axis2 -= np.dot(axis2, axis1) * axis1
+             axis2 /= np.linalg.norm(axis2)
+             axes = np.vstack([axis1, axis2])
+         elif n_likes == 2:
+             # Two likes: line between them
+             center = likes.mean(0)
+             axis1 = likes[1] - likes[0]
+             axis1 /= np.linalg.norm(axis1)
+             axis2 = np.random.randn(d)
+             axis2 -= np.dot(axis2, axis1) * axis1
+             axis2 /= np.linalg.norm(axis2)
+             axes = np.vstack([axis1, axis2])
+         else:
+             # 3+ likes: PCA plane
+             center = likes.mean(0)
+             likes_centered = likes - center
+             u, s, vt = np.linalg.svd(likes_centered, full_matrices=False)
+             axes = vt[:2]  # First 2 principal components
+
+     return axes, center
+
+ def assign_spiral_coords(n_movies: int):
+     """
+     Assign 2D grid coordinates in outward spiral pattern
+     Returns array of shape (n_movies, 2) with integer coordinates
+     """
+     coords = np.zeros((n_movies, 2), dtype=int)
+     if n_movies == 0:
+         return coords
+
+     coords[0] = [0, 0]  # Start at origin
+
+     if n_movies == 1:
+         return coords
+
+     # Spiral pattern: right, up, left, down, repeat with increasing distances
+     dx, dy = [1, 0, -1, 0], [0, 1, 0, -1]
+     direction = 0
+     steps = 1
+     x, y = 0, 0
+     idx = 1
+
+     while idx < n_movies:
+         for _ in range(2):  # Each step count is used twice (except the first)
+             for _ in range(steps):
+                 if idx >= n_movies:
+                     break
+                 x += dx[direction]
+                 y += dy[direction]
+                 coords[idx] = [x, y]
+                 idx += 1
+             direction = (direction + 1) % 4
+             if idx >= n_movies:
+                 break
+         steps += 1
+
+     return coords
+
+ def compute_barycenter(liked_indices: List[int], coords: np.ndarray):
+     """Compute barycenter of liked movies in 2D grid"""
+     if not liked_indices:
+         return [0.0, 0.0]
+
+     liked_coords = coords[liked_indices]
+     bary = liked_coords.mean(0)
+     return bary.tolist()
+
+ # FastAPI app setup
+ app = FastAPI(title="Karl-Movie Vector Backend", version="1.0.0")
+
+ # CORS configuration
+ # Note: Starlette does not expand wildcard entries in allow_origins;
+ # patterns like "https://*.bolt.run" require allow_origin_regex instead.
+ DEV_ORIGINS = [
+     "http://localhost:5173",
+     "http://127.0.0.1:5173",
+     "http://localhost:8888",
+     "https://*.bolt.run",
+     "https://*.stackblitz.io",
+ ]
+
+ PROD_ORIGINS = ["https://karl.movie"]
+
+ origins = DEV_ORIGINS if os.getenv("ENV") != "prod" else PROD_ORIGINS
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_methods=["POST", "GET"],
+     allow_headers=["*"],
+ )
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Load data on startup"""
+     load_data()
+
+ @app.get("/health")
+ async def health_check():
+     """Health check endpoint"""
+     return {"status": "healthy", "vectors_loaded": vectors is not None}
+
+ @app.post("/explore", response_model=ExploreResponse)
+ async def explore(
+     request: ExploreRequest,
+     token: str = Depends(verify_token)
+ ):
+     """
+     Main endpoint: find movies closest to user's preference subspace
+     """
+     start_time = time.time()
+
+     try:
+         # Convert TMDB IDs to internal indices
+         liked_indices = []
+         disliked_indices = []
+
+         for tmdb_id in request.liked_ids:
+             if str(tmdb_id) in id_map:
+                 liked_indices.append(id_map[str(tmdb_id)])
+             else:
+                 logger.warning(f"TMDB ID {tmdb_id} not found in index")
+
+         for tmdb_id in request.disliked_ids:
+             if str(tmdb_id) in id_map:
+                 disliked_indices.append(id_map[str(tmdb_id)])
+             else:
+                 logger.warning(f"TMDB ID {tmdb_id} not found in index")
+
+         # Get embedding vectors
+         liked_vectors = vectors[liked_indices] if liked_indices else None
+         disliked_vectors = vectors[disliked_indices] if disliked_indices else None
+
+         # Build user subspace
+         axes, center = build_plane(liked_vectors, disliked_vectors)
+
+         # Project all vectors onto the 2D subspace
+         projections = np.dot(vectors - center, axes.T)  # Shape: (N, 2)
+
+         # Reconstruct vectors in original space
+         reconstructed = np.dot(projections, axes) + center
+
+         # Compute distances to subspace (residuals)
+         residuals = np.linalg.norm(vectors - reconstructed, axis=1)
+
+         # Get top-k closest movies (argpartition's kth must be < len)
+         k = min(request.top_k, len(residuals))
+         top_k_indices = np.argpartition(residuals, k - 1)[:k]
+         top_k_indices = top_k_indices[np.argsort(residuals[top_k_indices])]
+
+         # Assign spiral coordinates
+         spiral_coords = assign_spiral_coords(len(top_k_indices))
+
+         # Compute barycenter of liked movies
+         liked_positions = [i for i, idx in enumerate(top_k_indices) if idx in liked_indices]
+         bary = compute_barycenter(liked_positions, spiral_coords)
+
+         # Translate grid so barycenter is at origin
+         spiral_coords = spiral_coords - np.array(bary)
+
+         # Build response
+         movies = []
+         reverse_id_map = {v: k for k, v in id_map.items()}
+
+         for i, movie_idx in enumerate(top_k_indices):
+             tmdb_id = int(reverse_id_map[movie_idx])
+             metadata = movie_metadata.get(str(tmdb_id), {})
+
+             movie = MovieResult(
+                 id=tmdb_id,
+                 title=metadata.get("title", f"Movie {tmdb_id}"),
+                 year=metadata.get("year", 0),
+                 poster_path=metadata.get("poster_path"),
+                 genres=metadata.get("genres", []),
+                 coords=spiral_coords[i].tolist()
+             )
+             movies.append(movie)
+
+         response = ExploreResponse(
+             movies=movies,
+             bary=[0.0, 0.0],  # Always [0,0] since we translated
+             center=center.tolist()
+         )
+
+         elapsed = time.time() - start_time
+         logger.info(f"Explore request processed in {elapsed:.3f}s - {len(request.liked_ids)} likes, {len(request.disliked_ids)} dislikes, {len(movies)} results")
+
+         return response
+
+     except Exception as e:
+         logger.error(f"Error processing explore request: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
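
To make the subspace step concrete: `/explore` projects every movie vector onto the 2-D plane spanned by `axes` through `center`, then ranks movies by residual distance to that plane. A small self-contained sketch of the same project-and-rank logic on toy 3-D vectors (all names below are local to the sketch):

```python
# Toy version of the /explore ranking: distance of each point to a 2-D plane.
import numpy as np

vectors = np.array([
    [1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0],
    [0.5, 0.5, 0.0],
    [0.0, 0.0, 1.0],  # off-plane point
])
center = np.zeros(3)
axes = np.array([[1.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0]])  # orthonormal basis of the xy-plane

projections = (vectors - center) @ axes.T    # 2-D coordinates in the plane
reconstructed = projections @ axes + center  # closest point on the plane
residuals = np.linalg.norm(vectors - reconstructed, axis=1)

print(residuals)                  # [0. 0. 0. 1.]: the last point is off-plane
print(np.argsort(residuals)[:3])  # indices of the three closest "movies"
```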
app/settings.py ADDED
@@ -0,0 +1,35 @@
+ """
+ Settings and environment configuration
+ """
+ import os
+ from functools import lru_cache
+ from pydantic_settings import BaseSettings
+
+
+ class Settings(BaseSettings):
+     """Application settings"""
+
+     # OpenAI API key for embeddings
+     openai_api_key: str
+
+     # TMDB API key for movie data
+     tmdb_api_key: str
+
+     # API authentication token
+     api_token: str
+
+     # Environment (dev/prod)
+     env: str = "dev"
+
+     # Logging level
+     log_level: str = "INFO"
+
+     class Config:
+         env_file = ".env"
+         env_file_encoding = "utf-8"
+
+
+ @lru_cache()
+ def get_settings() -> Settings:
+     """Get cached settings instance"""
+     return Settings()
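
`pydantic-settings` matches each field to the same-named environment variable case-insensitively (`openai_api_key` ← `OPENAI_API_KEY`), falling back to the `.env` file. A minimal usage sketch, assuming the required variables are set (otherwise `Settings()` raises a validation error at the first `get_settings()` call):

```python
# Sketch: how callers consume the settings. The build scripts import this
# module as `from settings import get_settings` when run from within app/.
from app.settings import get_settings

settings = get_settings()     # cached by lru_cache after the first call
print(settings.env)           # "dev" unless ENV is set
print(settings.log_level)     # "INFO" unless LOG_LEVEL is set
```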
app/test_api.py ADDED
@@ -0,0 +1,80 @@
+ """
+ Test the /explore API with sample movies
+ """
+ import requests
+ import json
+
+ # Configuration
+ API_URL = "http://localhost:8000"
+ API_TOKEN = "your-api-token"  # Replace with your token
+
+ def test_explore_endpoint():
+     """Test the /explore endpoint with different scenarios"""
+
+     # Read the metadata to get some test IDs
+     with open("app/data/movie_metadata.json", "r") as f:
+         metadata = json.load(f)
+
+     # Take the first few movies as examples
+     movie_ids = list(metadata.keys())[:5]
+     print(f"Available test movies: {[metadata[mid]['title'] for mid in movie_ids]}")
+
+     # Test 1: Search with 1 liked movie
+     print("\n🎬 Test 1: Search with 1 liked movie")
+     test_request = {
+         "liked_ids": [int(movie_ids[0])],
+         "disliked_ids": [],
+         "top_k": 10
+     }
+
+     try:
+         response = requests.post(
+             f"{API_URL}/explore",
+             json=test_request,
+             headers={"Authorization": f"Bearer {API_TOKEN}"}
+         )
+
+         if response.status_code == 200:
+             data = response.json()
+             print(f"✅ Found {len(data['movies'])} similar movies")
+             print(f"Liked movie: {metadata[movie_ids[0]]['title']}")
+             print("Recommended movies:")
+             for movie in data['movies'][:3]:
+                 print(f" - {movie['title']} ({movie['year']}) - {movie['genres']}")
+         else:
+             print(f"❌ Error {response.status_code}: {response.text}")
+
+     except Exception as e:
+         print(f"❌ Connection error: {e}")
+         print("💡 Check that your API_TOKEN is correct in the .env")
+
+     # Test 2: Search with 2 liked movies
+     print("\n🎬 Test 2: Search with 2 liked movies")
+     test_request = {
+         "liked_ids": [int(movie_ids[0]), int(movie_ids[1])],
+         "disliked_ids": [],
+         "top_k": 10
+     }
+
+     try:
+         response = requests.post(
+             f"{API_URL}/explore",
+             json=test_request,
+             headers={"Authorization": f"Bearer {API_TOKEN}"}
+         )
+
+         if response.status_code == 200:
+             data = response.json()
+             print(f"✅ Found {len(data['movies'])} similar movies")
+             print(f"Liked movies: {metadata[movie_ids[0]]['title']}, {metadata[movie_ids[1]]['title']}")
+             print("Barycenter:", data['bary'])
+         else:
+             print(f"❌ Error {response.status_code}: {response.text}")
+
+     except Exception as e:
+         print(f"❌ Error: {e}")
+
+ if __name__ == "__main__":
+     print("🧪 Testing the /explore API")
+     print("=" * 40)
+     test_explore_endpoint()
app/test_setup.py ADDED
@@ -0,0 +1,121 @@
+ """
+ Test script for TMDB data loading and embedding generation
+ Run this to validate your setup before building the full index
+ """
+ import os
+ import sys
+ import json
+ from settings import get_settings
+ from build_index import TMDBClient, create_composite_text, get_embeddings_batch
+ from openai import OpenAI
+
+ def test_tmdb_connection():
+     """Test TMDB API connection"""
+     print("🔍 Testing TMDB API connection...")
+
+     try:
+         settings = get_settings()
+         tmdb_client = TMDBClient(settings.tmdb_api_key)
+
+         # Test getting popular movies (just first page)
+         movie_ids = tmdb_client.get_popular_movies(max_pages=1)
+
+         if movie_ids:
+             print(f"✅ Successfully fetched {len(movie_ids)} movie IDs from TMDB")
+
+             # Test getting details for first movie
+             movie_data = tmdb_client.get_movie_details(movie_ids[0])
+             if movie_data:
+                 print(f"✅ Successfully fetched details for movie: {movie_data.get('title', 'Unknown')}")
+
+                 # Test getting credits
+                 credits = tmdb_client.get_movie_credits(movie_ids[0])
+                 if credits:
+                     print(f"✅ Successfully fetched credits (cast: {len(credits.get('cast', []))}, crew: {len(credits.get('crew', []))})")
+                 else:
+                     print("⚠️ Could not fetch credits")
+
+                 return movie_data, credits
+             else:
+                 print("❌ Could not fetch movie details")
+         else:
+             print("❌ Could not fetch movie IDs")
+
+     except Exception as e:
+         print(f"❌ TMDB API error: {e}")
+
+     return None, None
+
+ def test_composite_text(movie_data, credits):
+     """Test composite text creation"""
+     print("\n📝 Testing composite text creation...")
+
+     if movie_data:
+         # Add credits to movie data
+         if credits:
+             movie_data['credits'] = credits
+
+         composite_text = create_composite_text(movie_data)
+         print(f"✅ Generated composite text ({len(composite_text)} chars)")
+         print(f"Preview: {composite_text[:200]}...")
+         return composite_text
+     else:
+         print("❌ No movie data to test")
+         return None
+
+ def test_embeddings(composite_text):
+     """Test embedding generation"""
+     print("\n🤖 Testing embedding generation...")
+
+     if composite_text:
+         try:
+             settings = get_settings()
+             openai_client = OpenAI(api_key=settings.openai_api_key)
+
+             embeddings = get_embeddings_batch([composite_text], openai_client)
+
+             if embeddings:
+                 embedding = embeddings[0]
+                 print(f"✅ Generated embedding (dimension: {len(embedding)})")
+                 print(f"Sample values: {embedding[:5]}...")
+                 return embedding
+             else:
+                 print("❌ No embeddings generated")
+
+         except Exception as e:
+             print(f"❌ Embedding error: {e}")
+     else:
+         print("❌ No composite text to test")
+
+     return None
+
+ def main():
+     """Run all tests"""
+     print("🎬 Karl Movie Vector Backend - Test Suite")
+     print("=" * 50)
+
+     # Test environment variables
+     print("🔧 Checking environment variables...")
+     try:
+         settings = get_settings()
+         print(f"✅ OpenAI API key: {'sk-...' + settings.openai_api_key[-10:] if settings.openai_api_key else 'Not set'}")
+         print(f"✅ TMDB API key: {'...' + settings.tmdb_api_key[-10:] if settings.tmdb_api_key else 'Not set'}")
+     except Exception as e:
+         print(f"❌ Settings error: {e}")
+         print("Make sure you have a .env file with OPENAI_API_KEY and TMDB_API_KEY")
+         return
+
+     # Run tests
+     movie_data, credits = test_tmdb_connection()
+     composite_text = test_composite_text(movie_data, credits)
+     embedding = test_embeddings(composite_text)
+
+     print("\n" + "=" * 50)
+     if movie_data and composite_text and embedding:
+         print("🎉 All tests passed! You can now run the full build:")
+         print(" python app/build_index.py --max-pages 3")
+     else:
+         print("❌ Some tests failed. Check your API keys and internet connection.")
+
+ if __name__ == "__main__":
+     main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi==0.104.1
+ uvicorn[standard]==0.24.0
+ numpy==1.24.4
+ faiss-cpu==1.7.4
+ openai==1.51.0
+ pydantic==2.11.5
+ pydantic-settings==2.9.1
+ python-multipart==0.0.6
+ requests==2.31.0
+ scikit-learn==1.3.2
+ python-dotenv==1.0.0
+ httpx==0.27.0