yonnel committed
Commit · 66fef64
0 Parent(s)
Initial clean commit - FastAPI movie backend without large data files
- .env.example +14 -0
- .gitattributes +36 -0
- .gitignore +57 -0
- Dockerfile +32 -0
- README.md +42 -0
- README_HF.md +42 -0
- app/__init__.py +6 -0
- app/build_index.py +485 -0
- app/main.py +303 -0
- app/settings.py +35 -0
- app/test_api.py +80 -0
- app/test_setup.py +121 -0
- requirements.txt +12 -0
.env.example
ADDED
@@ -0,0 +1,14 @@
# OpenAI API key for embeddings
OPENAI_API_KEY=your_openai_api_key_here

# TMDB API key for movie data
TMDB_API_KEY=your_tmdb_api_key_here

# API authentication token
API_TOKEN=your_api_token_here

# Environment (dev/prod)
ENV=dev

# Logging level
LOG_LEVEL=INFO
.gitattributes
ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.index filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,57 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Environment
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Data files (these are large and will be generated on deployment)
app/data/*.npy
app/data/*.index
app/data/movie_metadata.json
app/data/id_map.json
app/data/checkpoints/

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Logs
*.log

# Temporary files
tmp/
temp/
Dockerfile
ADDED
@@ -0,0 +1,32 @@
FROM python:3.9-slim

# Install system dependencies (curl is needed for the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \
    gcc \
    g++ curl \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app/ ./app/

# Create data directory
RUN mkdir -p app/data

# Expose port
EXPOSE 7860

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
ADDED
@@ -0,0 +1,42 @@
---
title: Karl Movie Vector Backend
emoji: 🎬
colorFrom: blue
colorTo: purple
sdk: docker
pinned: false
license: mit
---

# Karl Movie Vector Backend

FastAPI backend for semantic movie recommendations using FAISS and OpenAI embeddings. Powers intelligent movie discovery with geometric subspace algorithms.

## Features

- Semantic movie search using OpenAI embeddings
- FAISS-powered vector similarity search
- Geometric subspace algorithms for multi-movie preferences
- ~150ms response time on CPU
- RESTful API with Bearer token authentication

## API Usage

```bash
curl -X POST "https://yonnel-karl-movie-vector-backend.hf.space/explore" \
  -H "Authorization: Bearer YOUR_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "liked_ids": [550, 680],
    "disliked_ids": [],
    "top_k": 100
  }'
```

## Environment Variables

Set these in your Space settings:
- `OPENAI_API_KEY`: Your OpenAI API key
- `TMDB_API_KEY`: Your TMDB API key
- `API_TOKEN`: Authentication token for API access
- `ENV`: Set to "prod" for production
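The curl call above can also be issued from Python. The sketch below is illustrative only: the Space URL is taken from the curl example, the token is a placeholder for your configured `API_TOKEN`, and the request/response fields follow the models in `app/main.py`.

```python
# Illustrative client for the /explore endpoint; URL and token are placeholders.
import requests

API_URL = "https://yonnel-karl-movie-vector-backend.hf.space"
API_TOKEN = "YOUR_TOKEN"  # replace with your configured API_TOKEN

payload = {"liked_ids": [550, 680], "disliked_ids": [], "top_k": 100}
resp = requests.post(
    f"{API_URL}/explore",
    json=payload,
    headers={"Authorization": f"Bearer {API_TOKEN}"},
    timeout=30,
)
resp.raise_for_status()
data = resp.json()

# The response carries "movies" (id, title, year, poster_path, genres, coords),
# plus "bary" and "center" from the fitted preference plane.
for movie in data["movies"][:5]:
    print(movie["title"], movie["year"], movie["coords"])
```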
README_HF.md
ADDED
@@ -0,0 +1,42 @@
---
title: Karl Movie Vector Backend
emoji: 🎬
colorFrom: blue
colorTo: purple
sdk: docker
pinned: false
license: mit
---

# Karl Movie Vector Backend

FastAPI backend for semantic movie recommendations using FAISS and OpenAI embeddings. Powers intelligent movie discovery with geometric subspace algorithms.

## Features

- Semantic movie search using OpenAI embeddings
- FAISS-powered vector similarity search
- Geometric subspace algorithms for multi-movie preferences
- ~150ms response time on CPU
- RESTful API with Bearer token authentication

## API Usage

```bash
curl -X POST "https://yonnel-karl-movie-vector-backend.hf.space/explore" \
  -H "Authorization: Bearer YOUR_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
    "liked_ids": [550, 680],
    "disliked_ids": [],
    "top_k": 100
  }'
```

## Environment Variables

Set these in your Space settings:
- `OPENAI_API_KEY`: Your OpenAI API key
- `TMDB_API_KEY`: Your TMDB API key
- `API_TOKEN`: Authentication token for API access
- `ENV`: Set to "prod" for production
app/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
Karl-Movie Vector Backend
A FastAPI service for semantic movie recommendations using FAISS and OpenAI embeddings
"""

__version__ = "1.0.0"
app/build_index.py
ADDED
@@ -0,0 +1,485 @@
"""
Build FAISS index from movie embeddings
This script should be run once to create the data files needed by the API
"""
import os
import json
import numpy as np
import faiss
from openai import OpenAI
import requests
from typing import Dict, List, Optional
import time
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from settings import get_settings
import pickle

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Checkpoint file paths
CHECKPOINT_DIR = "app/data/checkpoints"
MOVIE_DATA_CHECKPOINT = f"{CHECKPOINT_DIR}/movie_data.pkl"
EMBEDDINGS_CHECKPOINT = f"{CHECKPOINT_DIR}/embeddings_progress.pkl"
METADATA_CHECKPOINT = f"{CHECKPOINT_DIR}/metadata_progress.pkl"

def save_checkpoint(data, filepath: str):
    """Save checkpoint data to file"""
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'wb') as f:
        pickle.dump(data, f)
    logger.info(f"Checkpoint saved: {filepath}")

def load_checkpoint(filepath: str):
    """Load checkpoint data from file"""
    if os.path.exists(filepath):
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        logger.info(f"Checkpoint loaded: {filepath}")
        return data
    return None

def cleanup_checkpoints():
    """Remove checkpoint files after successful completion"""
    import shutil
    if os.path.exists(CHECKPOINT_DIR):
        shutil.rmtree(CHECKPOINT_DIR)
        logger.info("Checkpoint files cleaned up")

class TMDBClient:
    """Client for TMDB API with retry and backoff"""

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.themoviedb.org/3"
        self.session = requests.Session()

    def _make_request(self, endpoint: str, params: dict = None, max_retries: int = 3) -> Optional[dict]:
        """Make API request with retry and backoff"""
        if params is None:
            params = {}
        params['api_key'] = self.api_key

        url = f"{self.base_url}{endpoint}"

        for attempt in range(max_retries):
            try:
                response = self.session.get(url, params=params, timeout=10)

                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 429:
                    # Rate limit - wait and retry
                    wait_time = 2 ** attempt
                    logger.warning(f"Rate limited, waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                    continue
                elif response.status_code == 404:
                    logger.warning(f"Resource not found: {url}")
                    return None
                else:
                    logger.error(f"API error {response.status_code}: {response.text}")

            except requests.exceptions.RequestException as e:
                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)

        return None

    def get_popular_movies(self, max_pages: int = 100) -> List[int]:
        """Get movie IDs from popular movies pagination"""
        movie_ids = []

        for page in range(1, max_pages + 1):
            logger.info(f"Fetching popular movies page {page}/{max_pages}")

            data = self._make_request("/movie/popular", {"page": page})
            if not data:
                logger.error(f"Failed to fetch page {page}")
                break

            # Check if we've exceeded total pages
            if page > data.get('total_pages', 0):
                logger.info(f"Reached last page ({data.get('total_pages')})")
                break

            # Extract movie IDs
            for movie in data.get('results', []):
                movie_ids.append(movie['id'])

            # Rate limiting
            time.sleep(0.25)  # 4 requests per second max

        logger.info(f"Collected {len(movie_ids)} movie IDs from {page} pages")
        return movie_ids

    def get_movie_details(self, movie_id: int) -> Optional[dict]:
        """Get detailed movie information"""
        return self._make_request(f"/movie/{movie_id}")

    def get_movie_credits(self, movie_id: int) -> Optional[dict]:
        """Get movie cast and crew"""
        return self._make_request(f"/movie/{movie_id}/credits")

def fetch_movie_data(tmdb_client: TMDBClient, movie_ids: List[int], max_workers: int = 5) -> Dict[int, dict]:
    """Fetch detailed data for all movies with controlled parallelization"""
    movies_data = {}

    def fetch_single_movie(movie_id: int) -> tuple:
        """Fetch details and credits for a single movie"""
        try:
            # Get basic details
            details = tmdb_client.get_movie_details(movie_id)
            if not details:
                return movie_id, None

            # Get credits
            credits = tmdb_client.get_movie_credits(movie_id)
            if credits:
                details['credits'] = credits

            return movie_id, details

        except Exception as e:
            logger.error(f"Error fetching movie {movie_id}: {e}")
            return movie_id, None

    # Process movies in batches with controlled parallelization
    batch_size = 50
    total_movies = len(movie_ids)

    for i in range(0, total_movies, batch_size):
        batch = movie_ids[i:i + batch_size]
        logger.info(f"Processing batch {i//batch_size + 1}/{(total_movies-1)//batch_size + 1} ({len(batch)} movies)")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(fetch_single_movie, movie_id): movie_id for movie_id in batch}

            for future in as_completed(futures):
                movie_id, movie_data = future.result()
                if movie_data:
                    movies_data[movie_id] = movie_data

        # Sleep between batches to be respectful to API
        time.sleep(1)

    logger.info(f"Successfully fetched data for {len(movies_data)}/{total_movies} movies")
    return movies_data

def create_composite_text(movie_data: Dict) -> str:
    """Create composite text for embedding from movie data"""
    parts = []

    # Title
    if movie_data.get('title'):
        parts.append(f"Title: {movie_data['title']}")

    # Tagline
    if movie_data.get('tagline'):
        parts.append(f"Tagline: {movie_data['tagline']}")

    # Overview
    if movie_data.get('overview'):
        parts.append(f"Overview: {movie_data['overview']}")

    # Release date
    if movie_data.get('release_date'):
        parts.append(f"Release Date: {movie_data['release_date']}")

    # Original language
    if movie_data.get('original_language'):
        parts.append(f"Language: {movie_data['original_language']}")

    # Spoken languages
    if movie_data.get('spoken_languages'):
        languages = [lang.get('iso_639_1', '') for lang in movie_data['spoken_languages'] if lang.get('iso_639_1')]
        if languages:
            parts.append(f"Spoken Languages: {', '.join(languages)}")

    # Genres
    if movie_data.get('genres'):
        genres = [genre['name'] for genre in movie_data['genres']]
        parts.append(f"Genres: {', '.join(genres)}")

    # Production companies
    if movie_data.get('production_companies'):
        companies = [company['name'] for company in movie_data['production_companies']]
        if companies:
            parts.append(f"Production Companies: {', '.join(companies)}")

    # Production countries
    if movie_data.get('production_countries'):
        countries = [country['name'] for country in movie_data['production_countries']]
        if countries:
            parts.append(f"Production Countries: {', '.join(countries)}")

    # Budget (only if > 0)
    if movie_data.get('budget') and movie_data['budget'] > 0:
        parts.append(f"Budget: ${movie_data['budget']:,}")

    # Popularity
    if movie_data.get('popularity'):
        parts.append(f"Popularity: {movie_data['popularity']}")

    # Vote average
    if movie_data.get('vote_average'):
        parts.append(f"Vote Average: {movie_data['vote_average']}")

    # Vote count
    if movie_data.get('vote_count'):
        parts.append(f"Vote Count: {movie_data['vote_count']}")

    # Director(s)
    if movie_data.get('credits', {}).get('crew'):
        directors = [person['name'] for person in movie_data['credits']['crew'] if person['job'] == 'Director']
        if directors:
            parts.append(f"Director: {', '.join(directors)}")

    # Top 5 cast
    if movie_data.get('credits', {}).get('cast'):
        top_cast = [person['name'] for person in movie_data['credits']['cast'][:5]]
        if top_cast:
            parts.append(f"Cast: {', '.join(top_cast)}")

    return " / ".join(parts)

def get_embeddings_batch(texts: List[str], client: OpenAI, model: str = "text-embedding-3-small") -> List[List[float]]:
    """Get embeddings for a batch of texts with retry"""
    max_retries = 3

    for attempt in range(max_retries):
        try:
            response = client.embeddings.create(
                input=texts,
                model=model
            )
            return [item.embedding for item in response.data]
        except Exception as e:
            logger.error(f"Error getting embeddings (attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            else:
                raise

def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_faiss: bool = True):
    """Main function to build the FAISS index and data files"""
    settings = get_settings()

    # Initialize clients
    tmdb_client = TMDBClient(settings.tmdb_api_key)
    openai_client = OpenAI(api_key=settings.openai_api_key)

    # Create data directory
    os.makedirs("app/data", exist_ok=True)

    # Check for existing movie data checkpoint
    movies_data = load_checkpoint(MOVIE_DATA_CHECKPOINT)

    if movies_data is not None:
        logger.info(f"🔄 Resuming from checkpoint: {len(movies_data)} movies data found")
    else:
        # Step 1: Get movie IDs
        logger.info(f"Fetching movie IDs from TMDB (max {max_pages} pages)...")
        movie_ids = tmdb_client.get_popular_movies(max_pages=max_pages)

        if not movie_ids:
            logger.error("❌ No movie IDs retrieved from TMDB")
            return

        # Step 2: Fetch detailed movie data
        logger.info(f"Fetching detailed data for {len(movie_ids)} movies...")
        movies_data = fetch_movie_data(tmdb_client, movie_ids)

        if not movies_data:
            logger.error("❌ No movie data retrieved")
            return

        # Save movie data checkpoint
        save_checkpoint(movies_data, MOVIE_DATA_CHECKPOINT)

    # Step 3: Create composite texts and process embeddings in batches
    logger.info("Creating embeddings...")
    embeddings = []
    id_map = {}
    movie_metadata = {}
    processed_movie_ids = set()

    batch_size = 20  # Process 20 texts at a time

    # Check for existing embedding progress
    embedding_checkpoint = load_checkpoint(EMBEDDINGS_CHECKPOINT)
    metadata_checkpoint = load_checkpoint(METADATA_CHECKPOINT)

    if embedding_checkpoint is not None and metadata_checkpoint is not None:
        embeddings = embedding_checkpoint['embeddings']
        id_map = embedding_checkpoint['id_map']
        processed_movie_ids = set(embedding_checkpoint['processed_movie_ids'])
        movie_metadata = metadata_checkpoint
        logger.info(f"🔄 Resuming embeddings from checkpoint: {len(embeddings)} embeddings found")
    else:
        logger.info("Starting embeddings from scratch")

    # Process remaining movies
    remaining_movies = {k: v for k, v in movies_data.items() if k not in processed_movie_ids}
    logger.info(f"Processing {len(remaining_movies)} remaining movies")

    composite_texts = []
    current_movie_ids = []

    for movie_id, movie_data in remaining_movies.items():
        # Create composite text
        composite_text = create_composite_text(movie_data)
        composite_texts.append(composite_text)
        current_movie_ids.append(movie_id)

        # Store metadata
        release_year = 0
        if movie_data.get("release_date"):
            try:
                release_year = int(movie_data["release_date"][:4])
            except (ValueError, IndexError):
                release_year = 0

        movie_metadata[str(movie_id)] = {
            "id": movie_id,
            "title": movie_data.get("title", ""),
            "year": release_year,
            "poster_path": movie_data.get("poster_path"),
            "release_date": movie_data.get("release_date"),
            "genres": [g["name"] for g in movie_data.get("genres", [])]
        }

        # Process batch when full
        if len(composite_texts) >= batch_size:
            logger.info(f"Processing embedding batch ({len(embeddings)} done, {len(composite_texts)} in batch)")

            try:
                batch_embeddings = get_embeddings_batch(composite_texts, openai_client, model)
                embeddings.extend(batch_embeddings)

                # Update ID mapping and processed set
                for i, mid in enumerate(current_movie_ids):
                    id_map[str(mid)] = len(id_map)
                    processed_movie_ids.add(mid)

                # Save progress checkpoints
                embedding_data = {
                    'embeddings': embeddings,
                    'id_map': id_map,
                    'processed_movie_ids': list(processed_movie_ids)
                }
                save_checkpoint(embedding_data, EMBEDDINGS_CHECKPOINT)
                save_checkpoint(movie_metadata, METADATA_CHECKPOINT)

                # Clear batch
                composite_texts = []
                current_movie_ids = []

                # Sleep between batches
                time.sleep(0.5)

            except Exception as e:
                logger.error(f"Failed to process batch: {e}")
                logger.info("Progress has been saved, you can restart the script to resume")
                return

    # Process remaining texts
    if composite_texts:
        logger.info(f"Processing final embedding batch ({len(composite_texts)} texts)")
        try:
            batch_embeddings = get_embeddings_batch(composite_texts, openai_client, model)
            embeddings.extend(batch_embeddings)

            for i, mid in enumerate(current_movie_ids):
                id_map[str(mid)] = len(id_map)
                processed_movie_ids.add(mid)

            # Save final progress
            embedding_data = {
                'embeddings': embeddings,
                'id_map': id_map,
                'processed_movie_ids': list(processed_movie_ids)
            }
            save_checkpoint(embedding_data, EMBEDDINGS_CHECKPOINT)
            save_checkpoint(movie_metadata, METADATA_CHECKPOINT)

        except Exception as e:
            logger.error(f"Failed to process final batch: {e}")
            logger.info("Progress has been saved, you can restart the script to resume")
            return

    if not embeddings:
        logger.error("❌ No embeddings generated")
        return

    logger.info(f"Generated {len(embeddings)} embeddings")

    # Step 4: Save embeddings as numpy array
    embeddings_array = np.array(embeddings, dtype=np.float32)
    np.save("app/data/movies.npy", embeddings_array)
    logger.info(f"Saved embeddings matrix: {embeddings_array.shape}")

    # Step 5: Build and save FAISS index
    if use_faiss:
        logger.info("Building FAISS index...")
        dimension = embeddings_array.shape[1]

        # Choose index type based on size
        if len(embeddings) < 10000:
            # For smaller datasets, use flat index
            index = faiss.IndexFlatL2(dimension)
        else:
            # For larger datasets, use IVF index
            nlist = min(int(np.sqrt(len(embeddings))), 1000)
            quantizer = faiss.IndexFlatL2(dimension)
            index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
            # Train the index
            index.train(embeddings_array)

        index.add(embeddings_array)
        faiss.write_index(index, "app/data/faiss.index")
        logger.info(f"FAISS index saved (type: {type(index).__name__}, dimension: {dimension})")

    # Step 6: Save metadata files
    with open("app/data/id_map.json", "w") as f:
        json.dump(id_map, f)

    with open("app/data/movie_metadata.json", "w") as f:
        json.dump(movie_metadata, f)

    logger.info("✅ Index built successfully!")
    logger.info(f" - {len(embeddings)} movies indexed")
    logger.info(f" - Embedding model: {model}")
    logger.info(" - Files saved in app/data/")
    logger.info("   * movies.npy: embeddings matrix")
    logger.info("   * id_map.json: TMDB ID to matrix position mapping")
    logger.info("   * movie_metadata.json: movie metadata")
    if use_faiss:
        logger.info("   * faiss.index: FAISS search index")

    # Cleanup checkpoints
    cleanup_checkpoints()

# Remove the old functions that are no longer needed
# create_movie_embedding and load_movie_data are replaced by the new implementation

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build movie embeddings index from TMDB data")
    parser.add_argument("--max-pages", type=int, default=10,
                        help="Maximum pages to fetch from TMDB popular movies (default: 10)")
    parser.add_argument("--model", type=str, default="text-embedding-3-small",
                        help="OpenAI embedding model to use (default: text-embedding-3-small)")
    parser.add_argument("--no-faiss", action="store_true",
                        help="Skip building FAISS index")

    args = parser.parse_args()

    build_index(
        max_pages=args.max_pages,
        model=args.model,
        use_faiss=not args.no_faiss
    )
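For reference, a minimal sketch of the "Field: value / Field: value" text that `create_composite_text` produces, using a made-up movie dict (all field values are hypothetical). It assumes you run it from inside `app/`, the same way `test_setup.py` imports this module.

```python
# Hypothetical input: only a few of the supported TMDB fields, with invented values.
from build_index import create_composite_text  # run from inside app/, as test_setup.py does

sample = {
    "title": "Example Movie",
    "overview": "A short plot summary.",
    "release_date": "1999-10-15",
    "genres": [{"name": "Drama"}],
    "credits": {
        "crew": [{"name": "Some Director", "job": "Director"}],
        "cast": [{"name": "Lead Actor"}],
    },
}

# Fields are joined in the order defined above, so this sample yields:
# Title, Overview, Release Date, Genres, Director, Cast separated by " / ".
print(create_composite_text(sample))
```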
app/main.py
ADDED
@@ -0,0 +1,303 @@
import os
import json
import numpy as np
import faiss
from fastapi import FastAPI, HTTPException, Depends, status
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pydantic import BaseModel
from typing import List, Optional
import logging
import time

# Configure logging
logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO").upper())
logger = logging.getLogger(__name__)

# Security
security = HTTPBearer()

def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    expected_token = os.getenv("API_TOKEN")
    if not expected_token:
        raise HTTPException(status_code=500, detail="API token not configured")
    if credentials.credentials != expected_token:
        raise HTTPException(status_code=401, detail="Invalid token")
    return credentials.credentials

# Pydantic models
class ExploreRequest(BaseModel):
    liked_ids: List[int]
    disliked_ids: List[int] = []
    top_k: int = 400

class MovieResult(BaseModel):
    id: int
    title: str
    year: int
    poster_path: Optional[str]
    genres: List[str]
    coords: List[float]

class ExploreResponse(BaseModel):
    movies: List[MovieResult]
    bary: List[float]
    center: List[float]

# Global variables for loaded data
vectors = None
id_map = None
faiss_index = None
movie_metadata = None

def load_data():
    """Load FAISS index, vectors, and metadata on startup"""
    global vectors, id_map, faiss_index, movie_metadata

    try:
        # Load vectors
        vectors = np.load("app/data/movies.npy")
        logger.info(f"Loaded {vectors.shape[0]} movie vectors of dimension {vectors.shape[1]}")

        # Load ID mapping
        with open("app/data/id_map.json", "r") as f:
            id_map = json.load(f)
        logger.info(f"Loaded ID mapping for {len(id_map)} movies")

        # Load FAISS index
        faiss_index = faiss.read_index("app/data/faiss.index")
        logger.info(f"Loaded FAISS index with {faiss_index.ntotal} vectors")

        # Load movie metadata
        with open("app/data/movie_metadata.json", "r") as f:
            movie_metadata = json.load(f)
        logger.info(f"Loaded metadata for {len(movie_metadata)} movies")

    except Exception as e:
        logger.error(f"Failed to load data: {e}")
        raise

def build_plane(likes: np.ndarray, dislikes: np.ndarray = None, dim: int = 2):
    """
    Build user subspace from liked/disliked movies
    Returns (axes, center) where axes is a 2xD orthonormal matrix
    """
    n_likes = likes.shape[0] if likes is not None else 0
    d = vectors.shape[1]

    # Compute composite vector: +liked - 0.5*disliked
    if n_likes == 0:
        # Cold start: use global average
        center = vectors.mean(0)
        # Create random orthonormal basis
        axes = np.random.randn(dim, d)
        axes[0] /= np.linalg.norm(axes[0])
        for i in range(1, dim):
            for j in range(i):
                axes[i] -= np.dot(axes[i], axes[j]) * axes[j]
            axes[i] /= np.linalg.norm(axes[i])
    else:
        # Compute composite from likes and dislikes
        composite = likes.mean(0)
        if dislikes is not None and dislikes.shape[0] > 0:
            composite -= 0.5 * dislikes.mean(0)

        if n_likes == 1:
            # One like: use as center, random orthogonal axes
            center = composite
            axis1 = np.random.randn(d)
            axis1 /= np.linalg.norm(axis1)
            axis2 = np.random.randn(d)
            axis2 -= np.dot(axis2, axis1) * axis1
            axis2 /= np.linalg.norm(axis2)
            axes = np.vstack([axis1, axis2])
        elif n_likes == 2:
            # Two likes: line between them
            center = likes.mean(0)
            axis1 = likes[1] - likes[0]
            axis1 /= np.linalg.norm(axis1)
            axis2 = np.random.randn(d)
            axis2 -= np.dot(axis2, axis1) * axis1
            axis2 /= np.linalg.norm(axis2)
            axes = np.vstack([axis1, axis2])
        else:
            # 3+ likes: PCA plane
            center = likes.mean(0)
            likes_centered = likes - center
            u, s, vt = np.linalg.svd(likes_centered, full_matrices=False)
            axes = vt[:2]  # First 2 principal components

    return axes, center

def assign_spiral_coords(n_movies: int):
    """
    Assign 2D grid coordinates in outward spiral pattern
    Returns array of shape (n_movies, 2) with integer coordinates
    """
    coords = np.zeros((n_movies, 2), dtype=int)
    if n_movies == 0:
        return coords

    coords[0] = [0, 0]  # Start at origin

    if n_movies == 1:
        return coords

    # Spiral pattern: right, up, left, down, repeat with increasing distances
    dx, dy = [1, 0, -1, 0], [0, 1, 0, -1]
    direction = 0
    steps = 1
    x, y = 0, 0
    idx = 1

    while idx < n_movies:
        for _ in range(2):  # Each step count is used twice (except the first)
            for _ in range(steps):
                if idx >= n_movies:
                    break
                x += dx[direction]
                y += dy[direction]
                coords[idx] = [x, y]
                idx += 1
            direction = (direction + 1) % 4
            if idx >= n_movies:
                break
        steps += 1

    return coords

def compute_barycenter(liked_indices: List[int], coords: np.ndarray):
    """Compute barycenter of liked movies in 2D grid"""
    if not liked_indices:
        return [0.0, 0.0]

    liked_coords = coords[liked_indices]
    bary = liked_coords.mean(0)
    return bary.tolist()

# FastAPI app setup
app = FastAPI(title="Karl-Movie Vector Backend", version="1.0.0")

# CORS configuration
DEV_ORIGINS = [
    "http://localhost:5173",
    "http://127.0.0.1:5173",
    "http://localhost:8888",
    "https://*.bolt.run",
    "https://*.stackblitz.io",
]

PROD_ORIGINS = ["https://karl.movie"]

origins = DEV_ORIGINS if os.getenv("ENV") != "prod" else PROD_ORIGINS

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)

@app.on_event("startup")
async def startup_event():
    """Load data on startup"""
    load_data()

@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "vectors_loaded": vectors is not None}

@app.post("/explore", response_model=ExploreResponse)
async def explore(
    request: ExploreRequest,
    token: str = Depends(verify_token)
):
    """
    Main endpoint: find movies closest to user's preference subspace
    """
    start_time = time.time()

    try:
        # Convert TMDB IDs to internal indices
        liked_indices = []
        disliked_indices = []

        for tmdb_id in request.liked_ids:
            if str(tmdb_id) in id_map:
                liked_indices.append(id_map[str(tmdb_id)])
            else:
                logger.warning(f"TMDB ID {tmdb_id} not found in index")

        for tmdb_id in request.disliked_ids:
            if str(tmdb_id) in id_map:
                disliked_indices.append(id_map[str(tmdb_id)])
            else:
                logger.warning(f"TMDB ID {tmdb_id} not found in index")

        # Get embedding vectors
        liked_vectors = vectors[liked_indices] if liked_indices else None
        disliked_vectors = vectors[disliked_indices] if disliked_indices else None

        # Build user subspace
        axes, center = build_plane(liked_vectors, disliked_vectors)

        # Project all vectors onto the 2D subspace
        projections = np.dot(vectors - center, axes.T)  # Shape: (N, 2)

        # Reconstruct vectors in original space
        reconstructed = np.dot(projections, axes) + center

        # Compute distances to subspace (residuals)
        residuals = np.linalg.norm(vectors - reconstructed, axis=1)

        # Get top-k closest movies (clamp kth to a valid index in case top_k >= N)
        top_k_indices = np.argpartition(residuals, min(request.top_k, len(residuals) - 1))[:request.top_k]
        top_k_indices = top_k_indices[np.argsort(residuals[top_k_indices])]

        # Assign spiral coordinates
        spiral_coords = assign_spiral_coords(len(top_k_indices))

        # Compute barycenter of liked movies
        liked_positions = [i for i, idx in enumerate(top_k_indices) if idx in liked_indices]
        bary = compute_barycenter(liked_positions, spiral_coords)

        # Translate grid so barycenter is at origin
        spiral_coords = spiral_coords - np.array(bary)

        # Build response
        movies = []
        reverse_id_map = {v: k for k, v in id_map.items()}

        for i, movie_idx in enumerate(top_k_indices):
            tmdb_id = int(reverse_id_map[movie_idx])
            metadata = movie_metadata.get(str(tmdb_id), {})

            movie = MovieResult(
                id=tmdb_id,
                title=metadata.get("title", f"Movie {tmdb_id}"),
                year=metadata.get("year", 0),
                poster_path=metadata.get("poster_path"),
                genres=metadata.get("genres", []),
                coords=spiral_coords[i].tolist()
            )
            movies.append(movie)

        response = ExploreResponse(
            movies=movies,
            bary=[0.0, 0.0],  # Always [0, 0] since we translated
            center=center.tolist()
        )

        elapsed = time.time() - start_time
        logger.info(f"Explore request processed in {elapsed:.3f}s - {len(request.liked_ids)} likes, {len(request.disliked_ids)} dislikes, {len(movies)} results")

        return response

    except Exception as e:
        logger.error(f"Error processing explore request: {e}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
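A standalone toy sketch (not part of the app) of the ranking step in `/explore`: fit a 2-D plane through the liked embeddings, then score every movie by its residual distance to that plane, mirroring `build_plane` and the projection code above. The vectors here are random stand-ins, not real embeddings.

```python
# Toy data: random stand-ins for the real OpenAI embeddings loaded from movies.npy.
import numpy as np

rng = np.random.default_rng(0)
vectors = rng.normal(size=(1000, 8)).astype(np.float32)
likes = vectors[:3]  # pretend the user liked the first three movies

# PCA plane through the likes (the 3+ likes branch of build_plane)
center = likes.mean(0)
_, _, vt = np.linalg.svd(likes - center, full_matrices=False)
axes = vt[:2]

# Project every movie onto the plane and measure how far it sits from it
projections = (vectors - center) @ axes.T
reconstructed = projections @ axes + center
residuals = np.linalg.norm(vectors - reconstructed, axis=1)

# Smallest residuals = movies best explained by the user's preference plane
top_k = 10
best = np.argsort(residuals)[:top_k]
print(best, residuals[best].round(3))
```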
app/settings.py
ADDED
@@ -0,0 +1,35 @@
"""
Settings and environment configuration
"""
import os
from functools import lru_cache
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings"""

    # OpenAI API key for embeddings
    openai_api_key: str

    # TMDB API key for movie data
    tmdb_api_key: str

    # API authentication token
    api_token: str

    # Environment (dev/prod)
    env: str = "dev"

    # Logging level
    log_level: str = "INFO"

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"


@lru_cache()
def get_settings() -> Settings:
    """Get cached settings instance"""
    return Settings()
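A small sketch of how these settings resolve (the key values below are placeholders, not real credentials): with the three required variables present in the environment or in `.env`, `get_settings()` returns a cached `Settings` instance and the optional fields fall back to their defaults.

```python
# Placeholders only; in the real setup these come from the .env file shown in .env.example.
import os

os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder")
os.environ.setdefault("TMDB_API_KEY", "tmdb-placeholder")
os.environ.setdefault("API_TOKEN", "token-placeholder")

from settings import get_settings  # run from inside app/, like build_index.py

settings = get_settings()
print(settings.env, settings.log_level)  # defaults: "dev", "INFO"
assert get_settings() is settings  # lru_cache returns the same instance
```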
app/test_api.py
ADDED
@@ -0,0 +1,80 @@
"""
Tests the /explore API with sample movies
"""
import requests
import json

# Configuration
API_URL = "http://localhost:8000"
API_TOKEN = "your-api-token"  # Replace with your token

def test_explore_endpoint():
    """Test the /explore endpoint with different scenarios"""

    # Read the metadata to get some test IDs
    with open("app/data/movie_metadata.json", "r") as f:
        metadata = json.load(f)

    # Take the first movies as examples
    movie_ids = list(metadata.keys())[:5]
    print(f"Available test movies: {[metadata[mid]['title'] for mid in movie_ids]}")

    # Test 1: search with 1 liked movie
    print("\n🎬 Test 1: search with 1 liked movie")
    test_request = {
        "liked_ids": [int(movie_ids[0])],
        "disliked_ids": [],
        "top_k": 10
    }

    try:
        response = requests.post(
            f"{API_URL}/explore",
            json=test_request,
            headers={"Authorization": f"Bearer {API_TOKEN}"}
        )

        if response.status_code == 200:
            data = response.json()
            print(f"✅ Found {len(data['movies'])} similar movies")
            print(f"Liked movie: {metadata[movie_ids[0]]['title']}")
            print("Recommended movies:")
            for movie in data['movies'][:3]:
                print(f"  - {movie['title']} ({movie['year']}) - {movie['genres']}")
        else:
            print(f"❌ Error {response.status_code}: {response.text}")

    except Exception as e:
        print(f"❌ Connection error: {e}")
        print("💡 Check that your API_TOKEN is correct in the .env")

    # Test 2: search with 2 liked movies
    print("\n🎬 Test 2: search with 2 liked movies")
    test_request = {
        "liked_ids": [int(movie_ids[0]), int(movie_ids[1])],
        "disliked_ids": [],
        "top_k": 10
    }

    try:
        response = requests.post(
            f"{API_URL}/explore",
            json=test_request,
            headers={"Authorization": f"Bearer {API_TOKEN}"}
        )

        if response.status_code == 200:
            data = response.json()
            print(f"✅ Found {len(data['movies'])} similar movies")
            print(f"Liked movies: {metadata[movie_ids[0]]['title']}, {metadata[movie_ids[1]]['title']}")
            print("Barycenter:", data['bary'])
        else:
            print(f"❌ Error {response.status_code}: {response.text}")

    except Exception as e:
        print(f"❌ Error: {e}")

if __name__ == "__main__":
    print("🧪 Testing the /explore API")
    print("=" * 40)
    test_explore_endpoint()
app/test_setup.py
ADDED
@@ -0,0 +1,121 @@
"""
Test script for TMDB data loading and embedding generation
Run this to validate your setup before building the full index
"""
import os
import sys
import json
from settings import get_settings
from build_index import TMDBClient, create_composite_text, get_embeddings_batch
from openai import OpenAI

def test_tmdb_connection():
    """Test TMDB API connection"""
    print("🔍 Testing TMDB API connection...")

    try:
        settings = get_settings()
        tmdb_client = TMDBClient(settings.tmdb_api_key)

        # Test getting popular movies (just first page)
        movie_ids = tmdb_client.get_popular_movies(max_pages=1)

        if movie_ids:
            print(f"✅ Successfully fetched {len(movie_ids)} movie IDs from TMDB")

            # Test getting details for first movie
            movie_data = tmdb_client.get_movie_details(movie_ids[0])
            if movie_data:
                print(f"✅ Successfully fetched details for movie: {movie_data.get('title', 'Unknown')}")

                # Test getting credits
                credits = tmdb_client.get_movie_credits(movie_ids[0])
                if credits:
                    print(f"✅ Successfully fetched credits (cast: {len(credits.get('cast', []))}, crew: {len(credits.get('crew', []))})")
                else:
                    print("⚠️ Could not fetch credits")

                return movie_data, credits
            else:
                print("❌ Could not fetch movie details")
        else:
            print("❌ Could not fetch movie IDs")

    except Exception as e:
        print(f"❌ TMDB API error: {e}")

    return None, None

def test_composite_text(movie_data, credits):
    """Test composite text creation"""
    print("\n📝 Testing composite text creation...")

    if movie_data:
        # Add credits to movie data
        if credits:
            movie_data['credits'] = credits

        composite_text = create_composite_text(movie_data)
        print(f"✅ Generated composite text ({len(composite_text)} chars)")
        print(f"Preview: {composite_text[:200]}...")
        return composite_text
    else:
        print("❌ No movie data to test")
        return None

def test_embeddings(composite_text):
    """Test embedding generation"""
    print("\n🤖 Testing embedding generation...")

    if composite_text:
        try:
            settings = get_settings()
            openai_client = OpenAI(api_key=settings.openai_api_key)

            embeddings = get_embeddings_batch([composite_text], openai_client)

            if embeddings:
                embedding = embeddings[0]
                print(f"✅ Generated embedding (dimension: {len(embedding)})")
                print(f"Sample values: {embedding[:5]}...")
                return embedding
            else:
                print("❌ No embeddings generated")

        except Exception as e:
            print(f"❌ Embedding error: {e}")
    else:
        print("❌ No composite text to test")

    return None

def main():
    """Run all tests"""
    print("🎬 Karl Movie Vector Backend - Test Suite")
    print("=" * 50)

    # Test environment variables
    print("🔧 Checking environment variables...")
    try:
        settings = get_settings()
        print(f"✅ OpenAI API key: {'sk-...' + settings.openai_api_key[-10:] if settings.openai_api_key else 'Not set'}")
        print(f"✅ TMDB API key: {'...' + settings.tmdb_api_key[-10:] if settings.tmdb_api_key else 'Not set'}")
    except Exception as e:
        print(f"❌ Settings error: {e}")
        print("Make sure you have a .env file with OPENAI_API_KEY and TMDB_API_KEY")
        return

    # Run tests
    movie_data, credits = test_tmdb_connection()
    composite_text = test_composite_text(movie_data, credits)
    embedding = test_embeddings(composite_text)

    print("\n" + "=" * 50)
    if movie_data and composite_text and embedding:
        print("🎉 All tests passed! You can now run the full build:")
        print("   python app/build_index.py --max-pages 3")
    else:
        print("❌ Some tests failed. Check your API keys and internet connection.")

if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,12 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
numpy==1.24.4
faiss-cpu==1.7.4
openai==1.51.0
pydantic==2.11.5
pydantic-settings==2.9.1
python-multipart==0.0.6
requests==2.31.0
scikit-learn==1.3.2
python-dotenv==1.0.0
httpx==0.27.0