yonnel
Add automatic data generation on startup for Hugging Face deployment
b1c879a
#!/usr/bin/env python3
"""
Startup script that builds the index if data files don't exist,
then starts the FastAPI application.
"""
import os
import subprocess
import sys
import logging
# Root logging setup: INFO and above, rendered as "<time> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
def check_data_files():
    """Report which of the required data files are absent.

    The paths are relative, so they are resolved against the current
    working directory.

    Returns:
        A list of the required paths that do not exist, in declaration
        order. The list is empty when every file is present.
    """
    expected = (
        "app/data/faiss.index",
        "app/data/movies.npy",
        "app/data/id_map.json",
        "app/data/movie_metadata.json",
    )
    return [path for path in expected if not os.path.exists(path)]
def build_index(max_pages=5):
    """Build the movie index and data files by running ``app.build_index``.

    The builder is launched as a child process with the current interpreter
    (``python -m app.build_index --max-pages N``). Its output is captured
    and written to the log once the process has finished.

    Args:
        max_pages: Value forwarded to the builder's ``--max-pages`` option.
            Defaults to 5, a reduced dataset for faster startup on HF.

    Raises:
        subprocess.CalledProcessError: If the builder exits with a non-zero
            status. The exit status and all captured output are logged
            before the exception is re-raised.
    """
    logger.info("πŸ”§ Building movie index and data files...")
    command = [
        sys.executable, "-m", "app.build_index",
        "--max-pages", str(max_pages),
    ]
    try:
        result = subprocess.run(
            command, check=True, capture_output=True, text=True
        )
    except subprocess.CalledProcessError as e:
        logger.error("❌ Failed to build index:")
        logger.error("Exit status: %s", e.returncode)
        # Output was captured rather than streamed, so anything not logged
        # here is lost; the builder may report problems on either stream.
        if e.stdout:
            logger.error(e.stdout)
        if e.stderr:
            logger.error(e.stderr)
        raise
    logger.info("βœ… Index built successfully!")
    if result.stdout:
        logger.info(result.stdout)
def start_api():
    """Replace this process with uvicorn serving ``app.main:app``.

    The server is started as ``python -m uvicorn`` with the current
    interpreter, listening on 0.0.0.0:7860. Because ``os.execv`` replaces
    the running process image, this function does not return on success.
    """
    logger.info("πŸš€ Starting FastAPI application...")
    interpreter = sys.executable
    uvicorn_argv = [
        interpreter, "-m", "uvicorn",
        "app.main:app",
        "--host", "0.0.0.0",
        "--port", "7860",
    ]
    os.execv(interpreter, uvicorn_argv)
def main():
    """Make sure the data files exist, building them if needed, then start the API.

    Exits with status 1 if files are still missing after the build.
    """
    logger.info("🎬 Karl Movie Vector Backend - Starting up...")

    absent = check_data_files()
    if not absent:
        logger.info("βœ… All data files present, skipping index build")
    else:
        logger.info(f"πŸ“ Missing data files: {absent}")
        logger.info("πŸ”„ This is the first startup - building index...")
        build_index()

        # Re-check: the build finishing does not prove every file was written.
        still_absent = check_data_files()
        if still_absent:
            logger.error(f"❌ Still missing files after build: {still_absent}")
            sys.exit(1)

    # Data is in place; hand over to the web server.
    start_api()


if __name__ == "__main__":
    main()