JJ Tsao committed on
Commit 123b5dd · 2 Parent(s): 1005046 a036209

Merge branch 'main' of https://huggingface.co/spaces/JJTsao/rag-movie-api
Dockerfile CHANGED
@@ -1,19 +1,22 @@
-# Use a slim Python base image
+# Use slim base image
 FROM python:3.10-slim
 
-# Set working directory
+# Create a dedicated cache directory and assign permissions
+RUN mkdir -p /home/user/.cache && chmod -R 777 /home/user/.cache
+
 WORKDIR /app
 
-# Install Python dependencies
+# Set cache env vars **before** installing anything
+ENV HF_HOME=/home/user/.cache/huggingface \
+    TRANSFORMERS_CACHE=/home/user/.cache/huggingface \
+    SENTENCE_TRANSFORMERS_HOME=/home/user/.cache/huggingface
+
+# Install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy all project files into the container
 COPY . .
 
-# Set environment to unbuffered (cleaner logs)
 ENV PYTHONUNBUFFERED=1
 
-# Run FastAPI app on port 7860 (required by HF Spaces)
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
-
 
app/chatbot.py ADDED
@@ -0,0 +1,80 @@
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from app.llm_services import call_chat_model_openai
+
+
+def sanitize_markdown(md_text: str) -> str:
+    return re.sub(r'!\[.*?\]\(.*?\)', '', md_text)
+
+
+def build_chat_fn(retriever, intent_classifier):
+    def chat(
+        question,
+        history,
+        media_type="movies",
+        genres=None,
+        providers=None,
+        year_range=None,
+    ):
+        full_t0 = time.time()
+
+        with ThreadPoolExecutor() as executor:
+            # Classify user intent to determine whether this is a recommendation request
+            t0 = time.time()
+            intent_future = executor.submit(
+                lambda q: intent_classifier(q)[0]["label"] == "recommendation", question
+            )
+            print(f"\n🧠 executor.submit(classify_intent) took {time.time() - t0:.3f}s")
+
+            # Embed the user query as a dense vector asynchronously
+            t0 = time.time()
+            query_vector_future = executor.submit(retriever.embed_dense, question)
+            print(f"🧵 executor.submit(embed_dense) took {time.time() - t0:.3f}s")
+
+            # Wait for results
+            t0 = time.time()
+            is_rec_intent = intent_future.result()
+            print(f"✅ classify_intent() result received in {time.time() - t0:.3f}s")
+
+            t0 = time.time()
+            dense_vector = query_vector_future.result()
+            print(f"📈 embed_dense() result received in {time.time() - t0:.3f}s")
+
+        # Embed the user query as a sparse vector for hybrid retrieval
+        t0 = time.time()
+        sparse_vector = retriever.embed_sparse(question, media_type)
+        print(f"📈 embed_sparse() result received in {time.time() - t0:.3f}s")
+
+        if is_rec_intent:
+            yield "[[MODE:recommendation]]\n"
+
+            t0 = time.time()
+            retrieved_movies = retriever.retrieve_and_rerank(
+                dense_vector,
+                sparse_vector,
+                media_type.lower(),
+                genres,
+                providers,
+                year_range,
+            )
+            print(f"\n📚 retrieve_and_rerank() took {time.time() - t0:.3f}s")
+
+            context = retriever.format_context(retrieved_movies)
+            user_message = f"{question}\n\nContext:\nBased on the following retrieved {media_type.lower()}, suggest the best recommendations.\n\n{context}"
+
+            print(f"✨ Total chat() prep time before streaming: {time.time() - full_t0:.3f}s")
+            for chunk in call_chat_model_openai(history, user_message):
+                yield chunk
+
+        else:
+            yield "[[MODE:chat]]\n"
+
+            user_message = f"The user did not ask for a recommendation. Ask them to be more specific. Answer this as a general question: {question}"
+
+            print(f"✨ Total chat() prep time before streaming: {time.time() - full_t0:.3f}s")
+            for chunk in call_chat_model_openai(history, user_message):
+                yield sanitize_markdown(chunk)
+
+    return chat
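A minimal wiring sketch for `build_chat_fn`, not part of this commit: the intent model id and the retriever import path are assumptions; `chat()` only requires an object exposing `embed_dense`, `embed_sparse`, `retrieve_and_rerank`, and `format_context`.

```python
from transformers import pipeline

from app.chatbot import build_chat_fn
from app.retriever import MovieRetriever  # hypothetical location of the retriever

# Assumed model id; any text-classification pipeline emitting a
# "recommendation" label would fit the interface chat() expects.
intent_classifier = pipeline("text-classification", model="some-org/intent-model")
retriever = MovieRetriever()

chat = build_chat_fn(retriever, intent_classifier)
for chunk in chat("Recommend a mind-bending sci-fi thriller.", history=[]):
    print(chunk, end="", flush=True)  # first chunk is the [[MODE:...]] marker
```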
app/llm_services.py ADDED
@@ -0,0 +1,99 @@
+import time
+
+import torch
+from openai import OpenAI
+from sentence_transformers import SentenceTransformer
+from app.config import EMBEDDING_MODEL, OPENAI_MODEL, OPENAI_API_KEY
+
+# === LLM Config ===
+_sentence_model = None  # Not loaded at import time
+
+# === Clients ===
+openai_client = OpenAI(api_key=OPENAI_API_KEY)
+
+# === System Prompt ===
+SYSTEM_PROMPT = """
+You are a professional film curator and critic. Your role is to analyze the user's preferences and recommend high-quality films or TV shows using only the provided list.
+
+Focus on:
+
+- Artistic merit and storytelling
+- Genres, themes, tone, and emotional resonance
+- IMDB and Rotten Tomatoes ratings
+- Strong character-driven or thematically rich selections
+
+### Response Format (in markdown):
+
+1. Start with a concise two-sentence **opening paragraph** that contextualizes the theme and the overall viewing experience the user is seeking. At the end of this paragraph, insert the token: <!-- END_INTRO -->.
+
+2. Then, for each recommendation, use the following format (repeat for each title). At the end of each movie recommendation block, insert the token: <!-- END_MOVIE -->:
+
+```
+### <Number>. <Movie Title>
+- POSTER_PATH: /abc123.jpg
+- BACKDROP_PATH: /abc123.jpg
+- GENRES: Genre1, Genre2, ...
+- IMDB_RATING: X.X
+- ROTTEN_TOMATOES_RATING: XX%
+- TRAILER_KEY: abc123
+- WHY_YOU_MIGHT_ENJOY_IT: <Short paragraph explaining the appeal based on character, themes, tone, and relevance to the user's intent.>
+<!-- END_MOVIE -->
+```
+
+3. End with a brief **closing paragraph** that summarizes the emotional or intellectual throughline across the recommendations and affirms their alignment with the user's preferences.
+
+Write in **Markdown** only. Be concise, authoritative, and avoid overly generic statements. Each "Why You Might Enjoy It" should be specific and grounded in the movie’s themes, storytelling, or cultural relevance.
+"""
+
+
+def load_sentence_model():
+    global _sentence_model
+    if _sentence_model is None:
+        print("⏳ Loading embedding model...")
+        _sentence_model = SentenceTransformer(
+            EMBEDDING_MODEL, device="cuda" if torch.cuda.is_available() else "cpu"
+        )
+
+        print(f"🔥 Model '{EMBEDDING_MODEL}' loaded. Performing GPU warmup...")
+
+        # Realistic multi-sentence warmup to trigger the full CUDA graph
+        warmup_sentences = [
+            "A suspenseful thriller with deep character development and moral ambiguity.",
+            "Coming-of-age story with emotional storytelling and strong ensemble performances.",
+            "Mind-bending sci-fi with philosophical undertones and high concept ideas.",
+            "Recommend me some comedies.",
+        ]
+        _ = _sentence_model.encode(warmup_sentences, show_progress_bar=False)
+        time.sleep(0.5)
+        _ = _sentence_model.encode(warmup_sentences, show_progress_bar=False)
+        print("🚀 Embedding model fully warmed up.")
+
+    return _sentence_model
+
+
+def embed_text(text: str) -> list[float]:
+    model = load_sentence_model()
+    return model.encode(text).tolist()
+
+
+def build_chat_history(history: list, max_turns: int = 5) -> list:
+    return [
+        {"role": msg.role, "content": msg.content}
+        for msg in history[-max_turns * 2:]
+    ]
+
+
+
+def call_chat_model_openai(history, user_message: str):
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    messages += build_chat_history(history or [])
+    messages.append({"role": "user", "content": user_message})
+
+    response = openai_client.chat.completions.create(
+        model=OPENAI_MODEL, messages=messages, temperature=0.7, stream=True
+    )
+
+    for chunk in response:
+        delta = chunk.choices[0].delta.content
+        if delta:
+            yield delta
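A minimal usage sketch of the two public entry points, not part of this commit; it assumes `OPENAI_API_KEY`, `OPENAI_MODEL`, and `EMBEDDING_MODEL` are set in `app.config`.

```python
from app.llm_services import call_chat_model_openai, embed_text

# call_chat_model_openai yields streamed deltas, so responses render
# incrementally; an empty history is valid for a first turn.
for delta in call_chat_model_openai(history=[], user_message="Suggest three heist films."):
    print(delta, end="", flush=True)

# embed_text lazily loads (and warms up) the SentenceTransformer on first call.
vector = embed_text("A suspenseful thriller with moral ambiguity.")
print(f"\nEmbedding dimension: {len(vector)}")
```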
data/bm25_files/movie_bm25_model.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:829dfd9b3e50992a2617415e4d45e05eb8b887206e77ff5250416b7cd5dac0d2
-size 6637836
+oid sha256:acf76920893471f0ee91cdc1c2fb20c42d8585f12dbc1dc10dcbeff2be720475
+size 291
data/bm25_files/movie_bm25_vocab.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d1304c2611aa1072156fe60aa0298934dc72a8f85c5f484ba49ff641223dadf
-size 688111
+oid sha256:2f721046d72d43dd9a6808f21e2dd04a174c7d67e89aad23f3696c5854fa3abc
+size 289
data/bm25_files/tv_bm25_model.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b7275086436201a11b2027fa28727d8d05cf6c3180bb68de2cd0deb12e5dc62
-size 5712382
+oid sha256:25a6ff1e336835bd87c6b53dc1732142193d5d75f28acafb6e94dd5db5718fc0
+size 291
data/bm25_files/tv_bm25_vocab.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db37c47df4a7f73e792c479d72095a62a35389be3ed01997aa60ab57d0e320a5
-size 497815
+oid sha256:dee6416043ca93b626ee9816b992c22024138e8a6d707b1b1ee001b121a8268c
+size 289