JJ Tsao committed on
Commit 123b5dd · 2 Parent(s): 1005046 a036209

Merge branch 'main' of https://huggingface.co/spaces/JJTsao/rag-movie-api
Dockerfile CHANGED
@@ -1,19 +1,22 @@
-# Use a slim Python base image
+# Use slim base image
 FROM python:3.10-slim
 
-# Set working directory
+# Create a dedicated cache directory and assign permissions
+RUN mkdir -p /home/user/.cache && chmod -R 777 /home/user/.cache
+
 WORKDIR /app
 
-# Install Python dependencies
+# Set cache env vars **before** installing anything
+ENV HF_HOME=/home/user/.cache/huggingface \
+    TRANSFORMERS_CACHE=/home/user/.cache/huggingface \
+    SENTENCE_TRANSFORMERS_HOME=/home/user/.cache/huggingface
+
+# Install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy all project files into the container
 COPY . .
 
-# Set environment to unbuffered (cleaner logs)
 ENV PYTHONUNBUFFERED=1
 
-# Run FastAPI app on port 7860 (required by HF Spaces)
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
-
 
app/chatbot.py ADDED
@@ -0,0 +1,80 @@
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from app.llm_services import call_chat_model_openai
+
+
+def sanitize_markdown(md_text: str) -> str:
+    return re.sub(r'!\[.*?\]\(.*?\)', '', md_text)
+
+
+def build_chat_fn(retriever, intent_classifier):
+    def chat(
+        question,
+        history,
+        media_type="movies",
+        genres=None,
+        providers=None,
+        year_range=None,
+    ):
+        full_t0 = time.time()
+
+        with ThreadPoolExecutor() as executor:
+            # Classify user intent to determine whether this is a recommendation request
+            t0 = time.time()
+            intent_future = executor.submit(
+                lambda q: intent_classifier(q)[0]["label"] == "recommendation", question
+            )
+            print(f"\n🧠 executor.submit(classify_intent) took {time.time() - t0:.3f}s")
+
+            # Embed the user query as a dense vector asynchronously
+            t0 = time.time()
+            query_vector_future = executor.submit(retriever.embed_dense, question)
+            print(f"🧵 executor.submit(embed_dense) took {time.time() - t0:.3f}s")
+
+            # Wait for results
+            t0 = time.time()
+            is_rec_intent = intent_future.result()
+            print(f"✅ classify_intent() result received in {time.time() - t0:.3f}s")
+
+            t0 = time.time()
+            dense_vector = query_vector_future.result()
+            print(f"📈 embed_dense() result received in {time.time() - t0:.3f}s")
+
+        # Embed the user query as a sparse vector for hybrid retrieval
+        t0 = time.time()
+        sparse_vector = retriever.embed_sparse(question, media_type)
+        print(f"📈 embed_sparse() result received in {time.time() - t0:.3f}s")
+
+        if is_rec_intent:
+            yield "[[MODE:recommendation]]\n"
+
+            t0 = time.time()
+            retrieved_movies = retriever.retrieve_and_rerank(
+                dense_vector,
+                sparse_vector,
+                media_type.lower(),
+                genres,
+                providers,
+                year_range,
+            )
+            print(f"\n📚 retrieve_and_rerank() took {time.time() - t0:.3f}s")
+
+            context = retriever.format_context(retrieved_movies)
+            user_message = f"{question}\n\nContext:\nBased on the following retrieved {media_type.lower()}, suggest the best recommendations.\n\n{context}"
+
+            print(f"✨ Total chat() prep time before streaming: {time.time() - full_t0:.3f}s")
+            for chunk in call_chat_model_openai(history, user_message):
+                yield chunk
+
+        else:
+            yield "[[MODE:chat]]\n"
+
+            user_message = f"The user did not ask for a recommendation. Ask them to be more specific. Answer this as a general question: {question}"
+
+            print(f"✨ Total chat() prep time before streaming: {time.time() - full_t0:.3f}s")
+            for chunk in call_chat_model_openai(history, user_message):
+                yield sanitize_markdown(chunk)
+
+    return chat
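A minimal wiring sketch for `build_chat_fn`, not part of this commit: the intent model id and the retriever import path are assumptions; `chat()` only requires an object exposing `embed_dense`, `embed_sparse`, `retrieve_and_rerank`, and `format_context`.

```python
from transformers import pipeline

from app.chatbot import build_chat_fn
from app.retriever import MovieRetriever  # hypothetical location of the retriever

# Assumed model id; any text-classification pipeline emitting a
# "recommendation" label would fit the interface chat() expects.
intent_classifier = pipeline("text-classification", model="some-org/intent-model")
retriever = MovieRetriever()

chat = build_chat_fn(retriever, intent_classifier)
for chunk in chat("Recommend a mind-bending sci-fi thriller.", history=[]):
    print(chunk, end="", flush=True)  # first chunk is the [[MODE:...]] marker
```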
app/llm_services.py ADDED
@@ -0,0 +1,99 @@
+import time
+
+import torch
+from openai import OpenAI
+from sentence_transformers import SentenceTransformer
+from app.config import EMBEDDING_MODEL, OPENAI_MODEL, OPENAI_API_KEY
+
+# === LLM Config ===
+_sentence_model = None  # Not loaded at import time
+
+# === Clients ===
+openai_client = OpenAI(api_key=OPENAI_API_KEY)
+
+# === System Prompt ===
+SYSTEM_PROMPT = """
+You are a professional film curator and critic. Your role is to analyze the user's preferences and recommend high-quality films or TV shows using only the provided list.
+
+Focus on:
+
+- Artistic merit and storytelling
+- Genres, themes, tone, and emotional resonance
+- IMDB and Rotten Tomatoes ratings
+- Strong character-driven or thematically rich selections
+
+### Response Format (in markdown):
+
+1. Start with a concise two-sentence **opening paragraph** that contextualizes the theme and the overall viewing experience the user is seeking. At the end of this paragraph, insert the token: <!-- END_INTRO -->.
+
+2. Then, for each recommendation, use the following format (repeat for each title). At the end of each movie recommendation block, insert the token: <!-- END_MOVIE -->:
+
+```
+### <Number>. <Movie Title>
+- POSTER_PATH: /abc123.jpg
+- BACKDROP_PATH: /abc123.jpg
+- GENRES: Genre1, Genre2, ...
+- IMDB_RATING: X.X
+- ROTTEN_TOMATOES_RATING: XX%
+- TRAILER_KEY: abc123
+- WHY_YOU_MIGHT_ENJOY_IT: <Short paragraph explaining the appeal based on character, themes, tone, and relevance to the user's intent.>
+<!-- END_MOVIE -->
+```
+
+3. End with a brief **closing paragraph** that summarizes the emotional or intellectual throughline across the recommendations and affirms their alignment with the user's preferences.
+
+Write in **Markdown** only. Be concise, authoritative, and avoid overly generic statements. Each "Why You Might Enjoy It" should be specific and grounded in the movie’s themes, storytelling, or cultural relevance.
+"""
+
+
+def load_sentence_model():
+    global _sentence_model
+    if _sentence_model is None:
+        print("⏳ Loading embedding model...")
+        _sentence_model = SentenceTransformer(
+            EMBEDDING_MODEL, device="cuda" if torch.cuda.is_available() else "cpu"
+        )
+
+        print(f"🔥 Model '{EMBEDDING_MODEL}' loaded. Performing GPU warmup...")
+
+        # Realistic multi-sentence warmup to trigger the full CUDA graph
+        warmup_sentences = [
+            "A suspenseful thriller with deep character development and moral ambiguity.",
+            "Coming-of-age story with emotional storytelling and strong ensemble performances.",
+            "Mind-bending sci-fi with philosophical undertones and high concept ideas.",
+            "Recommend me some comedies.",
+        ]
+        _ = _sentence_model.encode(warmup_sentences, show_progress_bar=False)
+        time.sleep(0.5)
+        _ = _sentence_model.encode(warmup_sentences, show_progress_bar=False)
+        print("🚀 Embedding model fully warmed up.")
+
+    return _sentence_model
+
+
+def embed_text(text: str) -> list[float]:
+    model = load_sentence_model()
+    return model.encode(text).tolist()
+
+
+def build_chat_history(history: list, max_turns: int = 5) -> list:
+    return [
+        {"role": msg.role, "content": msg.content}
+        for msg in history[-max_turns * 2:]
+    ]
+
+
+
+def call_chat_model_openai(history, user_message: str):
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    messages += build_chat_history(history or [])
+    messages.append({"role": "user", "content": user_message})
+
+    response = openai_client.chat.completions.create(
+        model=OPENAI_MODEL, messages=messages, temperature=0.7, stream=True
+    )
+
+    for chunk in response:
+        delta = chunk.choices[0].delta.content
+        if delta:
+            yield delta
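A minimal usage sketch of the two public entry points, not part of this commit; it assumes `OPENAI_API_KEY`, `OPENAI_MODEL`, and `EMBEDDING_MODEL` are set in `app.config`.

```python
from app.llm_services import call_chat_model_openai, embed_text

# call_chat_model_openai yields streamed deltas, so responses render
# incrementally; an empty history is valid for a first turn.
for delta in call_chat_model_openai(history=[], user_message="Suggest three heist films."):
    print(delta, end="", flush=True)

# embed_text lazily loads (and warms up) the SentenceTransformer on first call.
vector = embed_text("A suspenseful thriller with moral ambiguity.")
print(f"\nEmbedding dimension: {len(vector)}")
```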
data/bm25_files/movie_bm25_model.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:829dfd9b3e50992a2617415e4d45e05eb8b887206e77ff5250416b7cd5dac0d2
-size 6637836
+oid sha256:acf76920893471f0ee91cdc1c2fb20c42d8585f12dbc1dc10dcbeff2be720475
+size 291
data/bm25_files/movie_bm25_vocab.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5d1304c2611aa1072156fe60aa0298934dc72a8f85c5f484ba49ff641223dadf
-size 688111
+oid sha256:2f721046d72d43dd9a6808f21e2dd04a174c7d67e89aad23f3696c5854fa3abc
+size 289
data/bm25_files/tv_bm25_model.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b7275086436201a11b2027fa28727d8d05cf6c3180bb68de2cd0deb12e5dc62
-size 5712382
+oid sha256:25a6ff1e336835bd87c6b53dc1732142193d5d75f28acafb6e94dd5db5718fc0
+size 291
data/bm25_files/tv_bm25_vocab.joblib CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db37c47df4a7f73e792c479d72095a62a35389be3ed01997aa60ab57d0e320a5
-size 497815
+oid sha256:dee6416043ca93b626ee9816b992c22024138e8a6d707b1b1ee001b121a8268c
+size 289