Tim Luka Horstmann committed
Commit a29c4ff · 1 Parent(s): 83ec808

Switched to Llama-3.2-1B Q4_K, added impersonation, optimized performance

Files changed (1):
  1. app.py +32 -20
app.py CHANGED
@@ -19,31 +19,33 @@ app = FastAPI()
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
-    logger.error("HF_TOKEN environment variable not set. Required for gated models.")
+    logger.error("HF_TOKEN environment variable not set.")
     raise ValueError("HF_TOKEN not set")
 login(token=hf_token)
 
+# Models
+sentence_transformer_model = "all-MiniLM-L6-v2"
+repo_id = "bartowski/Llama-3.2-1B-Instruct-GGUF"
+filename = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"  # Q4_K for speed
+
 try:
-    # Load precomputed CV embeddings and build FAISS index
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
     cv_chunks = [item["chunk"] for item in cv_data]
     cv_embeddings = np.array([item["embedding"] for item in cv_data]).astype('float32')
-    faiss.normalize_L2(cv_embeddings)  # Normalize for cosine similarity
-    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])  # Inner Product for cosine similarity
+    faiss.normalize_L2(cv_embeddings)
+    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])
     faiss_index.add(cv_embeddings)
     logger.info("FAISS index built successfully")
-    logger.info("CV embeddings loaded successfully")
 
     # Load embedding model
     logger.info("Loading SentenceTransformer model")
-    embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
+    embedder = SentenceTransformer(sentence_transformer_model, device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load Qwen 2.5 7B model
-    repo_id = "bartowski/Llama-3.2-3B-Instruct-GGUF"  # "bartowski/Qwen2.5-7B-Instruct-GGUF"
-    filename = "Llama-3.2-3B-Instruct-Q6_K_L.gguf"  # "Qwen2.5-7B-Instruct-Q4_K_M.gguf"
+    # Load Llama model
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
         repo_id=repo_id,
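The normalize-then-inner-product pattern in this hunk is the standard way to get cosine similarity out of FAISS: after faiss.normalize_L2, every row has unit length, so an inner product equals the cosine. A minimal standalone sketch with toy vectors (not the app's real CV embeddings):

import faiss
import numpy as np

# Toy 2-D "embeddings"; the app loads real ones from cv_embeddings.json.
vecs = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 1.0]], dtype="float32")
faiss.normalize_L2(vecs)                  # in place: each row now has unit norm
index = faiss.IndexFlatIP(vecs.shape[1])  # inner product of unit vectors == cosine
index.add(vecs)

query = np.array([[2.0, 0.0]], dtype="float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 2)
print(ids[0], scores[0])  # row 0 first with cosine 1.0, then row 1 with ~0.707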
@@ -54,11 +56,11 @@ try:
 
     generator = Llama(
         model_path=model_path,
-        n_ctx=2048,
-        n_threads=4,
-        n_batch=512,  # Increase batch size for faster eval
-        n_gpu_layers=0,  # Explicitly set to 0 (no GPU in HF Spaces)
-        verbose=True,  # Keep for perf logging
+        n_ctx=1024,  # Reduced for speed
+        n_threads=2,  # Match HF Spaces vCPUs
+        n_batch=512,
+        n_gpu_layers=0,
+        verbose=True,
     )
     logger.info(f"{filename} model loaded")
 
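Here n_ctx=1024 and n_threads=2 trade context length for speed on the free-tier CPU. A quick way to check whether such settings actually pay off is a tokens-per-second probe with llama-cpp-python; a rough sketch (the local GGUF path is illustrative, in app.py it comes from hf_hub_download):

import time
from llama_cpp import Llama

# Illustrative local path; adjust to wherever the GGUF lives.
llm = Llama(
    model_path="Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    n_ctx=1024, n_threads=2, n_batch=512, n_gpu_layers=0, verbose=False,
)

start = time.perf_counter()
out = llm("Briefly introduce yourself.", max_tokens=64)
elapsed = time.perf_counter() - start
print(f"{out['usage']['completion_tokens'] / elapsed:.1f} tokens/s")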
@@ -68,7 +70,6 @@ except Exception as e:
 
 def retrieve_context(query, top_k=3):
     try:
-        # Encode query and normalize for FAISS
         query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
         query_embedding = query_embedding.reshape(1, -1)
         faiss.normalize_L2(query_embedding)
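The hunk cuts off mid-function; the rest of retrieve_context is unchanged and so not shown in the diff. For orientation, a plausible shape of the full function, assuming the faiss_index and cv_chunks built at startup (the search-and-join part is a reconstruction, not the committed code):

def retrieve_context(query, top_k=3):
    try:
        # Embed and normalize the query exactly like the corpus vectors.
        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
        query_embedding = query_embedding.reshape(1, -1)
        faiss.normalize_L2(query_embedding)
        # Reconstructed remainder: top-k lookup, then join the matching CV chunks.
        _, indices = faiss_index.search(query_embedding, top_k)
        return "\n".join(cv_chunks[i] for i in indices[0])
    except Exception as e:
        logger.error(f"retrieve_context failed: {e}")
        return ""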
@@ -83,8 +84,9 @@ def stream_response(query):
     logger.info(f"Processing query: {query}")
     context = retrieve_context(query)
     prompt = (
-        f"<|im_start|>system\nYou are a helpful assistant.\nI am Tim Luka Horstmann, a German Computer Scientist. Based on my CV:\n{context}\n<|im_end|>\n"
-        f"<|im_start|>user\nQuestion: {query}\nAnswer:<|im_end|>\n"
+        f"<|im_start|>system\nI am Tim Luka Horstmann, a German Computer Scientist. This is my CV:\n{context}\n"
+        f"I will answer your questions about my CV as myself. Please ask me anything!\n<|im_end|>\n"
+        f"<|im_start|>user\n{query}\n<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
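The prompt is hand-rolled ChatML, which is easy to get subtly wrong (a marker missing its closing pipe silently degrades output). An alternative is llama-cpp-python's create_chat_completion, which applies the chat template recorded in the GGUF metadata, so the markers never have to be maintained by hand; a sketch assuming the generator and retrieve_context defined above:

query = "What did you study?"  # example question
messages = [
    {"role": "system",
     "content": f"I am Tim Luka Horstmann, a German Computer Scientist. "
                f"This is my CV:\n{retrieve_context(query)}"},
    {"role": "user", "content": query},
]
for chunk in generator.create_chat_completion(
    messages=messages, stream=True, max_tokens=512, temperature=0.7, top_p=0.9
):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)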
@@ -93,8 +95,8 @@ def stream_response(query):
         max_tokens=512,
         stream=True,
         stop=["<|im_end|>", "[DONE]"],
-        temperature=0.7,  # Slightly lower for consistency
-        top_p=0.9,  # Narrow sampling for faster generation
+        temperature=0.7,
+        top_p=0.9,
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"
@@ -117,10 +119,20 @@ async def predict(request: QueryRequest):
 async def health_check():
     return {"status": "healthy"}
 
+@app.get("/model_info")
+async def model_info():
+    return {
+        "model_name": "Llama-3.2-1B-Instruct-GGUF",
+        "model_size": "1B",
+        "embedding_model": sentence_transformer_model,
+        "faiss_index_size": len(cv_chunks),
+        "faiss_index_dim": cv_embeddings.shape[1],
+    }
+
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
-    dummy_query = "Hello, please warm up your model."
+    dummy_query = "Hi"  # Shorter prompt
     for _ in stream_response(dummy_query):
         pass
     logger.info("Model warm-up complete.")