Tim Luka Horstmann committed
Commit a29c4ff · 1 Parent(s): 83ec808

Switched to Llama-3.2-1B Q4_K, added impersonation, optimized performance

Files changed (1):
  1. app.py +32 -20
app.py CHANGED
@@ -19,31 +19,33 @@ app = FastAPI()
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
-    logger.error("HF_TOKEN environment variable not set. Required for gated models.")
+    logger.error("HF_TOKEN environment variable not set.")
     raise ValueError("HF_TOKEN not set")
 login(token=hf_token)
 
+# Models
+sentence_transformer_model = "all-MiniLM-L6-v2"
+repo_id = "bartowski/Llama-3.2-1B-Instruct-GGUF"
+filename = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"  # Q4_K for speed
+
 try:
-    # Load precomputed CV embeddings and build FAISS index
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
     cv_chunks = [item["chunk"] for item in cv_data]
     cv_embeddings = np.array([item["embedding"] for item in cv_data]).astype('float32')
-    faiss.normalize_L2(cv_embeddings)  # Normalize for cosine similarity
-    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])  # Inner Product for cosine similarity
+    faiss.normalize_L2(cv_embeddings)
+    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])
     faiss_index.add(cv_embeddings)
     logger.info("FAISS index built successfully")
-    logger.info("CV embeddings loaded successfully")
 
     # Load embedding model
     logger.info("Loading SentenceTransformer model")
-    embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
+    embedder = SentenceTransformer(sentence_transformer_model, device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load Qwen 2.5 7B model
-    repo_id = "bartowski/Llama-3.2-3B-Instruct-GGUF"  # "bartowski/Qwen2.5-7B-Instruct-GGUF"
-    filename = "Llama-3.2-3B-Instruct-Q6_K_L.gguf"  # "Qwen2.5-7B-Instruct-Q4_K_M.gguf"
+    # Load Llama model
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
         repo_id=repo_id,
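The normalize-then-inner-product pattern in this hunk is the standard way to get cosine similarity out of FAISS: after faiss.normalize_L2, every row has unit length, so an inner product equals the cosine. A minimal standalone sketch with toy vectors (not the app's real CV embeddings):

import faiss
import numpy as np

# Toy 2-D "embeddings"; the app loads real ones from cv_embeddings.json.
vecs = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 1.0]], dtype="float32")
faiss.normalize_L2(vecs)                  # in place: each row now has unit norm
index = faiss.IndexFlatIP(vecs.shape[1])  # inner product of unit vectors == cosine
index.add(vecs)

query = np.array([[2.0, 0.0]], dtype="float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 2)
print(ids[0], scores[0])  # row 0 first with cosine 1.0, then row 1 with ~0.707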
@@ -54,11 +56,11 @@ try:
 
     generator = Llama(
         model_path=model_path,
-        n_ctx=2048,
-        n_threads=4,
-        n_batch=512,  # Increase batch size for faster eval
-        n_gpu_layers=0,  # Explicitly set to 0 (no GPU in HF Spaces)
-        verbose=True,  # Keep for perf logging
+        n_ctx=1024,  # Reduced for speed
+        n_threads=2,  # Match HF Spaces vCPUs
+        n_batch=512,
+        n_gpu_layers=0,
+        verbose=True,
     )
     logger.info(f"{filename} model loaded")
 
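Here n_ctx=1024 and n_threads=2 trade context length for speed on the free-tier CPU. A quick way to check whether such settings actually pay off is a tokens-per-second probe with llama-cpp-python; a rough sketch (the local GGUF path is illustrative, in app.py it comes from hf_hub_download):

import time
from llama_cpp import Llama

# Illustrative local path; adjust to wherever the GGUF lives.
llm = Llama(
    model_path="Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    n_ctx=1024, n_threads=2, n_batch=512, n_gpu_layers=0, verbose=False,
)

start = time.perf_counter()
out = llm("Briefly introduce yourself.", max_tokens=64)
elapsed = time.perf_counter() - start
print(f"{out['usage']['completion_tokens'] / elapsed:.1f} tokens/s")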
@@ -68,7 +70,6 @@ except Exception as e:
 
 def retrieve_context(query, top_k=3):
     try:
-        # Encode query and normalize for FAISS
         query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
         query_embedding = query_embedding.reshape(1, -1)
         faiss.normalize_L2(query_embedding)
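The hunk cuts off mid-function; the rest of retrieve_context is unchanged and so not shown in the diff. For orientation, a plausible shape of the full function, assuming the faiss_index and cv_chunks built at startup (the search-and-join part is a reconstruction, not the committed code):

def retrieve_context(query, top_k=3):
    try:
        # Embed and normalize the query exactly like the corpus vectors.
        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
        query_embedding = query_embedding.reshape(1, -1)
        faiss.normalize_L2(query_embedding)
        # Reconstructed remainder: top-k lookup, then join the matching CV chunks.
        _, indices = faiss_index.search(query_embedding, top_k)
        return "\n".join(cv_chunks[i] for i in indices[0])
    except Exception as e:
        logger.error(f"retrieve_context failed: {e}")
        return ""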
@@ -83,8 +84,9 @@ def stream_response(query):
     logger.info(f"Processing query: {query}")
     context = retrieve_context(query)
     prompt = (
-        f"<|im_start|>system\nYou are a helpful assistant.\nI am Tim Luka Horstmann, a German Computer Scientist. Based on my CV:\n{context}\n<|im_end|>\n"
-        f"<|im_start|>user\nQuestion: {query}\nAnswer:<|im_end|>\n"
+        f"<|im_start|>system\nI am Tim Luka Horstmann, a German Computer Scientist. This is my CV:\n{context}\n"
+        f"I will answer your questions about my CV as myself. Please ask me anything!\n<|im_end|>\n"
+        f"<|im_start|>user\n{query}\n<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
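The prompt is hand-rolled ChatML, which is easy to get subtly wrong (a marker missing its closing pipe silently degrades output). An alternative is llama-cpp-python's create_chat_completion, which applies the chat template recorded in the GGUF metadata, so the markers never have to be maintained by hand; a sketch assuming the generator and retrieve_context defined above:

query = "What did you study?"  # example question
messages = [
    {"role": "system",
     "content": f"I am Tim Luka Horstmann, a German Computer Scientist. "
                f"This is my CV:\n{retrieve_context(query)}"},
    {"role": "user", "content": query},
]
for chunk in generator.create_chat_completion(
    messages=messages, stream=True, max_tokens=512, temperature=0.7, top_p=0.9
):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)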
@@ -93,8 +95,8 @@ def stream_response(query):
         max_tokens=512,
         stream=True,
         stop=["<|im_end|>", "[DONE]"],
-        temperature=0.7,  # Slightly lower for consistency
-        top_p=0.9,  # Narrow sampling for faster generation
+        temperature=0.7,
+        top_p=0.9,
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"
@@ -117,10 +119,20 @@ async def predict(request: QueryRequest):
 async def health_check():
     return {"status": "healthy"}
 
+@app.get("/model_info")
+async def model_info():
+    return {
+        "model_name": "Llama-3.2-1B-Instruct-GGUF",
+        "model_size": "1B",
+        "embedding_model": sentence_transformer_model,
+        "faiss_index_size": len(cv_chunks),
+        "faiss_index_dim": cv_embeddings.shape[1],
+    }
+
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
-    dummy_query = "Hello, please warm up your model."
+    dummy_query = "Hi"  # Shorter prompt
     for _ in stream_response(dummy_query):
         pass
     logger.info("Model warm-up complete.")