Tim Luka Horstmann committed
Commit 58272f8 · 1 Parent(s): aa6b888

Fixed Qwen 2.5 1.5B with llama_cpp for HF Spaces

Files changed (1)
  1. app.py +13 -12
app.py CHANGED
@@ -22,7 +22,7 @@ hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     logger.error("HF_TOKEN environment variable not set. Required for gated models.")
     raise ValueError("HF_TOKEN not set")
-login(token=hf_token) # Set token for huggingface_hub
+login(token=hf_token)
 
 try:
     # Load precomputed CV embeddings
@@ -39,20 +39,20 @@ try:
     embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load Gemma 3 1B model with llama_cpp
-    logger.info("Loading Gemma 3 1B model")
+    # Load Qwen 2.5 1.5B model with llama_cpp
+    logger.info("Loading Qwen 2.5 1.5B model")
     model_path = hf_hub_download(
-        repo_id="google/gemma-3-1b-it-qat-q4_0-gguf",
-        filename="gemma-3-1b-it-q4_0.gguf",
-        local_dir="/app/cache" if os.getenv("HF_HOME") else None, # Use cache dir in Docker
+        repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+        filename="qwen2.5-1.5b-instruct-q4_0.gguf",
+        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
     generator = Llama(
         model_path=model_path,
-        n_ctx=2048, # Context length
-        n_threads=4, # Adjust based on CPU cores
+        n_ctx=2048,
+        n_threads=4,
     )
-    logger.info("Gemma 3 1B model loaded")
+    logger.info("Qwen 2.5 1.5B model loaded")
 
 except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
@@ -74,8 +74,9 @@ def stream_response(query):
     logger.info(f"Processing query: {query}")
     context = retrieve_context(query)
     prompt = (
-        f"I am Tim Luka Horstmann, a German Computer Scientist. Based on my CV:\n{context}\n\n"
-        f"Question: {query}\nAnswer:"
+        f"<|im_start|>system\nYou are a helpful assistant.\nI am Tim Luka Horstmann, a German Computer Scientist. Based on my CV:\n{context}\n<|im_end|>\n"
+        f"<|im_start|>user\nQuestion: {query}\nAnswer:<|im_end|>\n"
+        f"<|im_start|>assistant\n"
     )
 
     # Stream response with llama_cpp
@@ -83,7 +84,7 @@
         prompt,
         max_tokens=512,
         stream=True,
-        stop=["[DONE]"],
+        stop=["<|im_end|>", "[DONE]"],
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"
 
 