Tim Luka Horstmann committed
Commit 58272f8 · 1 Parent(s): aa6b888

Fixed Qwen 2.5 1.5B with llama_cpp for HF Spaces

Files changed (1)
  1. app.py +13 -12
app.py CHANGED
@@ -22,7 +22,7 @@ hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     logger.error("HF_TOKEN environment variable not set. Required for gated models.")
     raise ValueError("HF_TOKEN not set")
-login(token=hf_token) # Set token for huggingface_hub
+login(token=hf_token)
 
 try:
     # Load precomputed CV embeddings
@@ -39,20 +39,20 @@ try:
     embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load Gemma 3 1B model with llama_cpp
-    logger.info("Loading Gemma 3 1B model")
+    # Load Qwen 2.5 1.5B model with llama_cpp
+    logger.info("Loading Qwen 2.5 1.5B model")
     model_path = hf_hub_download(
-        repo_id="google/gemma-3-1b-it-qat-q4_0-gguf",
-        filename="gemma-3-1b-it-q4_0.gguf",
-        local_dir="/app/cache" if os.getenv("HF_HOME") else None, # Use cache dir in Docker
+        repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+        filename="qwen2.5-1.5b-instruct-q4_0.gguf",
+        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
     generator = Llama(
         model_path=model_path,
-        n_ctx=2048, # Context length
-        n_threads=4, # Adjust based on CPU cores
+        n_ctx=2048,
+        n_threads=4,
     )
-    logger.info("Gemma 3 1B model loaded")
+    logger.info("Qwen 2.5 1.5B model loaded")
 
 except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
@@ -74,8 +74,9 @@ def stream_response(query):
     logger.info(f"Processing query: {query}")
     context = retrieve_context(query)
     prompt = (
-        f"I am Tim Luka Horstmann, a German Computer Scientist. Based on my CV:\n{context}\n\n"
-        f"Question: {query}\nAnswer:"
+        f"<|im_start|>system\nYou are a helpful assistant.\nI am Tim Luka Horstmann, a German Computer Scientist. Based on my CV:\n{context}\n<|im_end|>\n"
+        f"<|im_start|>user\nQuestion: {query}\nAnswer:<|im_end|>\n"
+        f"<|im_start|>assistant\n"
     )
 
     # Stream response with llama_cpp
@@ -83,7 +84,7 @@
         prompt,
         max_tokens=512,
         stream=True,
-        stop=["[DONE]"],
+        stop=["<|im_end|>", "[DONE]"],
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"
 
 