Tim Luka Horstmann
committed on
Commit
·
58272f8
1
Parent(s):
aa6b888
Fixed Qwen 2.5 1.5B with llama_cpp for HF Spaces
Browse files
app.py
CHANGED
@@ -22,7 +22,7 @@ hf_token = os.getenv("HF_TOKEN")
|
|
22 |
if not hf_token:
|
23 |
logger.error("HF_TOKEN environment variable not set. Required for gated models.")
|
24 |
raise ValueError("HF_TOKEN not set")
|
25 |
-
login(token=hf_token)
|
26 |
|
27 |
try:
|
28 |
# Load precomputed CV embeddings
|
@@ -39,20 +39,20 @@ try:
|
|
39 |
embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
|
40 |
logger.info("SentenceTransformer model loaded")
|
41 |
|
42 |
-
# Load
|
43 |
-
logger.info("Loading
|
44 |
model_path = hf_hub_download(
|
45 |
-
repo_id="
|
46 |
-
filename="
|
47 |
-
local_dir="/app/cache" if os.getenv("HF_HOME") else None,
|
48 |
token=hf_token,
|
49 |
)
|
50 |
generator = Llama(
|
51 |
model_path=model_path,
|
52 |
-
n_ctx=2048,
|
53 |
-
n_threads=4,
|
54 |
)
|
55 |
-
logger.info("
|
56 |
|
57 |
except Exception as e:
|
58 |
logger.error(f"Startup error: {str(e)}", exc_info=True)
|
@@ -74,8 +74,9 @@ def stream_response(query):
|
|
74 |
logger.info(f"Processing query: {query}")
|
75 |
context = retrieve_context(query)
|
76 |
prompt = (
|
77 |
-
f"
|
78 |
-
f"
|
|
|
79 |
)
|
80 |
|
81 |
# Stream response with llama_cpp
|
@@ -83,7 +84,7 @@ def stream_response(query):
|
|
83 |
prompt,
|
84 |
max_tokens=512,
|
85 |
stream=True,
|
86 |
-
stop=["[DONE]"],
|
87 |
):
|
88 |
yield f"data: {chunk['choices'][0]['text']}\n\n"
|
89 |
yield "data: [DONE]\n\n"
|
|
|
22 |
if not hf_token:
|
23 |
logger.error("HF_TOKEN environment variable not set. Required for gated models.")
|
24 |
raise ValueError("HF_TOKEN not set")
|
25 |
+
login(token=hf_token)
|
26 |
|
27 |
try:
|
28 |
# Load precomputed CV embeddings
|
|
|
39 |
embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
|
40 |
logger.info("SentenceTransformer model loaded")
|
41 |
|
42 |
+
# Load Qwen 2.5 1.5B model with llama_cpp
|
43 |
+
logger.info("Loading Qwen 2.5 1.5B model")
|
44 |
model_path = hf_hub_download(
|
45 |
+
repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
|
46 |
+
filename="qwen2.5-1.5b-instruct-q4_0.gguf",
|
47 |
+
local_dir="/app/cache" if os.getenv("HF_HOME") else None,
|
48 |
token=hf_token,
|
49 |
)
|
50 |
generator = Llama(
|
51 |
model_path=model_path,
|
52 |
+
n_ctx=2048,
|
53 |
+
n_threads=4,
|
54 |
)
|
55 |
+
logger.info("Qwen 2.5 1.5B model loaded")
|
56 |
|
57 |
except Exception as e:
|
58 |
logger.error(f"Startup error: {str(e)}", exc_info=True)
|
|
|
74 |
logger.info(f"Processing query: {query}")
|
75 |
context = retrieve_context(query)
|
76 |
prompt = (
|
77 |
+
f"<|im_start|>system\nYou are a helpful assistant.\nI am Tim Luka Horstmann, a German Computer Scientist. Based on my CV:\n{context}\n<|im_end|>\n"
|
78 |
+
f"<|im_start|>user\nQuestion: {query}\nAnswer:<|im_end|>\n"
|
79 |
+
f"<|im_start|>assistant\n"
|
80 |
)
|
81 |
|
82 |
# Stream response with llama_cpp
|
|
|
84 |
prompt,
|
85 |
max_tokens=512,
|
86 |
stream=True,
|
87 |
+
stop=["<|im_end|>", "[DONE]"],
|
88 |
):
|
89 |
yield f"data: {chunk['choices'][0]['text']}\n\n"
|
90 |
yield "data: [DONE]\n\n"
|