Update app.py
app.py
CHANGED
@@ -4,34 +4,52 @@ from pydantic import BaseModel
 from huggingface_hub import snapshot_download
 from llama_cpp import Llama

-# …
+# ---- Writable cache/weights dirs (HF Spaces) ----
+DATA_DIR = os.getenv("DATA_DIR", "/data")
+HF_HOME = os.path.join(DATA_DIR, "hf")
+os.environ["HF_HOME"] = HF_HOME
+os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "cache")
+os.makedirs(DATA_DIR, exist_ok=True)
+
+# ---- Model selection (can override in Settings → Variables) ----
 MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
-MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")
+MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")
+
+# Inference knobs (tune if memory tight)
+N_CTX = int(os.getenv("N_CTX", 2048))
+N_BATCH = int(os.getenv("N_BATCH", 64))
+N_THREADS = os.cpu_count() or 2

 app = FastAPI(title="Qwen Planner API (CPU)")

 llm = None
+model_loaded = False

-def load_model():
-    …
-    …
-    …
+def ensure_model():
+    """Lazy-load the model into /data so server starts instantly."""
+    global llm, model_loaded
+    if llm is not None:
+        return
+    local_dir = snapshot_download(
+        repo_id=MODEL_REPO,
+        allow_patterns=[MODEL_FILE],
+        local_dir=os.path.join(DATA_DIR, "models"),
+        local_dir_use_symlinks=False,
+    )
     model_path = os.path.join(local_dir, MODEL_FILE)
+    # Load GGUF with llama.cpp
     llm = Llama(
         model_path=model_path,
-        n_ctx=…,
-        n_threads=…,
-        n_batch=…,
+        n_ctx=N_CTX,
+        n_threads=N_THREADS,
+        n_batch=N_BATCH,
         logits_all=False,
     )
-
-@app.on_event("startup")
-def _startup():
-    load_model()
+    model_loaded = True

 @app.get("/healthz")
 def healthz():
-    return {"status": "ok", "model_file": MODEL_FILE}
+    return {"status": "ok", "loaded": model_loaded, "model_file": MODEL_FILE}

 SYSTEM_PROMPT = "You are a concise assistant. Reply briefly in plain text."

@@ -40,15 +58,7 @@ class ChatReq(BaseModel):

 @app.post("/chat")
 def chat(req: ChatReq):
-    if llm is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet.")
+    ensure_model()  # loads on first request
     full_prompt = f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n<|user|>\n{req.prompt}\n</|user|>\n"
-    out = llm(
-        …,
-        temperature=0.2,
-        top_p=0.9,
-        max_tokens=256,
-        stop=["</s>"]
-    )
-    text = out["choices"][0]["text"].strip()
-    return {"response": text}
+    out = llm(prompt=full_prompt, temperature=0.2, top_p=0.9, max_tokens=256, stop=["</s>"])
+    return {"response": out["choices"][0]["text"].strip()}
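For a quick check of the two endpoints after this change, here is a minimal client sketch in Python using requests. The base URL, port, timeouts, and example prompt are placeholders (substitute your Space's URL, or the local address when running with uvicorn); the "prompt" request field and "response" key match the handlers in app.py above.

import requests

BASE_URL = "http://localhost:7860"  # placeholder: replace with your Space URL

# /healthz responds immediately; "loaded" stays false until the first /chat call.
print(requests.get(f"{BASE_URL}/healthz", timeout=10).json())

# The first /chat request triggers ensure_model(), which downloads the GGUF file
# into /data and initializes llama.cpp, so allow a generous timeout here.
resp = requests.post(
    f"{BASE_URL}/chat",
    json={"prompt": "Plan a three-step morning routine."},  # example prompt, not from the repo
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["response"])

Loading lazily inside chat() rather than in a startup hook keeps the Space booting quickly; the trade-off is a slow first request while the weights download into /data.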