omaryasserhassan committed on
Commit 5c64481 · verified · 1 Parent(s): e3af086

Update app.py

Files changed (1)
  1. app.py +35 -25
app.py CHANGED
@@ -4,34 +4,52 @@ from pydantic import BaseModel
 from huggingface_hub import snapshot_download
 from llama_cpp import Llama
 
-# --- Model config (change via Space "Settings → Variables" later if you want) ---
+# ---- Writable cache/weights dirs (HF Spaces) ----
+DATA_DIR = os.getenv("DATA_DIR", "/data")
+HF_HOME = os.path.join(DATA_DIR, "hf")
+os.environ["HF_HOME"] = HF_HOME
+os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "cache")
+os.makedirs(DATA_DIR, exist_ok=True)
+
+# ---- Model selection (can override in Settings → Variables) ----
 MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
-MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")  # ~2–3GB, int4
+MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")
+
+# Inference knobs (tune if memory tight)
+N_CTX = int(os.getenv("N_CTX", 2048))
+N_BATCH = int(os.getenv("N_BATCH", 64))
+N_THREADS = os.cpu_count() or 2
 
 app = FastAPI(title="Qwen Planner API (CPU)")
 
 llm = None
+model_loaded = False
 
-def load_model():
-    global llm
-    # Download the quantized GGUF at runtime (keeps Docker build fast)
-    local_dir = snapshot_download(repo_id=MODEL_REPO, allow_patterns=[MODEL_FILE])
+def ensure_model():
+    """Lazy-load the model into /data so server starts instantly."""
+    global llm, model_loaded
+    if llm is not None:
+        return
+    local_dir = snapshot_download(
+        repo_id=MODEL_REPO,
+        allow_patterns=[MODEL_FILE],
+        local_dir=os.path.join(DATA_DIR, "models"),
+        local_dir_use_symlinks=False,
+    )
     model_path = os.path.join(local_dir, MODEL_FILE)
+    # Load GGUF with llama.cpp
     llm = Llama(
         model_path=model_path,
-        n_ctx=2048,  # keep conservative for free CPU
-        n_threads=os.cpu_count() or 2,
-        n_batch=64,  # small batch to fit RAM comfortably
+        n_ctx=N_CTX,
+        n_threads=N_THREADS,
+        n_batch=N_BATCH,
         logits_all=False,
     )
-
-@app.on_event("startup")
-def _startup():
-    load_model()
+    model_loaded = True
 
 @app.get("/healthz")
 def healthz():
-    return {"status": "ok", "model_file": MODEL_FILE}
+    return {"status": "ok", "loaded": model_loaded, "model_file": MODEL_FILE}
 
 SYSTEM_PROMPT = "You are a concise assistant. Reply briefly in plain text."
 
@@ -40,15 +58,7 @@ class ChatReq(BaseModel):
 
 @app.post("/chat")
 def chat(req: ChatReq):
-    if llm is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet.")
+    ensure_model()  # loads on first request
     full_prompt = f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n<|user|>\n{req.prompt}\n</|user|>\n"
-    out = llm(
-        prompt=full_prompt,
-        temperature=0.2,
-        top_p=0.9,
-        max_tokens=256,
-        stop=["</s>"]
-    )
-    text = out["choices"][0]["text"].strip()
-    return {"response": text}
+    out = llm(prompt=full_prompt, temperature=0.2, top_p=0.9, max_tokens=256, stop=["</s>"])
+    return {"response": out["choices"][0]["text"].strip()}