omaryasserhassan committed
Commit 947b2e3 · verified · 1 Parent(s): fc85eed

Update app.py

Files changed (1)
  1. app.py  +95 -30
app.py CHANGED
@@ -4,7 +4,7 @@ from pydantic import BaseModel
 from huggingface_hub import snapshot_download
 from llama_cpp import Llama
 
-# ---------- pick a writable cache dir ----------
+# ---------- pick a writable cache dir (tries in order) ----------
 def first_writable(paths):
     for p in paths:
         if not p:
@@ -21,9 +21,9 @@ def first_writable(paths):
     raise RuntimeError("No writable cache dir found")
 
 CACHE_BASE = first_writable([
-    os.getenv("SPACE_CACHE_DIR"),  # optional env override
-    "/app/.cache",                 # WORKDIR is usually writable on Spaces
-    "/tmp/app_cache",              # always writable fallback
+    os.getenv("SPACE_CACHE_DIR"),  # optional override via Settings → Variables
+    "/app/.cache",                 # WORKDIR is usually writable on HF Spaces
+    "/tmp/app_cache",              # safe fallback
 ])
 
 HF_HOME = os.path.join(CACHE_BASE, "huggingface")
@@ -31,46 +31,93 @@ MODELS_DIR = os.path.join(CACHE_BASE, "models")
 os.makedirs(HF_HOME, exist_ok=True)
 os.makedirs(MODELS_DIR, exist_ok=True)
 
+# Tell huggingface_hub to cache under our writable dir
 os.environ["HF_HOME"] = HF_HOME
 os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "hub")
 
-# ---- Model selection (override in Settings → Variables if needed) ----
+# ---- Model selection (can be overridden in Settings → Variables) ----
 MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
-MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")
+MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")  # optional hint
+MODEL_REV = os.getenv("MODEL_REV")  # optional: pin a commit SHA
 
-# Inference knobs
-N_CTX = int(os.getenv("N_CTX", 2048))
-N_BATCH = int(os.getenv("N_BATCH", 64))
+# Inference knobs (reduce if memory tight: N_CTX=1024, N_BATCH=32)
+N_CTX = int(os.getenv("N_CTX", 2048))
+N_BATCH = int(os.getenv("N_BATCH", 64))
 N_THREADS = os.cpu_count() or 2
 
 app = FastAPI(title="Qwen Planner API (CPU)")
 
 llm = None
 model_loaded = False
+chosen_model_path = None  # for /healthz reporting
+
 
 def ensure_model():
-    global llm, model_loaded
+    """
+    Lazy-load the model. Downloads any .gguf if needed, then auto-selects one:
+      1) exact MODEL_FILE if present,
+      2) else a *q4*.gguf,
+      3) else the first .gguf found.
+    """
+    global llm, model_loaded, chosen_model_path
     if llm is not None:
         return
-    local_dir = snapshot_download(
-        repo_id=MODEL_REPO,
-        allow_patterns=[MODEL_FILE],
-        local_dir=MODELS_DIR,
-        local_dir_use_symlinks=False,
-    )
-    model_path = os.path.join(local_dir, MODEL_FILE)
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=N_CTX,
-        n_threads=N_THREADS,
-        n_batch=N_BATCH,
-        logits_all=False,
-    )
-    model_loaded = True
+    try:
+        local_dir = snapshot_download(
+            repo_id=MODEL_REPO,
+            revision=MODEL_REV,
+            allow_patterns=["*.gguf"],  # be flexible on filenames
+            local_dir=MODELS_DIR,
+            local_dir_use_symlinks=False,
+        )
+
+        # discover gguf files
+        ggufs = []
+        for root, _, files in os.walk(local_dir):
+            for f in files:
+                if f.endswith(".gguf"):
+                    ggufs.append(os.path.join(root, f))
+        if not ggufs:
+            raise FileNotFoundError("No .gguf files found after download.")
+
+        # selection logic
+        model_path = None
+        if MODEL_FILE:
+            cand = os.path.join(local_dir, MODEL_FILE)
+            if os.path.exists(cand):
+                model_path = cand
+        if model_path is None:
+            q4 = [p for p in ggufs if "q4" in os.path.basename(p).lower()]
+            model_path = (q4 or ggufs)[0]
+
+        chosen_model_path = model_path
+        print(f"[loader] Using GGUF: {model_path}")
+
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=N_CTX,
+            n_threads=N_THREADS,
+            n_batch=N_BATCH,
+            logits_all=False,
+        )
+        model_loaded = True
+
+    except Exception as e:
+        # surface a clear error to the HTTP layer
+        raise RuntimeError(f"ensure_model failed: {e}")
+
 
 @app.get("/healthz")
 def healthz():
-    return {"status": "ok", "loaded": model_loaded, "cache_base": CACHE_BASE, "model_file": MODEL_FILE}
+    return {
+        "status": "ok",
+        "loaded": model_loaded,
+        "cache_base": CACHE_BASE,
+        "model_repo": MODEL_REPO,
+        "model_file_hint": MODEL_FILE,
+        "chosen_model_path": chosen_model_path,
+    }
+
 
 SYSTEM_PROMPT = "You are a concise assistant. Reply briefly in plain text."
 
@@ -79,7 +126,25 @@ class ChatReq(BaseModel):
 
 @app.post("/chat")
 def chat(req: ChatReq):
-    ensure_model()  # lazy load on first call
-    full_prompt = f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n<|user|>\n{req.prompt}\n</|user|>\n"
-    out = llm(prompt=full_prompt, temperature=0.2, top_p=0.9, max_tokens=256, stop=["</s>"])
-    return {"response": out["choices"][0]["text"].strip()}
+    # Load (or reuse) model
+    try:
+        ensure_model()  # may take minutes on first-ever call
+    except Exception as e:
+        raise HTTPException(status_code=503, detail=f"loading_error: {e}")
+
+    # Inference
+    try:
+        full_prompt = (
+            f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n"
+            f"<|user|>\n{req.prompt}\n</|user|>\n"
+        )
+        out = llm(
+            prompt=full_prompt,
+            temperature=0.2,
+            top_p=0.9,
+            max_tokens=256,
+            stop=["</s>"],
+        )
+        return {"response": out["choices"][0]["text"].strip()}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"infer_error: {e}")
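
For reference, a minimal client-side sketch of how the updated endpoints could be exercised once the Space is running. The base URL and port are placeholders (this commit does not pin them), and the requests package on the caller's side is assumed:

import requests

BASE_URL = "http://localhost:7860"  # placeholder; substitute the actual Space URL

# /healthz now also reports model_repo, the MODEL_FILE hint, and the resolved GGUF path
print(requests.get(f"{BASE_URL}/healthz", timeout=10).json())

# The first /chat call triggers the lazy download and load, so allow a long timeout.
# A 503 response means loading failed; a 500 means inference failed.
resp = requests.post(
    f"{BASE_URL}/chat",
    json={"prompt": "Plan a simple three-step morning routine."},
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["response"])

Note that allow_patterns=["*.gguf"] makes the download tolerant of upstream filename changes, but it fetches every GGUF quantization in the repo, so disk use under MODELS_DIR can grow accordingly.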