omaryasserhassan committed
Commit dd38c20 · verified · 1 Parent(s): cea5896

Update app.py

Files changed (1):
  1. app.py +73 -152
app.py CHANGED
@@ -1,163 +1,84 @@
- import os, json, re
- from typing import Any, Dict, List, Optional
- from fastapi import FastAPI, HTTPException, Header
+ import os
+ from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
- from llama_cpp import Llama
-
- # Inference knobs (you can still override via Settings → Variables)
- N_CTX = int(os.getenv("N_CTX", 2048))
- N_BATCH = int(os.getenv("N_BATCH", 64))
- N_THREADS = os.cpu_count() or 2
-
- API_SECRET = os.getenv("API_SECRET") # optional bearer auth
-
- MODELS_DIR = "/app/models" # baked into the image by Dockerfile
- MODEL_FILE_HINT = os.getenv("MODEL_FILE") # for /healthz display only
-
- app = FastAPI(title="Qwen Planner API (CPU)")
-
- llm: Optional[Llama] = None
- model_loaded = False
- chosen_model_path: Optional[str] = None
-
- def require_auth(authorization: Optional[str]) -> None:
-     if API_SECRET and authorization != f"Bearer {API_SECRET}":
-         raise HTTPException(status_code=401, detail="Unauthorized")
-
- def extract_json_block(txt: str) -> str:
-     m = re.search(r"\{.*\}\s*$", txt, flags=re.S)
-     if not m:
-         raise ValueError("No JSON object found in output.")
-     return m.group(0)
-
- def ensure_model():
-     global llm, model_loaded, chosen_model_path
-     if llm is not None:
-         return
-     # discover baked gguf
-     if not os.path.isdir(MODELS_DIR):
-         raise RuntimeError(f"Models directory not found: {MODELS_DIR}")
-     ggufs: List[str] = []
-     for root, _, files in os.walk(MODELS_DIR):
-         for f in files:
-             if f.endswith(".gguf"):
-                 ggufs.append(os.path.join(root, f))
-     if not ggufs:
-         raise RuntimeError("No .gguf files found in /app/models. Rebuild image with model baked in.")
-
-     # prefer q4 if multiple
-     q4 = [p for p in ggufs if "q4" in os.path.basename(p).lower()]
-     chosen_model_path = (q4 or ggufs)[0]
-     print(f"[loader] Loading GGUF: {chosen_model_path}")
+ from typing import List, Optional
+ from huggingface_hub import hf_hub_download
+ from ctransformers import AutoModelForCausalLM
+
+ # ------------------------
+ # Model configuration
+ # ------------------------
+ REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
+ FILENAME = "Llama-3.2-3B-Instruct-Q4_K_L.gguf"
+ MODEL_TYPE = "llama"
+
+ # ------------------------
+ # Persistent cache (Docker Spaces -> /data)
+ # ------------------------
+ CACHE_DIR = os.environ.get("HUGGINGFACE_HUB_CACHE", "/data/hf_cache")
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ app = FastAPI(title="Llama 3.2 3B Instruct (ctransformers)")
+
+ _model = None
+
+ def get_model():
+     global _model
+     if _model is not None:
+         return _model
+
+     # Download exact GGUF file to persistent cache
+     local_path = hf_hub_download(
+         repo_id=REPO_ID,
+         filename=FILENAME,
+         local_dir=CACHE_DIR,
+         local_dir_use_symlinks=False,
+     )

-     llm = Llama(
-         model_path=chosen_model_path,
-         n_ctx=N_CTX,
-         n_threads=N_THREADS,
-         n_batch=N_BATCH,
-         logits_all=False,
-         n_gpu_layers=0,
+     # Load with ctransformers (CPU by default)
+     _model = AutoModelForCausalLM.from_pretrained(
+         model_path_or_repo_id=os.path.dirname(local_path),
+         model_file=os.path.basename(local_path),
+         model_type=MODEL_TYPE,
+         gpu_layers=int(os.environ.get("GPU_LAYERS", "0")), # set >0 on GPU Spaces
+         context_length=int(os.environ.get("CTX_LEN", "4096")),
      )
-     model_loaded = True
+     return _model

- @app.get("/healthz")
- def healthz():
+ class GenerateIn(BaseModel):
+     prompt: str
+     max_new_tokens: int = 256
+     temperature: float = 0.7
+     top_p: float = 0.95
+     top_k: int = 40
+     repetition_penalty: float = 1.1
+     stop: Optional[List[str]] = None
+
+ class GenerateOut(BaseModel):
+     completion: str
+
+ @app.get("/")
+ def health():
      return {
          "status": "ok",
-         "loaded": model_loaded,
-         "chosen_model_path": chosen_model_path,
-         "model_file_hint": MODEL_FILE_HINT,
-         "n_ctx": N_CTX,
-         "n_batch": N_BATCH,
-         "threads": N_THREADS,
+         "model": {"repo_id": REPO_ID, "filename": FILENAME, "type": MODEL_TYPE},
+         "cache_dir": CACHE_DIR,
+         "endpoints": {"POST /generate": "Generate a completion"},
      }

- SYSTEM_PROMPT_CHAT = "You are a concise assistant. Reply briefly in plain text."
-
- class ChatReq(BaseModel):
-     prompt: str
-
- @app.post("/chat")
- def chat(req: ChatReq, authorization: Optional[str] = Header(default=None)):
-     require_auth(authorization)
-     try:
-         ensure_model()
-     except Exception as e:
-         raise HTTPException(status_code=503, detail=f"loading_error: {e}")
-
+ @app.post("/generate", response_model=GenerateOut)
+ def generate(body: GenerateIn):
      try:
-         full_prompt = (
-             f"<|system|>\n{SYSTEM_PROMPT_CHAT}\n</|system|>\n"
-             f"<|user|>\n{req.prompt}\n</|user|>\n"
-         )
-         out = llm(
-             prompt=full_prompt,
-             temperature=0.2,
-             top_p=0.9,
-             max_tokens=256,
-             stop=["</s>"],
-         )
-         return {"response": out["choices"][0]["text"].strip()}
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"infer_error: {e}")
-
- # -------- planner endpoint --------
- class PlanRequest(BaseModel):
-     profile: Dict[str, Any]
-     sample_rows: List[Dict[str, Any]]
-     goal: str = "auto"
-     constraints: Dict[str, Any] = {}
-
- SYSTEM_PROMPT_PLAN = """You are a data-planning assistant.
- Return ONLY minified JSON matching exactly this schema:
- {
- "cleaning":[{"op":"impute_mean|impute_mode|drop_col|clip","cols":["..."],"params":{}}],
- "encoding":[{"op":"one_hot|ordinal|hash|target","cols":["..."],"params":{}}],
- "scaling":"none|standard|robust|minmax",
- "target":{"name":"<col_or_empty>","type":"classification|regression|auto"},
- "split":{"strategy":"random|stratified","test_size":0.2,"cv":5},
- "metric":"f1|roc_auc|accuracy|mae|rmse|r2",
- "models":["lgbm","rf","xgb","logreg","ridge","catboost"],
- "notes":"<short justification>"
- }
- No prose. No markdown. JSON only."""
-
- @app.post("/plan")
- def plan(req: PlanRequest, authorization: Optional[str] = Header(default=None)):
-     require_auth(authorization)
-     try:
-         ensure_model()
-     except Exception as e:
-         raise HTTPException(status_code=503, detail=f"loading_error: {e}")
-
-     try:
-         sample = req.sample_rows[:200]
-         profile_json = json.dumps(req.profile)[:8000]
-         sample_json = json.dumps(sample)[:8000]
-         constraints_json = json.dumps(req.constraints)[:2000]
-
-         user_block = (
-             f"Goal:{req.goal}\n"
-             f"Constraints:{constraints_json}\n"
-             f"Profile:{profile_json}\n"
-             f"Sample:{sample_json}\n"
-         )
-         full_prompt = (
-             f"<|system|>\n{SYSTEM_PROMPT_PLAN}\n</|system|>\n"
-             f"<|user|>\n{user_block}\n</|user|>\n"
-         )
-         out = llm(
-             prompt=full_prompt,
-             temperature=0.2,
-             top_p=0.9,
-             max_tokens=512,
-             stop=["</s>"],
+         model = get_model()
+         text = model(
+             body.prompt,
+             max_new_tokens=body.max_new_tokens,
+             temperature=body.temperature,
+             top_p=body.top_p,
+             top_k=body.top_k,
+             repetition_penalty=body.repetition_penalty,
+             stop=body.stop,
          )
-         text = out["choices"][0]["text"]
-         payload = extract_json_block(text)
-         data = json.loads(payload)
-         return {"plan": data}
-     except ValueError as e:
-         raise HTTPException(status_code=422, detail=f"bad_json: {e}")
+         return GenerateOut(completion=text)
      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"infer_error: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
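
After this commit the Space serves a single POST /generate endpoint (plus the GET / health check) in place of the old /chat and /plan routes. A minimal client sketch, assuming the Space is reachable at a placeholder URL (only "prompt" is required; the remaining fields fall back to the GenerateIn defaults):

import requests

# Placeholder host -- substitute your own Space's URL.
BASE_URL = "https://your-space.hf.space"

payload = {
    "prompt": "Explain what a GGUF file is in one sentence.",
    "max_new_tokens": 128,  # optional; defaults come from GenerateIn
    "stop": ["</s>"],       # optional list of stop strings
}

r = requests.post(f"{BASE_URL}/generate", json=payload, timeout=600)
r.raise_for_status()
print(r.json()["completion"])  # GenerateOut carries a single "completion" field

Note that the first request after a cold start is slow: get_model() must download the GGUF file into CACHE_DIR and load it before generating; later requests reuse the cached module-level _model.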