omaryasserhassan committed on
Commit 5c64481 · verified · 1 Parent(s): e3af086

Update app.py

Files changed (1)
  1. app.py +35 -25
app.py CHANGED
@@ -4,34 +4,52 @@ from pydantic import BaseModel
 from huggingface_hub import snapshot_download
 from llama_cpp import Llama
 
-# --- Model config (change via Space "Settings → Variables" later if you want) ---
+# ---- Writable cache/weights dirs (HF Spaces) ----
+DATA_DIR = os.getenv("DATA_DIR", "/data")
+HF_HOME = os.path.join(DATA_DIR, "hf")
+os.environ["HF_HOME"] = HF_HOME
+os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "cache")
+os.makedirs(DATA_DIR, exist_ok=True)
+
+# ---- Model selection (can override in Settings → Variables) ----
 MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
-MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")  # ~2–3GB, int4
+MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")
+
+# Inference knobs (tune if memory tight)
+N_CTX = int(os.getenv("N_CTX", 2048))
+N_BATCH = int(os.getenv("N_BATCH", 64))
+N_THREADS = os.cpu_count() or 2
 
 app = FastAPI(title="Qwen Planner API (CPU)")
 
 llm = None
+model_loaded = False
 
-def load_model():
-    global llm
-    # Download the quantized GGUF at runtime (keeps Docker build fast)
-    local_dir = snapshot_download(repo_id=MODEL_REPO, allow_patterns=[MODEL_FILE])
+def ensure_model():
+    """Lazy-load the model into /data so server starts instantly."""
+    global llm, model_loaded
+    if llm is not None:
+        return
+    local_dir = snapshot_download(
+        repo_id=MODEL_REPO,
+        allow_patterns=[MODEL_FILE],
+        local_dir=os.path.join(DATA_DIR, "models"),
+        local_dir_use_symlinks=False,
+    )
     model_path = os.path.join(local_dir, MODEL_FILE)
+    # Load GGUF with llama.cpp
     llm = Llama(
         model_path=model_path,
-        n_ctx=2048,  # keep conservative for free CPU
-        n_threads=os.cpu_count() or 2,
-        n_batch=64,  # small batch to fit RAM comfortably
+        n_ctx=N_CTX,
+        n_threads=N_THREADS,
+        n_batch=N_BATCH,
         logits_all=False,
     )
-
-@app.on_event("startup")
-def _startup():
-    load_model()
+    model_loaded = True
 
 @app.get("/healthz")
 def healthz():
-    return {"status": "ok", "model_file": MODEL_FILE}
+    return {"status": "ok", "loaded": model_loaded, "model_file": MODEL_FILE}
 
 SYSTEM_PROMPT = "You are a concise assistant. Reply briefly in plain text."
 
@@ -40,15 +58,7 @@ class ChatReq(BaseModel):
 
 @app.post("/chat")
 def chat(req: ChatReq):
-    if llm is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet.")
+    ensure_model()  # loads on first request
     full_prompt = f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n<|user|>\n{req.prompt}\n</|user|>\n"
-    out = llm(
-        prompt=full_prompt,
-        temperature=0.2,
-        top_p=0.9,
-        max_tokens=256,
-        stop=["</s>"]
-    )
-    text = out["choices"][0]["text"].strip()
-    return {"response": text}
+    out = llm(prompt=full_prompt, temperature=0.2, top_p=0.9, max_tokens=256, stop=["</s>"])
+    return {"response": out["choices"][0]["text"].strip()}