omaryasserhassan committed
Commit e8ed38e · verified · 1 Parent(s): 5c64481

Update app.py

Files changed (1):
  1. app.py  +14 -12
app.py CHANGED

@@ -4,19 +4,23 @@ from pydantic import BaseModel
 from huggingface_hub import snapshot_download
 from llama_cpp import Llama

-# ---- Writable cache/weights dirs (HF Spaces) ----
-DATA_DIR = os.getenv("DATA_DIR", "/data")
-HF_HOME = os.path.join(DATA_DIR, "hf")
+# -------- Writable cache/weights dirs (HF Docker Spaces) --------
+HOME = os.path.expanduser("~") or "/home/user"
+BASE_DIR = os.getenv("SPACE_CACHE_DIR", os.path.join(HOME, ".cache"))
+HF_HOME = os.path.join(BASE_DIR, "huggingface")
+MODELS_DIR = os.path.join(BASE_DIR, "models")
+
 os.environ["HF_HOME"] = HF_HOME
-os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "cache")
-os.makedirs(DATA_DIR, exist_ok=True)
+os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "hub")
+os.makedirs(HF_HOME, exist_ok=True)
+os.makedirs(MODELS_DIR, exist_ok=True)

-# ---- Model selection (can override in Settings → Variables) ----
+# ---- Model selection (override in Settings → Variables if needed) ----
 MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
 MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")

-# Inference knobs (tune if memory tight)
-N_CTX = int(os.getenv("N_CTX", 2048))
+# Inference knobs
+N_CTX = int(os.getenv("N_CTX", 2048))
 N_BATCH = int(os.getenv("N_BATCH", 64))
 N_THREADS = os.cpu_count() or 2

@@ -26,18 +30,16 @@ llm = None
 model_loaded = False

 def ensure_model():
-    """Lazy-load the model into /data so server starts instantly."""
     global llm, model_loaded
     if llm is not None:
         return
     local_dir = snapshot_download(
         repo_id=MODEL_REPO,
         allow_patterns=[MODEL_FILE],
-        local_dir=os.path.join(DATA_DIR, "models"),
+        local_dir=MODELS_DIR,
         local_dir_use_symlinks=False,
     )
     model_path = os.path.join(local_dir, MODEL_FILE)
-    # Load GGUF with llama.cpp
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
@@ -58,7 +60,7 @@ class ChatReq(BaseModel):

 @app.post("/chat")
 def chat(req: ChatReq):
-    ensure_model() # loads on first request
+    ensure_model() # lazy load on first call
     full_prompt = f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n<|user|>\n{req.prompt}\n</|user|>\n"
     out = llm(prompt=full_prompt, temperature=0.2, top_p=0.9, max_tokens=256, stop=["</s>"])
     return {"response": out["choices"][0]["text"].strip()}