Update app.py
app.py CHANGED
@@ -4,19 +4,23 @@ from pydantic import BaseModel
 from huggingface_hub import snapshot_download
 from llama_cpp import Llama

-# …
-…
-…
+# -------- Writable cache/weights dirs (HF Docker Spaces) --------
+HOME = os.path.expanduser("~") or "/home/user"
+BASE_DIR = os.getenv("SPACE_CACHE_DIR", os.path.join(HOME, ".cache"))
+HF_HOME = os.path.join(BASE_DIR, "huggingface")
+MODELS_DIR = os.path.join(BASE_DIR, "models")
+
 os.environ["HF_HOME"] = HF_HOME
-os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "…
-os.makedirs(…
+os.environ["HF_HUB_CACHE"] = os.path.join(HF_HOME, "hub")
+os.makedirs(HF_HOME, exist_ok=True)
+os.makedirs(MODELS_DIR, exist_ok=True)

-# ---- Model selection (…
+# ---- Model selection (override in Settings → Variables if needed) ----
 MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-3B-Instruct-GGUF")
 MODEL_FILE = os.getenv("MODEL_FILE", "qwen2.5-3b-instruct-q4_k_m.gguf")

-# Inference knobs
-N_CTX…
+# Inference knobs
+N_CTX = int(os.getenv("N_CTX", 2048))
 N_BATCH = int(os.getenv("N_BATCH", 64))
 N_THREADS = os.cpu_count() or 2

@@ -26,18 +30,16 @@ llm = None
 model_loaded = False

 def ensure_model():
-    """Lazy-load the model into /data so server starts instantly."""
     global llm, model_loaded
     if llm is not None:
         return
     local_dir = snapshot_download(
         repo_id=MODEL_REPO,
         allow_patterns=[MODEL_FILE],
-        local_dir=…
+        local_dir=MODELS_DIR,
         local_dir_use_symlinks=False,
     )
     model_path = os.path.join(local_dir, MODEL_FILE)
-    # Load GGUF with llama.cpp
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
@@ -58,7 +60,7 @@ class ChatReq(BaseModel):

 @app.post("/chat")
 def chat(req: ChatReq):
-    ensure_model()  # …
+    ensure_model()  # lazy load on first call
     full_prompt = f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n<|user|>\n{req.prompt}\n</|user|>\n"
     out = llm(prompt=full_prompt, temperature=0.2, top_p=0.9, max_tokens=256, stop=["</s>"])
     return {"response": out["choices"][0]["text"].strip()}
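A note on the design choice the last hunk touches: the commit keeps the lazy-load pattern, so the GGUF file is downloaded and loaded only when /chat is first hit. If a slower container boot is acceptable in exchange for a fast first request, one hedged variation (not part of this commit) is to trigger the existing ensure_model() from a FastAPI startup hook. The sketch below is meant to be appended to app.py and reuses only names already defined there (app, ensure_model).

# Hypothetical variation, not part of the commit shown above:
# warm the model when the server starts instead of on the first /chat call.
@app.on_event("startup")
def _warm_model() -> None:
    # Blocks startup until the GGUF weights are downloaded into MODELS_DIR
    # and loaded by llama.cpp, so the first request pays no extra latency.
    ensure_model()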
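For reference, a minimal client-side sketch of how the /chat route above can be exercised once the app is running. The base URL is a placeholder for wherever the server is reachable (e.g. a local uvicorn run or the Space's public URL); the "prompt" field and the {"response": ...} shape match the ChatReq model and return value in app.py.

# Minimal sketch, assuming the FastAPI app is reachable at BASE_URL.
import json
import urllib.request

BASE_URL = "http://localhost:7860"  # placeholder; substitute your Space's URL

payload = json.dumps({"prompt": "Say hello in one short sentence."}).encode("utf-8")
request = urllib.request.Request(
    f"{BASE_URL}/chat",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(request) as resp:
    body = json.loads(resp.read().decode("utf-8"))

print(body["response"])  # the route returns {"response": "<generated text>"}

Expect the first call to be noticeably slower than later ones: ensure_model() downloads the GGUF file into MODELS_DIR and loads it into llama.cpp on that first request.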