Update app.py
app.py
CHANGED
@@ -5,21 +5,14 @@ from typing import List, Optional
 from huggingface_hub import hf_hub_download
 from ctransformers import AutoModelForCausalLM
 
-# ------------------------
-# Model configuration
-# ------------------------
 REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
 FILENAME = "Llama-3.2-3B-Instruct-Q4_K_L.gguf"
 MODEL_TYPE = "llama"
 
-# ------------------------
-# Persistent cache (Docker Spaces -> /data)
-# ------------------------
 CACHE_DIR = os.environ.get("HUGGINGFACE_HUB_CACHE", "/data/hf_cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 
 app = FastAPI(title="Llama 3.2 3B Instruct (ctransformers)")
-
 _model = None
 
 def get_model():
@@ -27,7 +20,6 @@ def get_model():
     if _model is not None:
         return _model
 
-    # Download exact GGUF file to persistent cache
     local_path = hf_hub_download(
         repo_id=REPO_ID,
         filename=FILENAME,
@@ -35,12 +27,11 @@ def get_model():
         local_dir_use_symlinks=False,
     )
 
-    # Load with ctransformers (CPU by default)
     _model = AutoModelForCausalLM.from_pretrained(
         model_path_or_repo_id=os.path.dirname(local_path),
         model_file=os.path.basename(local_path),
         model_type=MODEL_TYPE,
-        gpu_layers=int(os.environ.get("GPU_LAYERS", "0")),
+        gpu_layers=int(os.environ.get("GPU_LAYERS", "0")),
         context_length=int(os.environ.get("CTX_LEN", "4096")),
     )
     return _model
@@ -59,12 +50,7 @@ class GenerateOut(BaseModel):
 
 @app.get("/")
 def health():
-    return {
-        "status": "ok",
-        "model": {"repo_id": REPO_ID, "filename": FILENAME, "type": MODEL_TYPE},
-        "cache_dir": CACHE_DIR,
-        "endpoints": {"POST /generate": "Generate a completion"},
-    }
+    return {"status": "ok", "cache_dir": CACHE_DIR}
 
 @app.post("/generate", response_model=GenerateOut)
 def generate(body: GenerateIn):
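Note: the hunks above only name the GenerateIn request model, the GenerateOut response model, and the generate() handler; their bodies are outside this diff. The sketch below shows how these pieces typically fit together with a ctransformers model. The field names (prompt, max_new_tokens, temperature, stop) and defaults are illustrative assumptions, not the Space's actual code; it reuses the app instance and get_model() defined above.

# Illustrative sketch only; the real GenerateIn/GenerateOut fields and the
# real generate() body are not shown in this diff.
from typing import List, Optional
from pydantic import BaseModel

class GenerateIn(BaseModel):
    prompt: str
    max_new_tokens: int = 256         # assumed field
    temperature: float = 0.7          # assumed field
    stop: Optional[List[str]] = None  # assumed field

class GenerateOut(BaseModel):
    text: str                         # assumed field

@app.post("/generate", response_model=GenerateOut)
def generate(body: GenerateIn):
    # get_model() downloads the GGUF file into the persistent cache on the
    # first request and reuses the loaded model afterwards.
    model = get_model()
    # ctransformers model objects are callable; these keyword arguments are
    # standard ctransformers generation parameters.
    text = model(
        body.prompt,
        max_new_tokens=body.max_new_tokens,
        temperature=body.temperature,
        stop=body.stop,
    )
    return GenerateOut(text=text)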
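Once the Space is running, both endpoints can be exercised from any HTTP client. A quick smoke test with requests might look like the following; the Space URL is a placeholder and the JSON fields follow the assumed GenerateIn schema from the sketch above.

import requests

BASE_URL = "https://<owner>-<space-name>.hf.space"  # placeholder; use the real Space URL

# Health check: after this commit it returns only {"status": "ok", "cache_dir": ...}
print(requests.get(f"{BASE_URL}/", timeout=30).json())

# Generation request; field names follow the assumed GenerateIn schema above.
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Write a haiku about model caching.", "max_new_tokens": 64},
    timeout=600,  # the first call can be slow while the GGUF file downloads
)
print(resp.json())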