omaryasserhassan committed on
Commit 04c30fa · verified · 1 Parent(s): dd38c20

Update app.py

Files changed (1)
  1. app.py +2 -16
app.py CHANGED
@@ -5,21 +5,14 @@ from typing import List, Optional
 from huggingface_hub import hf_hub_download
 from ctransformers import AutoModelForCausalLM
 
-# ------------------------
-# Model configuration
-# ------------------------
 REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
 FILENAME = "Llama-3.2-3B-Instruct-Q4_K_L.gguf"
 MODEL_TYPE = "llama"
 
-# ------------------------
-# Persistent cache (Docker Spaces -> /data)
-# ------------------------
 CACHE_DIR = os.environ.get("HUGGINGFACE_HUB_CACHE", "/data/hf_cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 
 app = FastAPI(title="Llama 3.2 3B Instruct (ctransformers)")
-
 _model = None
 
 def get_model():
@@ -27,7 +20,6 @@ def get_model():
     if _model is not None:
         return _model
 
-    # Download exact GGUF file to persistent cache
     local_path = hf_hub_download(
         repo_id=REPO_ID,
         filename=FILENAME,
@@ -35,12 +27,11 @@ def get_model():
         local_dir_use_symlinks=False,
     )
 
-    # Load with ctransformers (CPU by default)
     _model = AutoModelForCausalLM.from_pretrained(
         model_path_or_repo_id=os.path.dirname(local_path),
         model_file=os.path.basename(local_path),
         model_type=MODEL_TYPE,
-        gpu_layers=int(os.environ.get("GPU_LAYERS", "0")), # set >0 on GPU Spaces
+        gpu_layers=int(os.environ.get("GPU_LAYERS", "0")),
         context_length=int(os.environ.get("CTX_LEN", "4096")),
     )
     return _model
@@ -59,12 +50,7 @@ class GenerateOut(BaseModel):
 
 @app.get("/")
 def health():
-    return {
-        "status": "ok",
-        "model": {"repo_id": REPO_ID, "filename": FILENAME, "type": MODEL_TYPE},
-        "cache_dir": CACHE_DIR,
-        "endpoints": {"POST /generate": "Generate a completion"},
-    }
+    return {"status": "ok", "cache_dir": CACHE_DIR}
 
 @app.post("/generate", response_model=GenerateOut)
 def generate(body: GenerateIn):
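For a quick check of the slimmed-down health payload, here is a minimal client sketch (not part of the commit). It assumes the app is served on localhost:7860, the usual Docker Spaces port, and that HUGGINGFACE_HUB_CACHE is unset so CACHE_DIR falls back to /data/hf_cache:

import requests  # assumed available in the client environment

# Hit the trimmed GET / health endpoint from this commit.
resp = requests.get("http://localhost:7860/")
resp.raise_for_status()
print(resp.json())  # expected: {"status": "ok", "cache_dir": "/data/hf_cache"}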
 
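The inline comment dropped from the gpu_layers line noted that it should be set above 0 on GPU Spaces; both knobs stay environment-driven. A sketch of configuring them, with illustrative values not taken from this commit:

import os

# Illustrative values only; app.py reads these when the model is loaded.
os.environ["GPU_LAYERS"] = "32"   # >0 offloads layers to the GPU (per the removed comment)
os.environ["CTX_LEN"] = "4096"    # context window handed to ctransformers
os.environ["HUGGINGFACE_HUB_CACHE"] = "/data/hf_cache"  # persistent cache on Docker Spaces

Note that CACHE_DIR is resolved at import time in app.py, so HUGGINGFACE_HUB_CACHE must be set before the module is imported, while GPU_LAYERS and CTX_LEN are only read on the first call to get_model().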