Tim Luka Horstmann committed
Commit 470d3ad · Parent: 41c6a1d

Swap to pre-model packing

Files changed (2)
  1. Dockerfile +39 -6
  2. llm_server.py +31 -97
Dockerfile CHANGED

@@ -1,5 +1,11 @@
+# 1) Base
 FROM python:3.10-slim
 
+# 2) Build-time HF token
+ARG HF_TOKEN
+ENV HF_TOKEN=${HF_TOKEN}
+
+# 3) Environment
 ENV DEBIAN_FRONTEND=noninteractive \
     RUSTUP_HOME=/root/.rustup \
     CARGO_HOME=/root/.cargo \
@@ -11,7 +17,7 @@ ENV DEBIAN_FRONTEND=noninteractive \
 
 WORKDIR /app
 
-# install system deps + Rust toolchain
+# 4) System deps + Rust
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential cmake git curl wget ninja-build libgomp1 ca-certificates \
@@ -20,22 +26,49 @@ RUN apt-get update && \
     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
     rustup default stable
 
-# cache dirs
-RUN mkdir -p /app/cache /app/pretrained_models/llm && chmod -R 777 /app/cache /app/pretrained_models/llm
+# 5) Prepare directories
+RUN mkdir -p /app/cache /app/pretrained_models/llm && \
+    chmod -R 777 /app/cache /app/pretrained_models/llm
 
-# Python deps (except llama-cpp-python)
+# 6) Install Python deps (except llama-cpp-python)
 COPY requirements.txt .
 RUN sed -i '/llama-cpp-python/d' requirements.txt && \
     pip install --no-cache-dir -r requirements.txt
 
-# build llama-cpp-python from source (with its llama.cpp submodule)
+# 7) Build llama-cpp-python from source
 RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python && \
     cd /tmp/llama-cpp-python && \
     git submodule update --init --recursive && \
    python -m pip install --no-cache-dir . && \
     rm -rf /tmp/llama-cpp-python
 
-# copy the server code
+# 8) Install huggingface_hub CLI
+RUN pip install --no-cache-dir huggingface_hub
+
+# 9) Clone llama.cpp for quantize tool
+RUN git clone https://github.com/ggerganov/llama.cpp.git /tmp/llama.cpp && \
+    cd /tmp/llama.cpp && make quantize
+
+# 10) Download & pre-quantize model (no runtime repack)
+RUN python3 - <<EOF
+import os
+from huggingface_hub import login, hf_hub_download
+if os.getenv('HF_TOKEN'): login(token=os.getenv('HF_TOKEN'))
+hf_hub_download(repo_id=os.getenv('MODEL_REPO'),
+                filename=os.getenv('MODEL_FILE'),
+                local_dir='/app/pretrained_models/llm',
+                token=os.getenv('HF_TOKEN'))
+EOF
+
+RUN /tmp/llama.cpp/build/quantize \
+    /app/pretrained_models/llm/${MODEL_FILE} \
+    /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
+    q4_K_M && \
+    mv /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
+       /app/pretrained_models/llm/${MODEL_FILE} && \
+    rm -rf /tmp/llama.cpp
+
+# 11) Copy server
 COPY llm_server.py /app/llm_server.py
 
 EXPOSE 7860
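
The net effect of steps 8-10 is that the GGUF weights are fetched and re-quantized once, at image-build time, so the container starts with a pre-packed model instead of repacking it on every boot. A minimal standalone sketch of that build step follows; it assumes MODEL_REPO, MODEL_FILE and (optionally) HF_TOKEN are set in the environment, and that a llama.cpp quantize binary exists at QUANTIZE_BIN (the exact path depends on how llama.cpp was built, e.g. make vs. CMake).

# Sketch of the build-time "download + pre-quantize" step, outside Docker.
# MODEL_REPO, MODEL_FILE, HF_TOKEN and QUANTIZE_BIN are assumptions taken
# from the Dockerfile above; adjust them to your environment.
import os
import shutil
import subprocess
from huggingface_hub import hf_hub_download

MODEL_DIR = "/app/pretrained_models/llm"
QUANTIZE_BIN = "/tmp/llama.cpp/quantize"  # assumed location of the quantize tool

# 1) Download the original GGUF from the Hugging Face Hub.
src = hf_hub_download(
    repo_id=os.environ["MODEL_REPO"],
    filename=os.environ["MODEL_FILE"],
    local_dir=MODEL_DIR,
    token=os.getenv("HF_TOKEN"),
)

# 2) Re-quantize to q4_K_M and swap the packed file into place.
packed = src + ".packed.gguf"
subprocess.run([QUANTIZE_BIN, src, packed, "q4_K_M"], check=True)
shutil.move(packed, src)

Baking the packed file into the image keeps startup independent of Hub availability, at the cost of a larger image and of HF_TOKEN being required at build time (passed as a build arg).
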
llm_server.py CHANGED

@@ -1,114 +1,50 @@
-import os
-import time
-import logging
-import asyncio
+import os, time, logging, asyncio
 from pathlib import Path
-
 from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
-
-from huggingface_hub import login, hf_hub_download
+from huggingface_hub import login
 from llama_cpp import Llama
 
-# ─── logging setup ──────────────────────────────────────────────────────────
+# ─── Logging ────────────────────────────────────────────────────────────────
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger("llm_server")
 
-# ─── FastAPI setup ──────────────────────────────────────────────────────────
+# ─── FastAPI setup ─────────────────────────────────────────────────────────
 app = FastAPI()
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# ─── model paths & env ──────────────────────────────────────────────────────
-MODEL_DIR = Path("/app/pretrained_models/llm")
-MODEL_DIR.mkdir(parents=True, exist_ok=True)
 
+# ─── Model paths & token ────────────────────────────────────────────────────
+MODEL_DIR = Path("/app/pretrained_models/llm")
+MODEL_DIR.mkdir(exist_ok=True)
 MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3-0.6B-Q4_K_M.gguf")
 MODEL_PATH = MODEL_DIR / MODEL_FILE
-
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# ─── startup: download, init, warm-up, schedule keep-alive ──────────────────
-@app.on_event("startup")
-async def startup_event():
-    logger.info("🔧 Starting LLM service…")
-
-    # login if we have a token
-    if HF_TOKEN:
-        login(token=HF_TOKEN)
-
-    # download weights if missing
-    if not MODEL_PATH.exists():
-        logger.info(f"→ Downloading {MODEL_FILE} from {os.getenv('MODEL_REPO')}")
-        hf_hub_download(
-            repo_id=os.getenv("MODEL_REPO"),
-            filename=MODEL_FILE,
-            local_dir=str(MODEL_DIR),
-            token=HF_TOKEN,
-        )
-        logger.info("✔ Download complete")
-
-    # instantiate llama.cpp
-    global llm
-    llm = Llama(
-        model_path=str(MODEL_PATH),
-        n_ctx=1024,
-        n_batch=64,
-        n_threads=os.cpu_count(),  # use all available vCPUs
-        n_gpu_layers=0,
-        use_mlock=True,
-        use_mmap=True,
-        f16_kv=True,
-        batch_prefill=True,
-        prefill_logits=False,
-        verbose=True,
-    )
-    logger.info("✔ Llama model loaded")
-
-    # warm-up the model
-    try:
-        logger.info("🔄 Warming up model…")
-        llm.create_chat_completion(
-            messages=[{"role": "user", "content": "/no_think ok"}],
-            max_tokens=1,
-            stream=False,
-        )
-        logger.info("✔ Warm-up complete")
-    except Exception as e:
-        logger.error(f"Warm-up failed: {e}")
-
-    # schedule periodic keep-alive so the Space never goes idle
-    asyncio.create_task(_keep_model_warm())
-    logger.info("🔄 Keep-alive warm-up task scheduled (every 12 min)")
-
-async def _keep_model_warm():
-    while True:
-        try:
-            logger.debug("…warm-up ping")
-            # a 1-token echo
-            llm.create_chat_completion(
-                messages=[{"role": "user", "content": "/no_think ok"}],
-                max_tokens=1,
-                stream=False,
-            )
-            logger.debug("…ping done")
-        except Exception as e:
-            logger.warning(f"Warm-up ping failed: {e}")
-        # HF Spaces idle timeout is ~15 min; ping every 12
-        await asyncio.sleep(12 * 60)
+# ─── Instantiate & warm at import time ─────────────────────────────────────
+if HF_TOKEN:
+    login(token=HF_TOKEN)
+
+logger.info("Loading & warming model (this may take a moment)...")
+llm = Llama(
+    model_path=str(MODEL_PATH),
+    n_ctx=1024,
+    n_threads=os.cpu_count(),  # saturate all vCPUs
+    n_gpu_layers=0,
+    use_mlock=True,
+    f16_kv=True,
+)
+# one-token warm-up to avoid any first-request penalty
+llm.create_chat_completion(
+    messages=[{"role":"user","content":"/no_think ok"}],
+    max_tokens=1,
+    stream=False,
+)
+logger.info("Model ready")
 
-# ─── OpenAI-compatible endpoint ─────────────────────────────────────────────
+# ─── OpenAI-compatible endpoint ─────────────────────────────────────────────
 @app.post("/v1/chat/completions")
-async def chat_completions(req: dict):
+async def chat(req: dict):
     if req.get("model") != "llama-cpp":
-        raise HTTPException(status_code=404, detail="Model not found")
-
-    # call into llama.cpp
+        raise HTTPException(404, "Model not found")
     resp = llm.create_chat_completion(
         messages=req["messages"],
         max_tokens=req.get("max_tokens", 256),
@@ -116,15 +52,13 @@ async def chat_completions(req: dict):
         top_p=req.get("top_p", 1.0),
         stream=False,
     )
-
-    # repackage into OpenAI JSON
     return JSONResponse({
        "id": resp["id"],
        "object": "chat.completion",
        "created": resp.get("created", int(time.time())),
        "model": "llama-cpp",
        "choices": [{
-           "index": 0,
+           "index": 0,
            "message": {
                "role": resp["choices"][0]["message"]["role"],
                "content": resp["choices"][0]["message"]["content"],