Tim Luka Horstmann committed
Commit 470d3ad · Parent(s): 41c6a1d

Swap to pre-model packing

Files changed:
- Dockerfile +39 -6
- llm_server.py +31 -97
Dockerfile CHANGED

@@ -1,5 +1,11 @@
+# 1) Base
 FROM python:3.10-slim
 
+# 2) Build-time HF token
+ARG HF_TOKEN
+ENV HF_TOKEN=${HF_TOKEN}
+
+# 3) Environment
 ENV DEBIAN_FRONTEND=noninteractive \
     RUSTUP_HOME=/root/.rustup \
     CARGO_HOME=/root/.cargo \
@@ -11,7 +17,7 @@ ENV DEBIAN_FRONTEND=noninteractive \
 
 WORKDIR /app
 
-#
+# 4) System deps + Rust
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential cmake git curl wget ninja-build libgomp1 ca-certificates \
@@ -20,22 +26,49 @@ RUN apt-get update && \
     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
     rustup default stable
 
-#
-RUN mkdir -p /app/cache /app/pretrained_models/llm &&
+# 5) Prepare directories
+RUN mkdir -p /app/cache /app/pretrained_models/llm && \
+    chmod -R 777 /app/cache /app/pretrained_models/llm
 
-# Python deps (except llama-cpp-python)
+# 6) Install Python deps (except llama-cpp-python)
 COPY requirements.txt .
 RUN sed -i '/llama-cpp-python/d' requirements.txt && \
     pip install --no-cache-dir -r requirements.txt
 
-#
+# 7) Build llama-cpp-python from source
 RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python && \
     cd /tmp/llama-cpp-python && \
     git submodule update --init --recursive && \
     python -m pip install --no-cache-dir . && \
     rm -rf /tmp/llama-cpp-python
 
-#
+# 8) Install huggingface_hub CLI
+RUN pip install --no-cache-dir huggingface_hub
+
+# 9) Clone llama.cpp for quantize tool
+RUN git clone https://github.com/ggerganov/llama.cpp.git /tmp/llama.cpp && \
+    cd /tmp/llama.cpp && make quantize
+
+# 10) Download & pre-quantize model (no runtime repack)
+RUN python3 - <<EOF
+import os
+from huggingface_hub import login, hf_hub_download
+if os.getenv('HF_TOKEN'): login(token=os.getenv('HF_TOKEN'))
+hf_hub_download(repo_id=os.getenv('MODEL_REPO'),
+                filename=os.getenv('MODEL_FILE'),
+                local_dir='/app/pretrained_models/llm',
+                token=os.getenv('HF_TOKEN'))
+EOF
+
+RUN /tmp/llama.cpp/build/quantize \
+    /app/pretrained_models/llm/${MODEL_FILE} \
+    /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
+    q4_K_M && \
+    mv /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
+       /app/pretrained_models/llm/${MODEL_FILE} && \
+    rm -rf /tmp/llama.cpp
+
+# 11) Copy server
 COPY llm_server.py /app/llm_server.py
 
 EXPOSE 7860
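For reference, the build-time download-and-repack step above can be reproduced outside the image with a short standalone script. The following is only a sketch of that flow, not part of the commit: it assumes MODEL_REPO and MODEL_FILE are supplied through the environment (the Dockerfile itself only declares HF_TOKEN as a build argument), and it assumes a llama.cpp quantize binary exists at the hypothetical path in QUANTIZE_BIN, which may differ depending on how llama.cpp was built.

# Sketch of the build-time pre-packing step as a standalone script.
# Assumptions: MODEL_REPO / MODEL_FILE are set in the environment, and
# QUANTIZE_BIN points at a locally built llama.cpp quantize binary.
import os
import shutil
import subprocess
from pathlib import Path

from huggingface_hub import login, hf_hub_download

MODEL_DIR = Path("/app/pretrained_models/llm")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

MODEL_REPO = os.environ["MODEL_REPO"]    # GGUF repo id (assumed to be provided)
MODEL_FILE = os.environ["MODEL_FILE"]    # e.g. "Qwen3-0.6B-Q4_K_M.gguf"
QUANTIZE_BIN = os.getenv("QUANTIZE_BIN", "/tmp/llama.cpp/build/quantize")  # assumed path

if os.getenv("HF_TOKEN"):
    login(token=os.getenv("HF_TOKEN"))

# 1) Download the GGUF file to the directory the server expects.
hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE,
                local_dir=str(MODEL_DIR), token=os.getenv("HF_TOKEN"))

# 2) Repack/quantize once, then replace the original file in place,
#    so nothing has to be repacked at server startup.
src = MODEL_DIR / MODEL_FILE
packed = MODEL_DIR / f"{MODEL_FILE}.packed.gguf"
subprocess.run([QUANTIZE_BIN, str(src), str(packed), "q4_K_M"], check=True)
shutil.move(str(packed), str(src))

Doing this once at build time is what lets llm_server.py below load the already-packed GGUF directly, with no repacking when the container starts; whether "make quantize" actually places the binary under build/ depends on the llama.cpp version, so treat that path as an assumption.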
llm_server.py CHANGED

@@ -1,114 +1,50 @@
-import os
-import time
-import logging
-import asyncio
+import os, time, logging, asyncio
 from pathlib import Path
-
 from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
-
-from huggingface_hub import login, hf_hub_download
+from huggingface_hub import login
 from llama_cpp import Llama
 
-# ───
+# ─── Logging ────────────────────────────────────────────────────────────────
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger("llm_server")
 
-# ─── FastAPI setup
+# ─── FastAPI setup ──────────────────────────────────────────────────────────
 app = FastAPI()
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# ─── model paths & env ──────────────────────────────────────────────────────
-MODEL_DIR = Path("/app/pretrained_models/llm")
-MODEL_DIR.mkdir(parents=True, exist_ok=True)
 
+# ─── Model paths & token ────────────────────────────────────────────────────
+MODEL_DIR = Path("/app/pretrained_models/llm")
+MODEL_DIR.mkdir(exist_ok=True)
 MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3-0.6B-Q4_K_M.gguf")
 MODEL_PATH = MODEL_DIR / MODEL_FILE
-
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# ───
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# instantiate llama.cpp
-global llm
-llm = Llama(
-    model_path=str(MODEL_PATH),
-    n_ctx=1024,
-    n_batch=64,
-    n_threads=os.cpu_count(),  # use all available vCPUs
-    n_gpu_layers=0,
-    use_mlock=True,
-    use_mmap=True,
-    f16_kv=True,
-    batch_prefill=True,
-    prefill_logits=False,
-    verbose=True,
-)
-logger.info("Llama model loaded")
-
-# warm-up the model
-try:
-    logger.info("Warming up model…")
-    llm.create_chat_completion(
-        messages=[{"role": "user", "content": "/no_think ok"}],
-        max_tokens=1,
-        stream=False,
-    )
-    logger.info("Warm-up complete")
-except Exception as e:
-    logger.error(f"Warm-up failed: {e}")
-
-# schedule periodic keep-alive so the Space never goes idle
-asyncio.create_task(_keep_model_warm())
-logger.info("Keep-alive warm-up task scheduled (every 12 min)")
-
-async def _keep_model_warm():
-    while True:
-        try:
-            logger.debug("…warm-up ping")
-            # a 1-token echo
-            llm.create_chat_completion(
-                messages=[{"role": "user", "content": "/no_think ok"}],
-                max_tokens=1,
-                stream=False,
-            )
-            logger.debug("…ping done")
-        except Exception as e:
-            logger.warning(f"Warm-up ping failed: {e}")
-        # HF Spaces idle timeout is ~15 min; ping every 12
-        await asyncio.sleep(12 * 60)
+# ─── Instantiate & warm at import time ──────────────────────────────────────
+if HF_TOKEN:
+    login(token=HF_TOKEN)
+
+logger.info("Loading & warming model (this may take a moment)...")
+llm = Llama(
+    model_path=str(MODEL_PATH),
+    n_ctx=1024,
+    n_threads=os.cpu_count(),  # saturate all vCPUs
+    n_gpu_layers=0,
+    use_mlock=True,
+    f16_kv=True,
+)
+# one-token warm-up to avoid any first-request penalty
+llm.create_chat_completion(
+    messages=[{"role":"user","content":"/no_think ok"}],
+    max_tokens=1,
+    stream=False,
+)
+logger.info("Model ready")
 
-# ─── OpenAI-compatible endpoint
+# ─── OpenAI-compatible endpoint ─────────────────────────────────────────────
 @app.post("/v1/chat/completions")
-async def chat_completions(req: dict):
+async def chat(req: dict):
     if req.get("model") != "llama-cpp":
-        raise HTTPException(
-
-    # call into llama.cpp
+        raise HTTPException(404, "Model not found")
     resp = llm.create_chat_completion(
         messages=req["messages"],
         max_tokens=req.get("max_tokens", 256),
@@ -116,15 +52,13 @@ async def chat_completions(req: dict):
         top_p=req.get("top_p", 1.0),
         stream=False,
     )
-
-    # repackage into OpenAI JSON
     return JSONResponse({
         "id": resp["id"],
        "object": "chat.completion",
         "created": resp.get("created", int(time.time())),
         "model": "llama-cpp",
         "choices": [{
-            "index":
+            "index": 0,
             "message": {
                 "role": resp["choices"][0]["message"]["role"],
                 "content": resp["choices"][0]["message"]["content"],
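With the server loading and warming the model at import time, a request only needs the OpenAI-style payload handled above. The sketch below is an illustrative client, not part of the commit: it assumes the container is reachable on localhost:7860 (the port the Dockerfile EXPOSEs) and that the requests package is installed; the "model" field must be exactly "llama-cpp", otherwise the handler raises the 404 shown in the diff.

# Minimal client sketch for the /v1/chat/completions endpoint above.
# Assumptions: server running locally on port 7860, `requests` installed.
import requests

payload = {
    "model": "llama-cpp",                              # anything else triggers the 404 branch
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64,                                  # optional; server defaults to 256
    "top_p": 1.0,                                      # optional; server defaults to 1.0
}

resp = requests.post("http://localhost:7860/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])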