Spaces: Sleeping

ateetvatan committed 62d49a1 (1 parent: dc24051)

Switched to the 4-bit GGUF-quantized version of OpenChat 3.5 7B, since OpenChat 3.5 requires more than 16 GB of RAM.

Files changed:
- Dockerfile (+21 -15)
- app.py (+37 -50)
- env.example (+4 -1)
- model_loader.py (+15 -11)
- requirements.txt (+5 -12)
Dockerfile CHANGED
@@ -1,34 +1,40 @@
+# Base image
 FROM python:3.10-slim
 
-# Create
+# Create non-root user (required for Hugging Face Spaces)
 RUN useradd -m -u 1000 user
 
-#
-USER user
+# Set paths
 ENV HOME=/home/user
-
-
-# Set cache dirs inside user home
+ENV APP_HOME=$HOME/app
 ENV HF_HOME=$HOME/.hf_home
-ENV TRANSFORMERS_CACHE=$HOME/.cache/transformers
 
-#
-
+# Use app directory
+WORKDIR $APP_HOME
 
-# Switch
+# Switch to root for system setup
 USER root
-RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
-# Install
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    git curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements.txt and install Python dependencies
 COPY --chown=user:user requirements.txt .
 RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
 
-# Copy app
+# Copy app code
 COPY --chown=user:user . .
 
-#
-
+# Ensure cache directories exist and are user-writable
+RUN mkdir -p $HF_HOME && chown -R user:user $HF_HOME
+
+# Set to non-root user (required for HF Spaces)
 USER user
 
+# Expose default port
+EXPOSE 7860
+
 # Entrypoint
 CMD ["python", "app.py"]
app.py CHANGED
@@ -1,92 +1,79 @@
 """
-
-model_loader.py file to load the model and tokenizer.
+app.py FastAPI API for Quantized OpenChat 3.5 (GGUF) using ctransformers
 """
 
 import logging
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from model_loader import
+from model_loader import model
 import uvicorn
-import
+from ctransformers import AutoTokenizer  # Add this at the top
 
+# Logger
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
-#
+tokenizer = model.tokenize  # Use model's built-in tokenizer if available
+
+# FastAPI app
 app = FastAPI(
     title="masx-openchat-llm",
-    description="MASX AI service exposing
+    description="MASX AI service exposing a quantized OpenChat-3.5 model (GGUF)",
     version="1.0.0",
 )
 
 
-# Request
+# Request schema
 class PromptRequest(BaseModel):
     prompt: str
     max_tokens: int = 256
-    temperature: float = 0.0
+    temperature: float = 0.0
 
 
-# Response
+# Response schema
 class ChatResponse(BaseModel):
     response: str
 
 
 @app.get("/status")
 async def status():
-    """Check model status and max supported tokens."""
     try:
-        max_context = getattr(model.config, "max_position_embeddings", "unknown")
         return {
             "status": "ok",
-            "
-            "
+            "model_path": getattr(model, "model_path", "unknown"),
+            "model_type": getattr(model, "model_type", "unknown"),
+            "context_length": getattr(model, "context_length", "unknown"),
+            "gpu_layers": getattr(model, "gpu_layers", 0),
         }
     except Exception as e:
-        logger.error("Status
-        raise HTTPException(status_code=500, detail=
+        logger.error("Status check failed: %s", str(e), exc_info=True)
+        raise HTTPException(status_code=500, detail="Model status check failed")
 
 
 @app.post("/chat", response_model=ChatResponse)
 async def chat(req: PromptRequest):
-    """OpenChat-3.5 Run inference prompt"""
     try:
-        logger.info("
-            "temperature": req.temperature,
-            "do_sample": req.temperature > 0,
-        }
-
-        # Generate output
-        outputs = model.generate(**inputs, **gen_kwargs)
-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        # Trim echoed prompt if present
-        response_text = generated_text[len(req.prompt) :].strip()
-
-        logger.info("Generated response: %s", response_text)
-        return ChatResponse(response=response_text)
-
+        logger.info("Prompt: %s", req.prompt)
+
+        prompt_tokens = model.tokenize(req.prompt)
+        if len(prompt_tokens) > model.context_length:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Prompt too long ({len(prompt_tokens)} tokens). Max context: {model.context_length}",
+            )
+
+        response = model(
+            req.prompt,
+            max_new_tokens=req.max_tokens,
+            temperature=req.temperature,
+            stop=["</s>"],
+        )
+        logger.info("Response: %s", response)
+        return ChatResponse(response=response.strip())
     except Exception as e:
-        logger.error("
-        raise HTTPException(status_code=500, detail="Inference failure
+        logger.error("Chat error: %s", str(e), exc_info=True)
+        raise HTTPException(status_code=500, detail="Inference failure")
 
 
 if __name__ == "__main__":
-    uvicorn.run("app:app", host="0.0.0.0", port=
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")
env.example CHANGED
@@ -1 +1,4 @@
-MODEL_NAME = "openchat/openchat-3.5-1210"
+MODEL_NAME = "openchat/openchat-3.5-1210"
+MODEL_REPO=TheBloke/openchat_3.5-GGUF
+MODEL_FILE=openchat_3.5.Q4_K_M.gguf
+MODEL_TYPE=mistral
model_loader.py CHANGED
@@ -1,20 +1,24 @@
-# model_loader.py
 import os
+from ctransformers import AutoModelForCausalLM
 
-#
+# Optional: create a local cache dir for gguf model if needed
 os.environ.setdefault("HF_HOME", os.path.expanduser("~/.hf_home"))
-os.environ.setdefault("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/transformers"))
 
-
-import torch, os
+# Load environment variables if you plan to use .env (optional)
 from dotenv import load_dotenv
 
-# Load environment variables
 load_dotenv()
-MODEL_NAME = os.getenv("MODEL_NAME", "openchat/openchat-3.5-1210")
 
-#
-
+# Model path or name from environment, fallback to default OpenChat
+MODEL_REPO = os.getenv("MODEL_REPO", "TheBloke/openchat_3.5-GGUF")
+MODEL_FILE = os.getenv("MODEL_FILE", "openchat_3.5.Q4_K_M.gguf")
+MODEL_TYPE = os.getenv("MODEL_TYPE", "mistral")  # OpenChat 3.5 is Mistral-compatible
 
-# Load model
-model = AutoModelForCausalLM.from_pretrained(
+# Load quantized GGUF model using ctransformers
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_REPO,
+    model_file=MODEL_FILE,
+    model_type=MODEL_TYPE,
+    gpu_layers=0,
+    local_files_only=False,
+)
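Because model_loader.py now exposes the loaded ctransformers model directly, it can be sanity-checked without starting the API. The sketch below is an illustrative example, not part of the commit: it assumes the GGUF file can be downloaded (or is already cached under HF_HOME) and reuses only attributes and call patterns that app.py itself relies on.

# Hypothetical local check that the quantized GGUF model loads and generates.
# Importing model_loader triggers AutoModelForCausalLM.from_pretrained, which
# downloads openchat_3.5.Q4_K_M.gguf from TheBloke/openchat_3.5-GGUF on first use.
from model_loader import model

# Same attributes the /status endpoint reports, with the same fallbacks.
print(getattr(model, "model_type", "unknown"))
print(getattr(model, "context_length", "unknown"))

# The model object is callable, exactly as in app.py's /chat handler.
text = model(
    "Hello! Briefly introduce yourself.",  # illustrative prompt
    max_new_tokens=64,
    temperature=0.0,
    stop=["</s>"],
)
print(text.strip())

With gpu_layers=0, inference runs entirely on CPU, which matches the memory-constrained hardware mentioned in the commit message.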
requirements.txt CHANGED
@@ -1,17 +1,10 @@
-# Core
+# Core API
 fastapi>=0.104.0
 uvicorn[standard]>=0.24.0
 pydantic>=2.5.0
 
-#
-
-transformers>=4.36.0
-accelerate>=0.25.0
+# Quantized LLM support
+ctransformers>=0.2.27
 
-#
-
-requests>=2.31.0
-
-# Optional: For better performance and monitoring
-# tensorboard>=2.15.0  # Uncomment if you need training monitoring
-# wandb>=0.16.0  # Uncomment if you need experiment tracking
+# Optional: For local .env files
+python-dotenv>=1.0.0