ateetvatan committed
Commit 62d49a1 · 1 Parent(s): dc24051

Switched to a 4-bit quantized (GGUF Q4_K_M) build of OpenChat 3.5 7B, since the full openchat-3.5 model requires more than 16 GB of RAM

Files changed (5)
  1. Dockerfile +21 -15
  2. app.py +37 -50
  3. env.example +4 -1
  4. model_loader.py +15 -11
  5. requirements.txt +5 -12
Dockerfile CHANGED
@@ -1,34 +1,40 @@
 
+ # Base image
  FROM python:3.10-slim

- # Create a non-root user with UID 1000
+ # Create non-root user (required for Hugging Face Spaces)
  RUN useradd -m -u 1000 user

- # Switch to that user
- USER user
+ # Set paths
  ENV HOME=/home/user
- WORKDIR /home/user/app
-
- # Set cache dirs inside user home
+ ENV APP_HOME=$HOME/app
  ENV HF_HOME=$HOME/.hf_home
- ENV TRANSFORMERS_CACHE=$HOME/.cache/transformers

- # Create cache directories
- RUN mkdir -p $HF_HOME $TRANSFORMERS_CACHE
+ # Use app directory
+ WORKDIR $APP_HOME

- # Switch back to root to install dependencies
+ # Switch to root for system setup
  USER root
- RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

- # Install Python deps under user home
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements.txt and install Python dependencies
  COPY --chown=user:user requirements.txt .
  RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt

- # Copy app files
+ # Copy app code
  COPY --chown=user:user . .

- # Expose port and switch user
- EXPOSE 7860
+ # Ensure cache directories exist and are user-writable
+ RUN mkdir -p $HF_HOME && chown -R user:user $HF_HOME
+
+ # Set to non-root user (required for HF Spaces)
  USER user

+ # Expose default port
+ EXPOSE 7860
+
  # Entrypoint
  CMD ["python", "app.py"]
app.py CHANGED
@@ -1,92 +1,79 @@
  """
- This is the main file for the OpenChat-3.5 LLM API.
- -model_loader.py file to load the model and tokenizer.
+ app.py FastAPI API for Quantized OpenChat 3.5 (GGUF) using ctransformers
  """

  import logging
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
- from model_loader import tokenizer, model
+ from model_loader import model
  import uvicorn
- import torch
+ from ctransformers import AutoTokenizer  # Add this at the top

+ # Logger
  logger = logging.getLogger(__name__)
  logging.basicConfig(level=logging.INFO)

- # Initialize FastAPI app
+ tokenizer = model.tokenize  # Use model's built-in tokenizer if available
+
+ # FastAPI app
  app = FastAPI(
      title="masx-openchat-llm",
-     description="MASX AI service exposing the OpenChat-3.5 LLM as an inference endpoint",
+     description="MASX AI service exposing a quantized OpenChat-3.5 model (GGUF)",
      version="1.0.0",
  )


- # Request ********schema*******
+ # Request schema
  class PromptRequest(BaseModel):
      prompt: str
      max_tokens: int = 256
-     temperature: float = 0.0  # Deterministic by default
+     temperature: float = 0.0


- # Response ********schema*******
+ # Response schema
  class ChatResponse(BaseModel):
      response: str


  @app.get("/status")
  async def status():
-     """Check model status and max supported tokens."""
      try:
-         max_context = getattr(model.config, "max_position_embeddings", "unknown")
          return {
              "status": "ok",
-             "model": model.name_or_path,
-             "max_context_tokens": max_context,
+             "model_path": getattr(model, "model_path", "unknown"),
+             "model_type": getattr(model, "model_type", "unknown"),
+             "context_length": getattr(model, "context_length", "unknown"),
+             "gpu_layers": getattr(model, "gpu_layers", 0),
          }
      except Exception as e:
-         logger.error("Status error: %s", str(e))
-         raise HTTPException(status_code=500, detail=str(e))
+         logger.error("Status check failed: %s", str(e), exc_info=True)
+         raise HTTPException(status_code=500, detail="Model status check failed")


  @app.post("/chat", response_model=ChatResponse)
  async def chat(req: PromptRequest):
-     """OpenChat-3.5 Run inference prompt"""
      try:
-         logger.info("Received prompt: %s", req.prompt)
-
-         # Dynamically choose device at request time
-         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         logger.info(f"Using device: {device}")
-
-         # Move model to device if not
-         if next(model.parameters()).device != device:
-             logger.info("Moving model to %s", device)
-             model.to(device)
-
-         # Tokenize input
-         inputs = tokenizer(req.prompt, return_tensors="pt").to(device)
-
-         # Generation parameters
-         gen_kwargs = {
-             "max_new_tokens": req.max_tokens,
-             "temperature": req.temperature,
-             "do_sample": req.temperature > 0,
-         }
-
-         # Generate output
-         outputs = model.generate(**inputs, **gen_kwargs)
-         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-         # Trim echoed prompt if present
-         response_text = generated_text[len(req.prompt) :].strip()
-
-         logger.info("Generated response: %s", response_text)
-         return ChatResponse(response=response_text)
-
+         logger.info("Prompt: %s", req.prompt)
+
+         prompt_tokens = model.tokenize(req.prompt)
+         if len(prompt_tokens) > model.context_length:
+             raise HTTPException(
+                 status_code=400,
+                 detail=f"Prompt too long ({len(prompt_tokens)} tokens). Max context: {model.context_length}",
+             )
+
+         response = model(
+             req.prompt,
+             max_new_tokens=req.max_tokens,
+             temperature=req.temperature,
+             stop=["</s>"],
+         )
+         logger.info("Response: %s", response)
+         return ChatResponse(response=response.strip())
      except Exception as e:
-         logger.error("Inference failed: %s", str(e), exc_info=True)
-         raise HTTPException(status_code=500, detail="Inference failure: " + str(e))
+         logger.error("Chat error: %s", str(e), exc_info=True)
+         raise HTTPException(status_code=500, detail="Inference failure")


  if __name__ == "__main__":
-     uvicorn.run("app:app", host="0.0.0.0", port=8080, log_level="info")
+     uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")
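
With the service now bound to port 7860 (matching EXPOSE in the Dockerfile), the endpoints can be exercised with a standard-library client. A minimal sketch; the base URL and prompt are illustrative:

# Sketch: call /status and /chat on a locally running container.
import json
import urllib.request

BASE = "http://localhost:7860"

# Confirm the GGUF model loaded and report its context length
with urllib.request.urlopen(f"{BASE}/status") as resp:
    print(json.load(resp))

# Request a short deterministic completion
payload = json.dumps(
    {"prompt": "What is MASX AI?", "max_tokens": 64, "temperature": 0.0}
).encode("utf-8")
request = urllib.request.Request(
    f"{BASE}/chat",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as resp:
    print(json.load(resp)["response"])
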
env.example CHANGED
@@ -1 +1,4 @@
- MODEL_NAME = "openchat/openchat-3.5-1210"
+ MODEL_NAME = "openchat/openchat-3.5-1210"
+ MODEL_REPO=TheBloke/openchat_3.5-GGUF
+ MODEL_FILE=openchat_3.5.Q4_K_M.gguf
+ MODEL_TYPE=mistral
model_loader.py CHANGED
@@ -1,20 +1,24 @@
- # model_loader.py
  import os
+ from ctransformers import AutoModelForCausalLM

- # Safe fallback if ENV vars are not set (e.g., during local dev)
+ # Optional: create a local cache dir for gguf model if needed
  os.environ.setdefault("HF_HOME", os.path.expanduser("~/.hf_home"))
- os.environ.setdefault("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/transformers"))

- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch, os
+ # Load environment variables if you plan to use .env (optional)
  from dotenv import load_dotenv

- # Load environment variables
  load_dotenv()
- MODEL_NAME = os.getenv("MODEL_NAME", "openchat/openchat-3.5-1210")

- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ # Model path or name from environment, fallback to default OpenChat
+ MODEL_REPO = os.getenv("MODEL_REPO", "TheBloke/openchat_3.5-GGUF")
+ MODEL_FILE = os.getenv("MODEL_FILE", "openchat_3.5.Q4_K_M.gguf")
+ MODEL_TYPE = os.getenv("MODEL_TYPE", "mistral")  # OpenChat 3.5 is Mistral-compatible

- # Load model initially on CPU
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cpu")
+ # Load quantized GGUF model using ctransformers
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_REPO,
+     model_file=MODEL_FILE,
+     model_type=MODEL_TYPE,
+     gpu_layers=0,
+     local_files_only=False,
+ )
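
Since ctransformers returns a callable LLM object, the loader can be smoke-tested without starting FastAPI. A minimal sketch, run from the project root; the OpenChat "GPT4 Correct" prompt template is an assumption about how the model is usually prompted, not something this repo enforces:

# Sketch: exercise the same tokenize / context_length / __call__ surface that app.py relies on.
from model_loader import model

prompt = "GPT4 Correct User: Say hello in one sentence.<|end_of_turn|>GPT4 Correct Assistant:"

print("context_length:", model.context_length)
print("prompt tokens:", len(model.tokenize(prompt)))

# The LLM object is callable, exactly as /chat uses it
text = model(prompt, max_new_tokens=32, stop=["<|end_of_turn|>"])
print(text.strip())
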
requirements.txt CHANGED
@@ -1,17 +1,10 @@
- # Core FastAPI dependencies
+ # Core API
  fastapi>=0.104.0
  uvicorn[standard]>=0.24.0
  pydantic>=2.5.0

- # Machine Learning and Transformers
- torch>=2.1.0
- transformers>=4.36.0
- accelerate>=0.25.0
+ # Quantized LLM support
+ ctransformers>=0.2.27

- # Additional utilities
- numpy>=1.24.0
- requests>=2.31.0
-
- # Optional: For better performance and monitoring
- # tensorboard>=2.15.0  # Uncomment if you need training monitoring
- # wandb>=0.16.0  # Uncomment if you need experiment tracking
+ # Optional: For local .env files
+ python-dotenv>=1.0.0