Tim Luka Horstmann committed
Commit aa6b888 · 1 Parent(s): b845672

updated to llama

Files changed (3):
  1. Dockerfile +4 -2
  2. app.py +24 -14
  3. requirements.txt +7 -7
Dockerfile CHANGED
@@ -4,14 +4,16 @@ FROM python:3.10
 # Set working directory
 WORKDIR /app
 
-# Install system dependencies for ctransformers and runtime
+# Install system dependencies for llama_cpp
 RUN apt-get update && apt-get install -y \
     gcc \
     g++ \
     libffi-dev \
+    libgcc-s1 \
+    libstdc++6 \
     && rm -rf /var/lib/apt/lists/*
 
-# Set environment variables for cache and token
+# Set environment variables for cache
 ENV TRANSFORMERS_CACHE=/app/cache
 ENV HF_HOME=/app/cache
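The added libgcc-s1 and libstdc++6 packages cover the runtime link dependencies of llama-cpp-python's compiled extension. A quick way to confirm they resolve is to import the module inside the built image; a minimal smoke-test sketch (the image tag in the comment is hypothetical, and the version attribute is an assumption about how llama-cpp-python exposes it):

    # smoke_test.py -- verify the llama_cpp native extension loads in the
    # container, i.e. that libstdc++6/libgcc-s1 are resolvable at runtime.
    # Run with e.g.: docker run <image> python smoke_test.py
    import llama_cpp

    print("llama-cpp-python loaded, version:", llama_cpp.__version__)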
app.py CHANGED
@@ -6,8 +6,8 @@ import torch.nn.functional as F
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
-from ctransformers import AutoModelForCausalLM
-from huggingface_hub import login
+from llama_cpp import Llama
+from huggingface_hub import login, hf_hub_download
 import logging
 import os
 
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 
 app = FastAPI()
 
-# Authenticate with Hugging Fac
+# Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     logger.error("HF_TOKEN environment variable not set. Required for gated models.")
@@ -39,15 +39,20 @@ try:
     embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load Gemma 3 model with ctransformers
-    logger.info("Loading Gemma 3 model")
-    generator = AutoModelForCausalLM.from_pretrained(
-        "google/gemma-3-12b-it-qat-q4_0-gguf",
-        local_files_only=False,
-        model_type="gemma",
-        model_file="gemma-3-12b-it-q4_0.gguf",
-    )
-    logger.info("Gemma 3 model loaded")
+    # Load Gemma 3 1B model with llama_cpp
+    logger.info("Loading Gemma 3 1B model")
+    model_path = hf_hub_download(
+        repo_id="google/gemma-3-1b-it-qat-q4_0-gguf",
+        filename="gemma-3-1b-it-q4_0.gguf",
+        local_dir="/app/cache" if os.getenv("HF_HOME") else None,  # Use cache dir in Docker
+        token=hf_token,
+    )
+    generator = Llama(
+        model_path=model_path,
+        n_ctx=2048,  # Context length
+        n_threads=4,  # Adjust based on CPU cores
+    )
+    logger.info("Gemma 3 1B model loaded")
 
 except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
@@ -73,9 +78,14 @@ def stream_response(query):
             f"Question: {query}\nAnswer:"
         )
 
-        # Stream response with ctransformers
-        for token in generator(prompt, max_new_tokens=512, stream=True):
-            yield f"data: {token}\n\n"
+        # Stream response with llama_cpp
+        for chunk in generator(
+            prompt,
+            max_tokens=512,
+            stream=True,
+            stop=["[DONE]"],
+        ):
+            yield f"data: {chunk['choices'][0]['text']}\n\n"
         yield "data: [DONE]\n\n"
     except Exception as e:
         logger.error(f"Error in stream_response: {str(e)}")
requirements.txt CHANGED
@@ -1,7 +1,7 @@
-fastapi
-uvicorn
-sentence-transformers
-torch
-numpy
-ctransformers
-huggingface_hub
+fastapi==0.115.0
+uvicorn==0.31.0
+sentence-transformers==3.1.1
+torch==2.4.1
+numpy==1.26.4
+llama-cpp-python==0.3.1
+huggingface_hub==0.30.1
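Pinning exact versions makes image builds reproducible, so a transitive upgrade of llama-cpp-python or torch cannot silently change behavior between builds. A small sanity-check sketch that an installed environment matches these pins, using only the standard library:

    # check_pins.py -- compare installed distributions against requirements.txt pins.
    from importlib.metadata import version

    pinned = {
        "fastapi": "0.115.0",
        "uvicorn": "0.31.0",
        "sentence-transformers": "3.1.1",
        "torch": "2.4.1",
        "numpy": "1.26.4",
        "llama-cpp-python": "0.3.1",
        "huggingface_hub": "0.30.1",
    }
    for name, want in pinned.items():
        got = version(name)
        status = "OK" if got == want else f"MISMATCH (installed {got})"
        print(f"{name}=={want}: {status}")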