ndc8 committed · Commit 358e717 · 1 Parent(s): 994c0b4

Update Dockerfile and application entry point for GGUF backend; optimize memory usage in model parameters and requirements

Files changed:
- Dockerfile +2 -2
- app.py +4 -3
- gemma_gguf_backend.py +23 -15
- requirements.txt +8 -11
Dockerfile
CHANGED
@@ -27,5 +27,5 @@ COPY --chown=user . .
 # Expose port 7860 (HF Spaces default)
 EXPOSE 7860
 
-# Command to run the application
-CMD ["python", "-m", "uvicorn", "
+# Command to run the GGUF backend application
+CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
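The updated CMD tells uvicorn to import module app and serve its `app` attribute on port 7860. For local testing outside Docker, an equivalent programmatic launch looks roughly like this; the helper file name is hypothetical and nothing below is part of this commit:

# run_local.py (hypothetical) - same effect as the Dockerfile CMD above
import uvicorn

if __name__ == "__main__":
    # "app:app" = module app.py, attribute `app` (the FastAPI instance)
    uvicorn.run("app:app", host="0.0.0.0", port=7860)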
app.py
CHANGED
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """
-Entry point for Hugging Face Spaces
-This file imports and runs the FastAPI application
+Entry point for Hugging Face Spaces - GGUF Backend
+This file imports and runs the GGUF FastAPI application optimized for memory constraints
 """
 
-
+# Import the GGUF backend instead of the transformers backend
+from gemma_gguf_backend import app
 
 if __name__ == "__main__":
     import uvicorn
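The diff cuts off after `import uvicorn`, so the tail of the `__main__` block is not visible here. Assuming it only starts the server on the port the Dockerfile exposes, the updated entry point plausibly reads as follows; the final `uvicorn.run` line is an assumption, not committed code:

#!/usr/bin/env python3
"""
Entry point for Hugging Face Spaces - GGUF Backend
This file imports and runs the GGUF FastAPI application optimized for memory constraints
"""

# Import the GGUF backend instead of the transformers backend
from gemma_gguf_backend import app

if __name__ == "__main__":
    import uvicorn
    # Assumed continuation: serve the imported app on the Spaces port
    uvicorn.run(app, host="0.0.0.0", port=7860)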
gemma_gguf_backend.py
CHANGED
@@ -52,7 +52,7 @@ class ChatMessage(BaseModel):
 class ChatCompletionRequest(BaseModel):
     model: str = Field(default="gemma-3n-e4b-it", description="The model to use for completion")
     messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
-    max_tokens: Optional[int] = Field(default=
+    max_tokens: Optional[int] = Field(default=256, ge=1, le=1024, description="Maximum tokens to generate (reduced for memory efficiency)")
     temperature: Optional[float] = Field(default=1.0, ge=0.0, le=2.0, description="Sampling temperature")
     top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0, description="Top-p sampling")
     top_k: Optional[int] = Field(default=64, ge=1, le=100, description="Top-k sampling")
@@ -121,20 +121,25 @@ async def lifespan(app: FastAPI):
     try:
         logger.info(f"📥 Loading Gemma 3n GGUF model from {current_model}...")
 
-        # Configure model parameters for
+        # Configure model parameters optimized for HF Spaces memory constraints
         llm = Llama.from_pretrained(
             repo_id=current_model,
-            filename="*
+            filename="*Q4_0.gguf",  # Use Q4_0 instead of Q4_K_M for lower memory usage
             verbose=True,
-            #
-            n_ctx=
-            n_threads=
-            n_gpu_layers
+            # Memory-optimized settings for HF Spaces
+            n_ctx=2048,  # Reduced context length to save memory (was 4096)
+            n_threads=2,  # Fewer threads for lower memory usage (was 4)
+            n_gpu_layers=0,  # Force CPU-only to avoid GPU memory issues
+            # Additional memory optimizations
+            n_batch=512,  # Smaller batch size to reduce memory peaks
+            use_mmap=True,  # Use memory mapping to reduce RAM usage
+            use_mlock=False,  # Don't lock memory pages
+            low_vram=True,  # Enable low VRAM mode for additional memory savings
             # Chat template for Gemma 3n format
             chat_format="gemma",  # Try built-in gemma format first
         )
 
-        logger.info("✅ Successfully loaded Gemma 3n GGUF model")
+        logger.info("✅ Successfully loaded Gemma 3n GGUF model with memory optimizations")
 
     except Exception as e:
         logger.error(f"❌ Failed to initialize Gemma 3n model: {e}")
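The same loader settings can be exercised outside the FastAPI lifespan as a quick smoke test. A sketch under the assumptions already present in the diff (the unsloth/gemma-3n-E4B-it-GGUF repo provides a Q4_0 file); `low_vram` is left out here because support for that keyword varies between llama-cpp-python releases:

from llama_cpp import Llama

# Mirror the memory-constrained settings from the lifespan handler
llm = Llama.from_pretrained(
    repo_id="unsloth/gemma-3n-E4B-it-GGUF",
    filename="*Q4_0.gguf",   # glob picks the Q4_0 quantization
    n_ctx=2048,              # smaller KV cache than the previous 4096
    n_threads=2,
    n_gpu_layers=0,          # CPU-only
    n_batch=512,
    use_mmap=True,
    use_mlock=False,
    chat_format="gemma",
    verbose=False,
)

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(out["choices"][0]["message"]["content"])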
@@ -172,12 +177,15 @@ def ensure_model_ready():
         # For demo mode, we'll allow the service to run even without a model
         pass
 
-def generate_response_gguf(messages: List[ChatMessage], max_tokens: int =
-    """Generate response using GGUF model via llama-cpp-python."""
+def generate_response_gguf(messages: List[ChatMessage], max_tokens: int = 256, temperature: float = 1.0, top_p: float = 0.95, top_k: int = 64) -> str:
+    """Generate response using GGUF model via llama-cpp-python (memory-optimized)."""
     if llm is None:
         # Demo mode response
         return "🤖 Demo mode: Gemma 3n model not loaded. This would be a real response from the Gemma 3n model. Please download the GGUF model from https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF"
 
+    # Limit max_tokens for memory efficiency on HF Spaces
+    max_tokens = min(max_tokens, 512)  # Cap at 512 tokens max
+
     try:
         # Use the chat completion method if available
         if hasattr(llm, 'create_chat_completion'):
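Past the cap, the function presumably forwards these values to llama-cpp-python's chat API; the body after `try:` is only partially shown in this hunk. A hedged sketch of the likely call shape (the helper name is hypothetical, not the committed code):

def _call_chat_completion(llm, messages, max_tokens, temperature, top_p, top_k):
    # Convert the pydantic ChatMessage objects into the dicts llama-cpp-python expects
    chat = [{"role": m.role, "content": m.content} for m in messages]
    result = llm.create_chat_completion(
        messages=chat,
        max_tokens=min(max_tokens, 512),  # same cap as generate_response_gguf
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
    )
    return result["choices"][0]["message"]["content"]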
@@ -451,15 +459,15 @@ def start_training(req: StartTrainingRequest):
         proc = _start_training_subprocess(job_id, req.model_dump())
         TRAIN_JOBS[job_id]["status"] = "running"
         TRAIN_JOBS[job_id]["pid"] = proc.pid
-
+        save_job(job_id)
         watcher = threading.Thread(target=_watch_process, args=(job_id, proc), daemon=True)
         watcher.start()
         return StartTrainingResponse(job_id=job_id, status="running", output_dir=output_dir)
     except Exception as e:
-
-
-
-
+        logger.exception("Failed to start training job")
+        TRAIN_JOBS[job_id]["status"] = "failed_to_start"
+        save_job(job_id)
+        raise HTTPException(status_code=500, detail=f"Failed to start training: {e}")
 
 @app.get("/train/status/{job_id}", response_model=TrainStatusResponse)
 def train_status(job_id: str):
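`save_job` itself is not shown in this diff; the requirements.txt change below only hints that job persistence relies on sqlite3 from the standard library. A speculative sketch of what such a helper could look like, with the database path, table name, and schema all assumed:

import json
import sqlite3

DB_PATH = "train_jobs.db"  # assumed location

def save_job(job_id: str) -> None:
    """Persist the in-memory TRAIN_JOBS entry so job state survives restarts (sketch)."""
    job = TRAIN_JOBS[job_id]
    with sqlite3.connect(DB_PATH) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS jobs (job_id TEXT PRIMARY KEY, payload TEXT)"
        )
        conn.execute(
            "INSERT OR REPLACE INTO jobs (job_id, payload) VALUES (?, ?)",
            (job_id, json.dumps(job, default=str)),
        )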
requirements.txt
CHANGED
@@ -1,24 +1,21 @@
 
 
-# Hugging Face Spaces requirements (
+# Hugging Face Spaces requirements (GGUF backend with llama-cpp-python)
 fastapi
 uvicorn
-transformers>=4.53.0
-torch
 python-dotenv
 httpx
 requests
 Pillow
 
-#
-
-tiktoken
-sentencepiece>=0.2.0
-tokenizers
-regex
+# GGUF model support - optimized for HF Spaces memory constraints
+llama-cpp-python>=0.3.14
 
-#
-
+# Optional: minimal transformers for fallback (much lighter than full transformers)
+# transformers[torch]
+
+# Database support for training job persistence
+# sqlite3 (included in Python standard library)
 
 # Optional: gradio for demo UI
 # gradio
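After trimming transformers/torch in favour of llama-cpp-python, a quick import check confirms the slimmed environment resolves; the `__version__` attributes are assumed to be exposed by the installed packages:

# Sanity check after `pip install -r requirements.txt`
import fastapi
import llama_cpp
import uvicorn

print("fastapi:", fastapi.__version__)
print("uvicorn:", uvicorn.__version__)
print("llama-cpp-python:", llama_cpp.__version__)  # expected to satisfy >=0.3.14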