ndc8 committed on
Commit 358e717 · 1 Parent(s): 994c0b4

Update Dockerfile and application entry point for GGUF backend; optimize memory usage in model parameters and requirements

Files changed (4)
  1. Dockerfile +2 -2
  2. app.py +4 -3
  3. gemma_gguf_backend.py +23 -15
  4. requirements.txt +8 -11
Dockerfile CHANGED
@@ -27,5 +27,5 @@ COPY --chown=user . .
 # Expose port 7860 (HF Spaces default)
 EXPOSE 7860
 
-# Command to run the application
-CMD ["python", "-m", "uvicorn", "backend_service:app", "--host", "0.0.0.0", "--port", "7860"]
+# Command to run the GGUF backend application
+CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """
-Entry point for Hugging Face Spaces
-This file imports and runs the FastAPI application from backend_service
+Entry point for Hugging Face Spaces - GGUF Backend
+This file imports and runs the GGUF FastAPI application optimized for memory constraints
 """
 
-from backend_service import app
+# Import the GGUF backend instead of the transformers backend
+from gemma_gguf_backend import app
 
 if __name__ == "__main__":
     import uvicorn
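For reference, a minimal sketch of the full entry point after this change. The hunk ends at "import uvicorn", so the uvicorn.run arguments below are an assumption; port 7860 is taken from the Dockerfile's EXPOSE line.

#!/usr/bin/env python3
"""
Entry point for Hugging Face Spaces - GGUF Backend
This file imports and runs the GGUF FastAPI application optimized for memory constraints
"""

# Import the GGUF backend instead of the transformers backend
from gemma_gguf_backend import app

if __name__ == "__main__":
    import uvicorn

    # Illustrative assumption: serve on all interfaces at the HF Spaces default port.
    uvicorn.run(app, host="0.0.0.0", port=7860)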
gemma_gguf_backend.py CHANGED
@@ -52,7 +52,7 @@ class ChatMessage(BaseModel):
 class ChatCompletionRequest(BaseModel):
     model: str = Field(default="gemma-3n-e4b-it", description="The model to use for completion")
     messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
-    max_tokens: Optional[int] = Field(default=512, ge=1, le=2048, description="Maximum tokens to generate")
+    max_tokens: Optional[int] = Field(default=256, ge=1, le=1024, description="Maximum tokens to generate (reduced for memory efficiency)")
     temperature: Optional[float] = Field(default=1.0, ge=0.0, le=2.0, description="Sampling temperature")
     top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0, description="Top-p sampling")
     top_k: Optional[int] = Field(default=64, ge=1, le=100, description="Top-k sampling")
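The tightened max_tokens bounds are enforced by Pydantic before any generation happens. A minimal sketch of that validation behavior, using a stripped-down stand-in model rather than the repository's actual ChatCompletionRequest:

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

# Stand-in showing only the max_tokens constraint introduced in this commit.
class _MaxTokensOnly(BaseModel):
    max_tokens: Optional[int] = Field(default=256, ge=1, le=1024)

print(_MaxTokensOnly().max_tokens)                  # 256, the new default
print(_MaxTokensOnly(max_tokens=1024).max_tokens)   # accepted, exactly at the cap

try:
    _MaxTokensOnly(max_tokens=2048)                 # allowed before, rejected now
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["type"])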
 
@@ -121,20 +121,25 @@ async def lifespan(app: FastAPI):
     try:
         logger.info(f"📥 Loading Gemma 3n GGUF model from {current_model}...")
 
-        # Configure model parameters for Gemma 3n
+        # Configure model parameters optimized for HF Spaces memory constraints
         llm = Llama.from_pretrained(
             repo_id=current_model,
-            filename="*Q4_K_M.gguf",  # Use Q4_K_M quantization for good performance
+            filename="*Q4_0.gguf",  # Use Q4_0 instead of Q4_K_M for lower memory usage
             verbose=True,
-            # Gemma 3n specific settings
-            n_ctx=4096,  # Start with 4K context (can be increased to 32K)
-            n_threads=4,  # Adjust based on your CPU
-            n_gpu_layers=-1,  # Use all GPU layers if CUDA available, otherwise CPU
+            # Memory-optimized settings for HF Spaces
+            n_ctx=2048,  # Reduced context length to save memory (was 4096)
+            n_threads=2,  # Fewer threads for lower memory usage (was 4)
+            n_gpu_layers=0,  # Force CPU-only to avoid GPU memory issues
+            # Additional memory optimizations
+            n_batch=512,  # Smaller batch size to reduce memory peaks
+            use_mmap=True,  # Use memory mapping to reduce RAM usage
+            use_mlock=False,  # Don't lock memory pages
+            low_vram=True,  # Enable low VRAM mode for additional memory savings
            # Chat template for Gemma 3n format
             chat_format="gemma",  # Try built-in gemma format first
         )
 
-        logger.info("✅ Successfully loaded Gemma 3n GGUF model")
+        logger.info("✅ Successfully loaded Gemma 3n GGUF model with memory optimizations")
 
     except Exception as e:
         logger.error(f"❌ Failed to initialize Gemma 3n model: {e}")
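The n_ctx reduction is the main lever here: KV-cache memory grows linearly with context length, so halving n_ctx roughly halves that allocation. A back-of-the-envelope sketch with placeholder architecture numbers, not the real Gemma 3n E4B configuration; only the formula and the 4096 to 2048 change come from the diff:

# Rough illustration of why a smaller n_ctx saves memory.
# n_layers, n_kv_heads, head_dim, and bytes_per_val below are placeholders.
def kv_cache_bytes(n_ctx, n_layers=30, n_kv_heads=8, head_dim=128, bytes_per_val=2):
    # K and V caches: one entry per layer, per KV head, per context position.
    return 2 * n_layers * n_kv_heads * head_dim * bytes_per_val * n_ctx

for n_ctx in (4096, 2048):
    print(f"n_ctx={n_ctx}: ~{kv_cache_bytes(n_ctx) / 2**20:.0f} MiB KV cache")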
 
@@ -172,12 +177,15 @@ def ensure_model_ready():
     # For demo mode, we'll allow the service to run even without a model
     pass
 
-def generate_response_gguf(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 1.0, top_p: float = 0.95, top_k: int = 64) -> str:
-    """Generate response using GGUF model via llama-cpp-python."""
+def generate_response_gguf(messages: List[ChatMessage], max_tokens: int = 256, temperature: float = 1.0, top_p: float = 0.95, top_k: int = 64) -> str:
+    """Generate response using GGUF model via llama-cpp-python (memory-optimized)."""
     if llm is None:
         # Demo mode response
         return "🤖 Demo mode: Gemma 3n model not loaded. This would be a real response from the Gemma 3n model. Please download the GGUF model from https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF"
 
+    # Limit max_tokens for memory efficiency on HF Spaces
+    max_tokens = min(max_tokens, 512)  # Cap at 512 tokens max
+
     try:
         # Use the chat completion method if available
         if hasattr(llm, 'create_chat_completion'):
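The hunk assumes llm exposes llama-cpp-python's create_chat_completion API. A minimal sketch of that call shape with the same 512-token cap; the ChatMessage field names (role, content) and the surrounding error handling are assumptions, since they sit outside this hunk:

# Sketch of the llama-cpp-python chat-completion call shape assumed above;
# `llm` is a loaded llama_cpp.Llama instance.
def _chat_once(llm, messages, max_tokens=256, temperature=1.0, top_p=0.95, top_k=64):
    capped = min(max_tokens, 512)  # same cap as generate_response_gguf
    result = llm.create_chat_completion(
        messages=[{"role": m.role, "content": m.content} for m in messages],
        max_tokens=capped,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
    )
    # llama-cpp-python returns an OpenAI-style completion dict
    return result["choices"][0]["message"]["content"]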
 
@@ -451,15 +459,15 @@ def start_training(req: StartTrainingRequest):
         proc = _start_training_subprocess(job_id, req.model_dump())
         TRAIN_JOBS[job_id]["status"] = "running"
         TRAIN_JOBS[job_id]["pid"] = proc.pid
-        save_job(job_id)
+        save_job(job_id)
         watcher = threading.Thread(target=_watch_process, args=(job_id, proc), daemon=True)
         watcher.start()
         return StartTrainingResponse(job_id=job_id, status="running", output_dir=output_dir)
     except Exception as e:
-        logger.exception("Failed to start training job")
-        TRAIN_JOBS[job_id]["status"] = "failed_to_start"
-        save_job(job_id)
-        raise HTTPException(status_code=500, detail=f"Failed to start training: {e}")
+        logger.exception("Failed to start training job")
+        TRAIN_JOBS[job_id]["status"] = "failed_to_start"
+        save_job(job_id)
+        raise HTTPException(status_code=500, detail=f"Failed to start training: {e}")
 
 @app.get("/train/status/{job_id}", response_model=TrainStatusResponse)
 def train_status(job_id: str):
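save_job(job_id) is called on both the success and failure paths so the persisted record matches the in-memory status. Its implementation is not part of this diff; a hypothetical SQLite-backed version, suggested by the sqlite3 note added to requirements.txt (the real helper, table name, and schema may differ), could look like:

import json
import sqlite3

TRAIN_JOBS = {}  # in-memory job registry, as used by start_training above

def save_job(job_id: str, db_path: str = "train_jobs.db") -> None:
    """Hypothetical persistence helper: upsert one TRAIN_JOBS entry into SQLite."""
    job = TRAIN_JOBS[job_id]
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS train_jobs (job_id TEXT PRIMARY KEY, data TEXT)"
        )
        conn.execute(
            "INSERT INTO train_jobs (job_id, data) VALUES (?, ?) "
            "ON CONFLICT(job_id) DO UPDATE SET data = excluded.data",
            (job_id, json.dumps(job, default=str)),
        )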
requirements.txt CHANGED
@@ -1,24 +1,21 @@
 
 
-# Hugging Face Spaces requirements (transformers backend only)
+# Hugging Face Spaces requirements (GGUF backend with llama-cpp-python)
 fastapi
 uvicorn
-transformers>=4.53.0
-torch
 python-dotenv
 httpx
 requests
 Pillow
 
-# Required dependencies for Gemma models
-protobuf
-tiktoken
-sentencepiece>=0.2.0
-tokenizers
-regex
-
-# Required for Gemma 3n vision components
-timm
+# GGUF model support - optimized for HF Spaces memory constraints
+llama-cpp-python>=0.3.14
+
+# Optional: minimal transformers for fallback (much lighter than full transformers)
+# transformers[torch]
+
+# Database support for training job persistence
+# sqlite3 (included in Python standard library)
 
 # Optional: gradio for demo UI
 # gradio
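With transformers and torch dropped, a quick smoke test for the slimmed environment is to import the backend and list the FastAPI routes it registers; a minimal sketch (the printed paths depend on whatever endpoints gemma_gguf_backend actually defines, such as /train/status/{job_id} seen in the diff above):

# Smoke test for the trimmed requirements: the GGUF backend should import
# without transformers/torch, and its FastAPI routes should be registered.
from fastapi.routing import APIRoute

from gemma_gguf_backend import app  # same import app.py now uses

for route in app.routes:
    if isinstance(route, APIRoute):
        print(sorted(route.methods), route.path)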