ndc8 committed on
Commit 358e717 · 1 Parent(s): 994c0b4

Update Dockerfile and application entry point for GGUF backend; optimize memory usage in model parameters and requirements

Files changed (4)
  1. Dockerfile +2 -2
  2. app.py +4 -3
  3. gemma_gguf_backend.py +23 -15
  4. requirements.txt +8 -11
Dockerfile CHANGED
@@ -27,5 +27,5 @@ COPY --chown=user . .
 # Expose port 7860 (HF Spaces default)
 EXPOSE 7860
 
-# Command to run the application
-CMD ["python", "-m", "uvicorn", "backend_service:app", "--host", "0.0.0.0", "--port", "7860"]
+# Command to run the GGUF backend application
+CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """
-Entry point for Hugging Face Spaces
-This file imports and runs the FastAPI application from backend_service
+Entry point for Hugging Face Spaces - GGUF Backend
+This file imports and runs the GGUF FastAPI application optimized for memory constraints
 """
 
-from backend_service import app
+# Import the GGUF backend instead of the transformers backend
+from gemma_gguf_backend import app
 
 if __name__ == "__main__":
     import uvicorn
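For reference, a minimal sketch of the full entry point after this change. The hunk ends at "import uvicorn", so the uvicorn.run arguments below are an assumption; port 7860 is taken from the Dockerfile's EXPOSE line.

#!/usr/bin/env python3
"""
Entry point for Hugging Face Spaces - GGUF Backend
This file imports and runs the GGUF FastAPI application optimized for memory constraints
"""

# Import the GGUF backend instead of the transformers backend
from gemma_gguf_backend import app

if __name__ == "__main__":
    import uvicorn

    # Illustrative assumption: serve on all interfaces at the HF Spaces default port.
    uvicorn.run(app, host="0.0.0.0", port=7860)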
gemma_gguf_backend.py CHANGED
@@ -52,7 +52,7 @@ class ChatMessage(BaseModel):
 class ChatCompletionRequest(BaseModel):
     model: str = Field(default="gemma-3n-e4b-it", description="The model to use for completion")
     messages: List[ChatMessage] = Field(..., description="List of messages in the conversation")
-    max_tokens: Optional[int] = Field(default=512, ge=1, le=2048, description="Maximum tokens to generate")
+    max_tokens: Optional[int] = Field(default=256, ge=1, le=1024, description="Maximum tokens to generate (reduced for memory efficiency)")
     temperature: Optional[float] = Field(default=1.0, ge=0.0, le=2.0, description="Sampling temperature")
     top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0, description="Top-p sampling")
     top_k: Optional[int] = Field(default=64, ge=1, le=100, description="Top-k sampling")
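The tightened max_tokens bounds are enforced by Pydantic before any generation happens. A minimal sketch of that validation behavior, using a stripped-down stand-in model rather than the repository's actual ChatCompletionRequest:

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

# Stand-in showing only the max_tokens constraint introduced in this commit.
class _MaxTokensOnly(BaseModel):
    max_tokens: Optional[int] = Field(default=256, ge=1, le=1024)

print(_MaxTokensOnly().max_tokens)                  # 256, the new default
print(_MaxTokensOnly(max_tokens=1024).max_tokens)   # accepted, exactly at the cap

try:
    _MaxTokensOnly(max_tokens=2048)                 # allowed before, rejected now
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["type"])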
 
@@ -121,20 +121,25 @@ async def lifespan(app: FastAPI):
     try:
         logger.info(f"📥 Loading Gemma 3n GGUF model from {current_model}...")
 
-        # Configure model parameters for Gemma 3n
+        # Configure model parameters optimized for HF Spaces memory constraints
         llm = Llama.from_pretrained(
             repo_id=current_model,
-            filename="*Q4_K_M.gguf",  # Use Q4_K_M quantization for good performance
+            filename="*Q4_0.gguf",  # Use Q4_0 instead of Q4_K_M for lower memory usage
             verbose=True,
-            # Gemma 3n specific settings
-            n_ctx=4096,  # Start with 4K context (can be increased to 32K)
-            n_threads=4,  # Adjust based on your CPU
-            n_gpu_layers=-1,  # Use all GPU layers if CUDA available, otherwise CPU
+            # Memory-optimized settings for HF Spaces
+            n_ctx=2048,  # Reduced context length to save memory (was 4096)
+            n_threads=2,  # Fewer threads for lower memory usage (was 4)
+            n_gpu_layers=0,  # Force CPU-only to avoid GPU memory issues
+            # Additional memory optimizations
+            n_batch=512,  # Smaller batch size to reduce memory peaks
+            use_mmap=True,  # Use memory mapping to reduce RAM usage
+            use_mlock=False,  # Don't lock memory pages
+            low_vram=True,  # Enable low VRAM mode for additional memory savings
            # Chat template for Gemma 3n format
             chat_format="gemma",  # Try built-in gemma format first
         )
 
-        logger.info("✅ Successfully loaded Gemma 3n GGUF model")
+        logger.info("✅ Successfully loaded Gemma 3n GGUF model with memory optimizations")
 
     except Exception as e:
         logger.error(f"❌ Failed to initialize Gemma 3n model: {e}")
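The n_ctx reduction is the main lever here: KV-cache memory grows linearly with context length, so halving n_ctx roughly halves that allocation. A back-of-the-envelope sketch with placeholder architecture numbers, not the real Gemma 3n E4B configuration; only the formula and the 4096 to 2048 change come from the diff:

# Rough illustration of why a smaller n_ctx saves memory.
# n_layers, n_kv_heads, head_dim, and bytes_per_val below are placeholders.
def kv_cache_bytes(n_ctx, n_layers=30, n_kv_heads=8, head_dim=128, bytes_per_val=2):
    # K and V caches: one entry per layer, per KV head, per context position.
    return 2 * n_layers * n_kv_heads * head_dim * bytes_per_val * n_ctx

for n_ctx in (4096, 2048):
    print(f"n_ctx={n_ctx}: ~{kv_cache_bytes(n_ctx) / 2**20:.0f} MiB KV cache")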
 
@@ -172,12 +177,15 @@ def ensure_model_ready():
     # For demo mode, we'll allow the service to run even without a model
     pass
 
-def generate_response_gguf(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 1.0, top_p: float = 0.95, top_k: int = 64) -> str:
-    """Generate response using GGUF model via llama-cpp-python."""
+def generate_response_gguf(messages: List[ChatMessage], max_tokens: int = 256, temperature: float = 1.0, top_p: float = 0.95, top_k: int = 64) -> str:
+    """Generate response using GGUF model via llama-cpp-python (memory-optimized)."""
     if llm is None:
         # Demo mode response
         return "🤖 Demo mode: Gemma 3n model not loaded. This would be a real response from the Gemma 3n model. Please download the GGUF model from https://huggingface.co/unsloth/gemma-3n-E4B-it-GGUF"
 
+    # Limit max_tokens for memory efficiency on HF Spaces
+    max_tokens = min(max_tokens, 512)  # Cap at 512 tokens max
+
     try:
         # Use the chat completion method if available
         if hasattr(llm, 'create_chat_completion'):
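The hunk assumes llm exposes llama-cpp-python's create_chat_completion API. A minimal sketch of that call shape with the same 512-token cap; the ChatMessage field names (role, content) and the surrounding error handling are assumptions, since they sit outside this hunk:

# Sketch of the llama-cpp-python chat-completion call shape assumed above;
# `llm` is a loaded llama_cpp.Llama instance.
def _chat_once(llm, messages, max_tokens=256, temperature=1.0, top_p=0.95, top_k=64):
    capped = min(max_tokens, 512)  # same cap as generate_response_gguf
    result = llm.create_chat_completion(
        messages=[{"role": m.role, "content": m.content} for m in messages],
        max_tokens=capped,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
    )
    # llama-cpp-python returns an OpenAI-style completion dict
    return result["choices"][0]["message"]["content"]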
 
@@ -451,15 +459,15 @@ def start_training(req: StartTrainingRequest):
         proc = _start_training_subprocess(job_id, req.model_dump())
         TRAIN_JOBS[job_id]["status"] = "running"
         TRAIN_JOBS[job_id]["pid"] = proc.pid
-        save_job(job_id)
+        save_job(job_id)
         watcher = threading.Thread(target=_watch_process, args=(job_id, proc), daemon=True)
         watcher.start()
         return StartTrainingResponse(job_id=job_id, status="running", output_dir=output_dir)
     except Exception as e:
-        logger.exception("Failed to start training job")
-        TRAIN_JOBS[job_id]["status"] = "failed_to_start"
-        save_job(job_id)
-        raise HTTPException(status_code=500, detail=f"Failed to start training: {e}")
+        logger.exception("Failed to start training job")
+        TRAIN_JOBS[job_id]["status"] = "failed_to_start"
+        save_job(job_id)
+        raise HTTPException(status_code=500, detail=f"Failed to start training: {e}")
 
 @app.get("/train/status/{job_id}", response_model=TrainStatusResponse)
 def train_status(job_id: str):
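save_job(job_id) is called on both the success and failure paths so the persisted record matches the in-memory status. Its implementation is not part of this diff; a hypothetical SQLite-backed version, suggested by the sqlite3 note added to requirements.txt (the real helper, table name, and schema may differ), could look like:

import json
import sqlite3

TRAIN_JOBS = {}  # in-memory job registry, as used by start_training above

def save_job(job_id: str, db_path: str = "train_jobs.db") -> None:
    """Hypothetical persistence helper: upsert one TRAIN_JOBS entry into SQLite."""
    job = TRAIN_JOBS[job_id]
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS train_jobs (job_id TEXT PRIMARY KEY, data TEXT)"
        )
        conn.execute(
            "INSERT INTO train_jobs (job_id, data) VALUES (?, ?) "
            "ON CONFLICT(job_id) DO UPDATE SET data = excluded.data",
            (job_id, json.dumps(job, default=str)),
        )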
requirements.txt CHANGED
@@ -1,24 +1,21 @@
 
 
-# Hugging Face Spaces requirements (transformers backend only)
+# Hugging Face Spaces requirements (GGUF backend with llama-cpp-python)
 fastapi
 uvicorn
-transformers>=4.53.0
-torch
 python-dotenv
 httpx
 requests
 Pillow
 
-# Required dependencies for Gemma models
-protobuf
-tiktoken
-sentencepiece>=0.2.0
-tokenizers
-regex
-
-# Required for Gemma 3n vision components
-timm
+# GGUF model support - optimized for HF Spaces memory constraints
+llama-cpp-python>=0.3.14
+
+# Optional: minimal transformers for fallback (much lighter than full transformers)
+# transformers[torch]
+
+# Database support for training job persistence
+# sqlite3 (included in Python standard library)
 
 # Optional: gradio for demo UI
 # gradio
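With transformers and torch dropped, a quick smoke test for the slimmed environment is to import the backend and list the FastAPI routes it registers; a minimal sketch (the printed paths depend on whatever endpoints gemma_gguf_backend actually defines, such as /train/status/{job_id} seen in the diff above):

# Smoke test for the trimmed requirements: the GGUF backend should import
# without transformers/torch, and its FastAPI routes should be registered.
from fastapi.routing import APIRoute

from gemma_gguf_backend import app  # same import app.py now uses

for route in app.routes:
    if isinstance(route, APIRoute):
        print(sorted(route.methods), route.path)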