Dnfs committed
Commit b24565b · verified · 1 Parent(s): 4af78cc

Update app.py

Files changed (1)
  1. app.py +37 -32
app.py CHANGED
@@ -5,35 +5,20 @@ import os
 import uvicorn
 from typing import Optional, List
 import logging
+from contextlib import asynccontextmanager
 
-# Set up loggings
+# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-app = FastAPI(title="Gema 4B Model API", version="1.0.0")
-
-# Request model
-class TextRequest(BaseModel):
-    inputs: str
-    system_prompt: Optional[str] = None
-    max_tokens: Optional[int] = 512
-    temperature: Optional[float] = 0.7
-    top_k: Optional[int] = 50
-    top_p: Optional[float] = 0.9
-    repeat_penalty: Optional[float] = 1.1
-    stop: Optional[List[str]] = None
-
-# Response model
-class TextResponse(BaseModel):
-    generated_text: str
-
 # Global model variable
 model = None
 
-@app.on_event("startup")
-async def load_model():
+# Lifespan manager to load the model on startup
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # This code runs on startup
     global model
-    # Define the local model path
     model_path = "./model"
     model_file = "gema-4b-indra10k-model1-q4_k_m.gguf"
 
@@ -42,11 +27,12 @@ async def load_model():
         raise RuntimeError("Model files not found. Ensure the model was downloaded in the Docker build.")
 
     logger.info(f"Loading model from local path: {model_path}")
-    # Load the model from the local directory downloaded during the Docker build
+
+    # FIX: Changed model_type from "llama" to "gemma"
     model = AutoModelForCausalLM.from_pretrained(
-        model_path,             # Load from the local folder
-        model_file=model_file,  # Specify the GGUF file name
-        model_type="llama",
+        model_path,
+        model_file=model_file,
+        model_type="gemma",  # This was the main cause of the error
         gpu_layers=0,
         context_length=2048,
         threads=os.cpu_count() or 1
@@ -54,22 +40,44 @@ async def load_model():
         logger.info("Model loaded successfully!")
     except Exception as e:
         logger.error(f"Failed to load model: {e}")
-        # Raising the exception will prevent the app from starting if the model fails to load
+        # Raising an exception during startup will prevent the app from starting
         raise e
+
+    yield
+    # This code runs on shutdown (optional)
+    logger.info("Application is shutting down.")
+
+
+app = FastAPI(title="Gema 4B Model API", version="1.0.0", lifespan=lifespan)
+
+
+# Request model
+class TextRequest(BaseModel):
+    inputs: str
+    system_prompt: Optional[str] = None
+    max_tokens: Optional[int] = 512
+    temperature: Optional[float] = 0.7
+    top_k: Optional[int] = 50
+    top_p: Optional[float] = 0.9
+    repeat_penalty: Optional[float] = 1.1
+    stop: Optional[List[str]] = None
+
+# Response model
+class TextResponse(BaseModel):
+    generated_text: str
+
 
 @app.post("/generate", response_model=TextResponse)
 async def generate_text(request: TextRequest):
     if model is None:
-        raise HTTPException(status_code=503, detail="Model is not ready or failed to load. Please try again later.")
+        raise HTTPException(status_code=503, detail="Model is not ready or failed to load. Please check logs.")
 
     try:
-        # Create prompt
         if request.system_prompt:
             full_prompt = f"{request.system_prompt}\n\nUser: {request.inputs}\nAssistant:"
         else:
             full_prompt = request.inputs
 
-        # Generate text with parameters from the request
         generated_text = model(
             full_prompt,
             max_new_tokens=request.max_tokens,
@@ -80,7 +88,6 @@ async def generate_text(request: TextRequest):
             stop=request.stop or []
         )
 
-        # Clean up the response
         if "Assistant:" in generated_text:
            generated_text = generated_text.split("Assistant:")[-1].strip()
 
@@ -92,8 +99,6 @@ async def generate_text(request: TextRequest):
 
 @app.get("/health")
 async def health_check():
-    # The health check now also implicitly checks if the model has been loaded
-    # because a failure in load_model will stop the app from running.
     return {"status": "healthy", "model_loaded": model is not None}
 
 @app.get("/")