Dnfs committed · verified
Commit 0504f7a · 1 Parent(s): b24565b

Update app.py

Files changed (1): app.py (+23 −25)
app.py CHANGED
@@ -1,6 +1,6 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from ctransformers import AutoModelForCausalLM
+from llama_cpp import Llama
 import os
 import uvicorn
 from typing import Optional, List
@@ -17,34 +17,30 @@ model = None
 # Lifespan manager to load the model on startup
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # This code runs on startup
     global model
-    model_path = "./model"
-    model_file = "gema-4b-indra10k-model1-q4_k_m.gguf"
+    model_gguf_path = os.path.join("./model", "gema-4b-indra10k-model1-q4_k_m.gguf")

     try:
-        if not os.path.exists(model_path) or not os.path.exists(os.path.join(model_path, model_file)):
-            raise RuntimeError("Model files not found. Ensure the model was downloaded in the Docker build.")
+        if not os.path.exists(model_gguf_path):
+            raise RuntimeError(f"Model file not found at: {model_gguf_path}")

-        logger.info(f"Loading model from local path: {model_path}")
+        logger.info(f"Loading model from: {model_gguf_path}")

-        # FIX: Changed model_type from "llama" to "gemma"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            model_file=model_file,
-            model_type="gemma",  # This was the main cause of the error
-            gpu_layers=0,
-            context_length=2048,
-            threads=os.cpu_count() or 1
+        # Load the model using llama-cpp-python
+        model = Llama(
+            model_path=model_gguf_path,
+            n_ctx=2048,          # Context length
+            n_gpu_layers=0,      # Set to a positive number if GPU is available
+            n_threads=os.cpu_count() or 1,
+            verbose=True,
         )
-        logger.info("Model loaded successfully!")
+        logger.info("Model loaded successfully using llama-cpp-python!")
     except Exception as e:
         logger.error(f"Failed to load model: {e}")
-        # Raising an exception during startup will prevent the app from starting
         raise e

     yield
-    # This code runs on shutdown (optional)
+    # Cleanup code if needed on shutdown
     logger.info("Application is shutting down.")


@@ -70,26 +66,28 @@ class TextResponse(BaseModel)
 @app.post("/generate", response_model=TextResponse)
 async def generate_text(request: TextRequest):
     if model is None:
-        raise HTTPException(status_code=503, detail="Model is not ready or failed to load. Please check logs.")
+        raise HTTPException(status_code=503, detail="Model is not ready or failed to load.")

     try:
+        # Create prompt
         if request.system_prompt:
             full_prompt = f"{request.system_prompt}\n\nUser: {request.inputs}\nAssistant:"
         else:
             full_prompt = request.inputs

-        generated_text = model(
-            full_prompt,
-            max_new_tokens=request.max_tokens,
+        # Generate text using llama-cpp-python syntax
+        output = model(
+            prompt=full_prompt,
+            max_tokens=request.max_tokens,
             temperature=request.temperature,
             top_p=request.top_p,
             top_k=request.top_k,
-            repetition_penalty=request.repeat_penalty,
+            repeat_penalty=request.repeat_penalty,
             stop=request.stop or []
         )

-        if "Assistant:" in generated_text:
-            generated_text = generated_text.split("Assistant:")[-1].strip()
+        # Extract the generated text from the response structure
+        generated_text = output['choices'][0]['text'].strip()

         return TextResponse(generated_text=generated_text)
93