from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM
import os
import uvicorn
from typing import Optional, List
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Gema 4B Model API", version="1.0.0")


# Request model - flexible, accepts all common generation parameters.
class TextRequest(BaseModel):
    inputs: str
    system_prompt: Optional[str] = None
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.7
    top_k: Optional[int] = 50
    top_p: Optional[float] = 0.9
    repeat_penalty: Optional[float] = 1.1
    stop: Optional[List[str]] = None


# Response model
class TextResponse(BaseModel):
    generated_text: str


# Global model handle; populated once by the startup hook below.
model = None


# NOTE(review): @app.on_event is deprecated in recent FastAPI versions in
# favor of lifespan handlers — consider migrating when upgrading FastAPI.
@app.on_event("startup")
async def load_model():
    """Load the GGUF model once at process startup.

    Raises:
        Exception: re-raised so the server fails fast if the model
            cannot be loaded (a half-started server would 500 anyway).
    """
    global model
    try:
        logger.info("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            "Dnfs/gema-4b-indra10k-model1-Q4_K_M-GGUF",
            model_file="gema-4b-indra10k-model1-q4_k_m.gguf",
            model_type="llama",
            gpu_layers=0,  # Set to appropriate number if using GPU
            context_length=2048,
            threads=os.cpu_count(),
        )
        logger.info("Model loaded successfully!")
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        # Bare raise preserves the original traceback (better than `raise e`).
        raise


@app.post("/generate", response_model=TextResponse)
async def generate_text(request: TextRequest):
    """Generate a completion for ``request.inputs``.

    If ``system_prompt`` is provided the input is wrapped in a simple
    chat-style template ("User: ... / Assistant:"); otherwise the raw
    input is passed straight to the model.

    Raises:
        HTTPException: 500 if the model is not loaded or generation fails.
    """
    if model is None:
        raise HTTPException(status_code=500, detail="Model not loaded")

    try:
        # Build the prompt - use the system prompt if present, otherwise
        # forward the user input unchanged.
        if request.system_prompt:
            full_prompt = f"{request.system_prompt}\n\nUser: {request.inputs}\nAssistant:"
        else:
            full_prompt = request.inputs

        # Generate text with the parameters taken from the request.
        generated_text = model(
            full_prompt,
            max_new_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            repetition_penalty=request.repeat_penalty,
            stop=request.stop or [],
        )

        # Strip the chat scaffolding from the response if the model echoed it.
        if "Assistant:" in generated_text:
            generated_text = generated_text.split("Assistant:")[-1].strip()

        return TextResponse(generated_text=generated_text)
    except Exception as e:
        logger.error(f"Generation error: {e}")
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")


@app.get("/health")
async def health_check():
    """Liveness probe: reports server health and whether the model is loaded."""
    return {"status": "healthy", "model_loaded": model is not None}


@app.get("/")
async def root():
    """Landing endpoint pointing at the interactive API docs."""
    return {"message": "Gema 4B Model API", "docs": "/docs"}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")