llm-apiku / app.py
Dnfs's picture
Update app.py
778eaf5 verified
raw
history blame
2.99 kB
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM
import os
import uvicorn
from typing import Optional, List
import logging
# Set up logging so startup and generation events appear in server output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="llm-apiku", version="1.0.0")
# Request model - flexible, accepts all common generation parameters.
class TextRequest(BaseModel):
    """Body of POST /generate; mirrors the usual llama.cpp sampling knobs."""
    inputs: str  # user prompt (required)
    system_prompt: Optional[str] = None  # if set, wrapped into a User/Assistant chat template
    max_tokens: Optional[int] = 512  # generation cap, forwarded as max_new_tokens
    temperature: Optional[float] = 0.7  # sampling temperature
    top_k: Optional[int] = 50  # top-k sampling cutoff
    top_p: Optional[float] = 0.9  # nucleus sampling cutoff
    repeat_penalty: Optional[float] = 1.1  # forwarded as repetition_penalty
    stop: Optional[List[str]] = None  # stop sequences; replaced by [] at call time
# Response model
class TextResponse(BaseModel):
    """Body returned by POST /generate."""
    generated_text: str  # model output, with any "Assistant:" chat scaffolding stripped
# Global model handle: None until the startup hook load_model() succeeds;
# endpoints treat None as "not ready".
model = None
@app.on_event("startup")
async def load_model():
    """Load the GGUF model into the module-level ``model`` at app startup.

    Re-raises any loading failure so the server refuses to start with a
    broken model instead of serving 5xx responses forever.
    """
    global model
    try:
        logger.info("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            "Dnfs/gema-4b-indra10k-model1-Q4_K_M-GGUF",
            model_file="gema-4b-indra10k-model1-q4_k_m.gguf",
            model_type="llama",
            gpu_layers=0,  # CPU-only; set to an appropriate number if using GPU
            context_length=2048,
            threads=os.cpu_count(),
        )
        logger.info("Model loaded successfully!")
    except Exception as e:
        # logger.exception records the full traceback; a bare `raise`
        # preserves the original traceback (the previous `raise e`
        # re-pointed it at the except clause).
        logger.exception("Failed to load model: %s", e)
        raise
@app.post("/generate", response_model=TextResponse)
def generate_text(request: TextRequest):
    """Generate a completion for ``request.inputs``.

    Declared as plain ``def`` (not ``async def``) so FastAPI runs it in its
    worker threadpool: the ctransformers call is blocking CPU work and would
    otherwise stall the event loop for every concurrent client.

    Raises:
        HTTPException: 503 if the model has not loaded, 500 if generation fails.
    """
    if model is None:
        # 503 Service Unavailable: the server is up but not ready yet
        # (startup load still running or failed).
        raise HTTPException(status_code=503, detail="Model not loaded")
    try:
        full_prompt = _build_prompt(request)
        # Generate text with the parameters from the request.
        generated_text = model(
            full_prompt,
            max_new_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            repetition_penalty=request.repeat_penalty,
            stop=request.stop or [],
        )
        # Strip everything up to the last "Assistant:" marker so the chat
        # scaffolding is not echoed back to the caller.
        if "Assistant:" in generated_text:
            generated_text = generated_text.split("Assistant:")[-1].strip()
        return TextResponse(generated_text=generated_text)
    except Exception as e:
        logger.exception("Generation error: %s", e)
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")


def _build_prompt(request: TextRequest) -> str:
    """Wrap the user input in a simple chat template when a system prompt is set."""
    if request.system_prompt:
        return f"{request.system_prompt}\n\nUser: {request.inputs}\nAssistant:"
    return request.inputs
@app.get("/health")
async def health_check():
    """Liveness/readiness probe: reports whether the model finished loading."""
    loaded = model is not None
    return {"status": "healthy", "model_loaded": loaded}
@app.get("/")
async def root():
    """Landing endpoint pointing clients at the interactive API docs."""
    info = {"message": "Gema 4B Model API", "docs": "/docs"}
    return info
if __name__ == "__main__":
    # Development entry point: `python app.py` serves on all interfaces.
    host, port = "0.0.0.0", 8000
    uvicorn.run(app, host=host, port=port, log_level="info")