from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Optional
from llama_cpp import Llama
import os
import uuid

app = FastAPI()

# Loaded by load_model(); stays None until the model is initialized.
llm = None

class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 256
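
# Illustrative request body for ChatRequest (the model name is a placeholder;
# this server echoes it back without validating it):
#
#   {
#     "model": "local-gguf",
#     "messages": [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "Hello!"}
#     ],
#     "temperature": 0.7,
#     "max_tokens": 256
#   }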

def load_model():
    """Load the GGUF model whose path is stored in /tmp/model_path.txt."""
    global llm
    model_path_file = "/tmp/model_path.txt"
    if not os.path.exists(model_path_file):
        raise RuntimeError(f"Model path file not found: {model_path_file}")
    with open(model_path_file, "r") as f:
        model_path = f.read().strip()
    if not os.path.exists(model_path):
        raise RuntimeError(f"Model not found at path: {model_path}")
    llm = Llama(model_path=model_path)
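
# Assumed wiring (a minimal sketch): initialize the model once at application
# startup so the chat handler finds a ready llm.
@app.on_event("startup")
async def startup_event():
    load_model()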

# Route paths are assumed from the handler names; "/api/tags" mirrors
# Ollama's model-listing endpoint.
@app.get("/")
async def root():
    return {"message": "API is running"}


@app.get("/api/tags")
async def api_tags():
    return []


# Assumed chat route; the handler accepts an Ollama/OpenAI-style message list.
@app.post("/api/chat")
async def chat(req: ChatRequest):
    global llm
    if llm is None:
        return {"error": "Model not initialized."}
    # Flatten the conversation into a plain-text prompt with role prefixes,
    # e.g. "user: Hello\nassistant:".
    prompt = ""
    for m in req.messages:
        prompt += f"{m.role}: {m.content}\n"
    prompt += "assistant:"
    output = llm(
        prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["user:", "assistant:"],
    )
    text = output.get("choices", [{}])[0].get("text", "").strip()
    response = {
        "id": str(uuid.uuid4()),
        "model": req.model,
        "choices": [
            {
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
    }
    return response
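
# Minimal usage sketch. The port, module name ("app.py"), and model name are
# placeholders/assumptions, not values defined above.
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
#   curl -X POST http://localhost:8000/api/chat \
#     -H "Content-Type: application/json" \
#     -d '{"model": "local-gguf", "messages": [{"role": "user", "content": "Hello!"}]}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)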