The server below loads a quantized phi-2 GGUF model with llama-cpp-python and exposes an OpenAI-compatible /v1/chat/completions route:

import time

from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
from llama_cpp import Llama

app = FastAPI()

# Load the quantized model once at startup; n_ctx is the context window
# and n_threads should match the CPU cores available.
llm = Llama(
    model_path="phi-2.Q4_K_M.gguf",
    n_ctx=2048,
    n_threads=2
)

class Message(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: float = 0.7
    max_tokens: int = 256

@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    # Flatten the chat history into the plain "role: content" format
    # phi-2 can follow, then cue the model to answer as the assistant.
    prompt = "\n".join(f"{m.role}: {m.content}" for m in req.messages) + "\nassistant:"
    # llama-cpp-python blocks while generating, so this is a plain sync
    # handler; FastAPI runs it in a worker thread off the event loop.
    output = llm(
        prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["user:", "assistant:"]
    )
    text = output["choices"][0]["text"].strip()
    # Mirror the OpenAI chat-completion response shape so standard
    # clients can parse the reply; the id is a static placeholder.
    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": int(time.time()),
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": "stop"
        }],
        "model": req.model
    }
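
To exercise the endpoint, start the app with uvicorn and POST a standard chat-completion payload. A minimal sketch using requests; the filename app.py, host, and port are assumptions, not part of the listing above:

import requests

# Assumes the server was started with: uvicorn app:app --port 8000
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "phi-2",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "temperature": 0.7,
        "max_tokens": 64,
    },
    timeout=120,  # CPU generation can be slow, so allow a generous timeout
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

Because the response mirrors the OpenAI schema, any client that speaks that API shape should work the same way once pointed at this base URL.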