"""Minimal OpenAI-compatible chat completions server backed by a local GGUF model via llama-cpp-python."""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
from llama_cpp import Llama
import os
import uuid

app = FastAPI()

llm = None  # Will initialize on startup

class Message(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: float = 0.7
    max_tokens: int = 256

@app.on_event("startup")
def load_model():
    """Load the GGUF model once when the server starts."""
    global llm
    model_path = "phi-2.Q4_K_M.gguf"
    if not os.path.exists(model_path):
        raise RuntimeError(f"Model not found: {model_path}")
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,     # context window size in tokens
        n_threads=2     # CPU threads used for generation
    )

@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    # Plain (non-async) handler: FastAPI runs it in a worker thread, so the
    # blocking llama.cpp call does not stall the event loop.
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not initialized")

    # Flatten the chat history into a simple role-prefixed prompt.
    prompt = "\n".join(f"{m.role}: {m.content}" for m in req.messages) + "\nassistant:"
    output = llm(
        prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["user:", "assistant:"]  # cut generation before the model starts a new turn
    )
    text = output["choices"][0]["text"].strip()
    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": "stop"
        }],
        "model": req.model
    }
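
# Example client call (a sketch, not part of the server). It assumes this file is
# saved as main.py and served locally with `uvicorn main:app --port 8000`; the
# filename, port, and use of the `requests` library are assumptions, not anything
# defined in this file.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={"model": "phi-2", "messages": [{"role": "user", "content": "Hello"}]},
#   )
#   print(resp.json()["choices"][0]["message"]["content"])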