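"""Minimal FastAPI server that wraps a local llama.cpp (GGUF) model and exposes
a small Ollama-style chat API. The model path is read from /tmp/model_path.txt
when the app starts."""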
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Optional
from llama_cpp import Llama
import os
import time
import uuid

app = FastAPI()

# Global model handle, populated by load_model() at startup.
llm = None
class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 256
@app.on_event("startup")
def load_model():
    """Load the GGUF model whose path is stored in /tmp/model_path.txt."""
    global llm
    model_path_file = "/tmp/model_path.txt"
    if not os.path.exists(model_path_file):
        raise RuntimeError(f"Model path file not found: {model_path_file}")
    with open(model_path_file, "r") as f:
        model_path = f.read().strip()
    if not os.path.exists(model_path):
        raise RuntimeError(f"Model not found at path: {model_path}")
    llm = Llama(model_path=model_path)
@app.get("/")
async def root():
    return {"message": "API is running"}


@app.get("/api/tags")
async def api_tags():
    # Minimal stub for clients that probe the Ollama-style tags endpoint.
    return []
@app.post("/chat")
async def chat(req: ChatRequest):
    global llm
    if llm is None:
        return {"error": "Model not initialized."}

    # Build the prompt from the messages; Ollama uses system/user/assistant roles.
    # e.g. [{"role": "user", "content": "Hi"}] becomes "user: Hi\nassistant:"
    prompt = ""
    for m in req.messages:
        prompt += f"{m.role}: {m.content}\n"
    prompt += "assistant:"

    output = llm(
        prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["user:", "assistant:"],
    )
    text = output.get("choices", [{}])[0].get("text", "").strip()
    response = {
        "id": str(uuid.uuid4()),
        "model": req.model,
        "choices": [
            {
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
    }
    return response
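
# Example client call against the /chat endpoint defined above. This is an
# illustrative sketch only: the host, port, and `uvicorn app:app` invocation are
# assumptions about how the file is served and are not part of this file.
#
#   import json, urllib.request
#
#   body = json.dumps({
#       "model": "local",
#       "messages": [{"role": "user", "content": "Hello"}],
#   }).encode()
#   req = urllib.request.Request(
#       "http://localhost:8000/chat",
#       data=body,
#       headers={"Content-Type": "application/json"},
#   )
#   print(json.load(urllib.request.urlopen(req)))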