from typing import List

import uvicorn
from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# Load a small quantized model (e.g., Phi-2 or DeepSeek)
llm = Llama(model_path="phi-2.Q4_K_M.gguf", n_ctx=2048, n_threads=2)


class Message(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: float = 0.7
    max_tokens: int = 256
    stream: bool = False  # accepted for API compatibility; streaming is not implemented here


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    # Flatten the chat history into a plain-text prompt the base model can complete
    prompt = "\n".join(f"{m.role}: {m.content}" for m in req.messages) + "\nassistant:"
    output = llm(
        prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["user:", "assistant:"],  # stop before the model invents the next turn
    )
    text = output["choices"][0]["text"]
    # Return a minimal OpenAI-style chat completion response
    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text.strip()},
            "finish_reason": "stop",
        }],
        "model": req.model,
    }


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
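Because the endpoint mimics OpenAI's chat completions schema, you can smoke-test it with any HTTP client once the server is running on port 8000. The snippet below is a minimal sketch using the requests library; the model string "phi-2" is just a placeholder, since this server echoes back whatever model name the client sends.

import requests

# Assumes the server above is running locally on port 8000
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "phi-2",  # placeholder; the server echoes this value back
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello in one sentence."},
        ],
        "temperature": 0.7,
        "max_tokens": 64,
    },
    timeout=120,  # generous timeout: CPU inference on 2 threads can be slow
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])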