Spaces:
Runtime error
Runtime error
File size: 2,788 Bytes
0442491 12530ec 0442491 12530ec 0442491 12530ec 0442491 12530ec 0442491 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import os
import gradio as gr
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
from contextlib import asynccontextmanager
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
# --- Configuration ---
MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Llama-3.1-8B-Instruct")
engine = None
# --- Lifespan Manager for Model Loading ---
@asynccontextmanager
async def lifespan(app: FastAPI):
global engine
print(f"Lifespan startup: Loading model {MODEL_ID}...")
engine_args = AsyncEngineArgs(
model=MODEL_ID,
tokenizer="hf-internal-testing/llama-tokenizer",
tensor_parallel_size=1,
gpu_memory_utilization=0.90,
download_dir="/data/huggingface"
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
print("Model loading complete.")
yield
# 1. Create the FastAPI app instance
app = FastAPI(lifespan=lifespan)
# --- API Data Models ---
class ChatMessage(BaseModel):
role: str
content: str
class ChatCompletionRequest(BaseModel):
messages: List[ChatMessage]
model: str = MODEL_ID
temperature: float = 0.7
max_tokens: int = 1024
# 2. Define the API endpoint on the FastAPI `app` object
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
if not engine:
return {"error": "Model is not ready or has failed to load."}, 503
user_prompt = request.messages[-1].content
sampling_params = SamplingParams(temperature=request.temperature, max_tokens=request.max_tokens)
request_id = f"api-{os.urandom(4).hex()}"
results_generator = engine.generate(user_prompt, sampling_params, request_id)
final_output = await results_generator
return {
"choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
}
# 3. Create the Gradio UI
async def gradio_predict(prompt: str):
if not engine:
yield "Model is not ready. Please wait a few moments after startup."
return
sampling_params = SamplingParams(temperature=0.7, max_tokens=1024)
stream = engine.generate(prompt, sampling_params, f"gradio-req-{os.urandom(4).hex()}")
async for result in stream:
yield result.outputs[0].text
gradio_ui = gr.Blocks()
with gradio_ui:
gr.Markdown(f"# VLLM Server for {MODEL_ID}")
gr.Markdown("This UI and the `/v1/chat/completions` API are served from the same container.")
with gr.Row():
inp = gr.Textbox(lines=4, label="Input")
out = gr.Textbox(lines=10, label="Output", interactive=False)
btn = gr.Button("Generate")
btn.click(fn=gradio_predict, inputs=inp, outputs=out)
# 4. Mount the Gradio UI onto the FastAPI app
app = gr.mount_gradio_app(app, gradio_ui, path="/") |