Spaces:

Leon4gr45
/

docker_selfhosted

Runtime error

App Files Files Community

Leon4gr45 commited on 20 days ago

Commit

0442491

verified ·

1 Parent(s): fb40632

Create app.py

Browse files

Files changed (1) hide show

app.py +86 -0

app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+import gradio as gr
+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List
+from contextlib import asynccontextmanager
+from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
+# --- Configuration ---
+MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Llama-3.1-8B-Instruct")
+engine = None
+# --- Lifespan Manager for Model Loading ---
+# This is the correct way to load a model on startup in FastAPI.
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global engine
+    print(f"Lifespan startup: Loading model {MODEL_ID}...")
+    engine_args = AsyncEngineArgs(
+        model=MODEL_ID,
+        tokenizer="hf-internal-testing/llama-tokenizer",
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.90,
+        download_dir="/data/huggingface" # Cache directory inside the container
+    )
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
+    print("Model loading complete.")
+    yield
+    # Cleanup logic can be added here if needed
+# 1. Create the FastAPI app instance FIRST
+app = FastAPI(lifespan=lifespan)
+# --- API Data Models ---
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+class ChatCompletionRequest(BaseModel):
+    messages: List[ChatMessage]
+    model: str = MODEL_ID
+    temperature: float = 0.7
+    max_tokens: int = 1024
+# 2. Define the API endpoint on the FastAPI `app` object
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    if not engine:
+        return {"error": "Model is not ready or has failed to load."}, 503
+    user_prompt = request.messages[-1].content
+    sampling_params = SamplingParams(temperature=request.temperature, max_tokens=request.max_tokens)
+    request_id = f"api-{os.urandom(4).hex()}"
+    results_generator = engine.generate(user_prompt, sampling_params, request_id)
+    final_output = await results_generator
+    return {
+        "choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
+    }
+# 3. Create the Gradio UI in a separate object
+async def gradio_predict(prompt: str):
+    if not engine:
+        yield "Model is not ready. Please wait a few moments after startup."
+        return
+    sampling_params = SamplingParams(temperature=0.7, max_tokens=1024)
+    stream = engine.generate(prompt, sampling_params, f"gradio-req-{os.urandom(4).hex()}")
+    async for result in stream:
+        yield result.outputs[0].text
+gradio_ui = gr.Blocks()
+with gradio_ui:
+    gr.Markdown(f"# VLLM Server for {MODEL_ID}")
+    gr.Markdown("This UI and the `/v1/chat/completions` API are served from the same container.")
+    with gr.Row():
+        inp = gr.Textbox(lines=4, label="Input")
+        out = gr.Textbox(lines=10, label="Output", interactive=False)
+    btn = gr.Button("Generate")
+    btn.click(fn=gradio_predict, inputs=inp, outputs=out)
+# 4. Mount the Gradio UI onto the FastAPI app at the root path
+app = gr.mount_gradio_app(app, gradio_ui, path="/")