Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,6 @@ MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Llama-3.1-8B-Instruct")
|
|
12 |
engine = None
|
13 |
|
14 |
# --- Lifespan Manager for Model Loading ---
|
15 |
-
# This is the correct way to load a model on startup in FastAPI.
|
16 |
@asynccontextmanager
|
17 |
async def lifespan(app: FastAPI):
|
18 |
global engine
|
@@ -22,14 +21,13 @@ async def lifespan(app: FastAPI):
|
|
22 |
tokenizer="hf-internal-testing/llama-tokenizer",
|
23 |
tensor_parallel_size=1,
|
24 |
gpu_memory_utilization=0.90,
|
25 |
-
download_dir="/data/huggingface"
|
26 |
)
|
27 |
engine = AsyncLLMEngine.from_engine_args(engine_args)
|
28 |
print("Model loading complete.")
|
29 |
yield
|
30 |
-
# Cleanup logic can be added here if needed
|
31 |
|
32 |
-
# 1. Create the FastAPI app instance
|
33 |
app = FastAPI(lifespan=lifespan)
|
34 |
|
35 |
# --- API Data Models ---
|
@@ -60,7 +58,7 @@ async def chat_completions(request: ChatCompletionRequest):
|
|
60 |
"choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
|
61 |
}
|
62 |
|
63 |
-
# 3. Create the Gradio UI
|
64 |
async def gradio_predict(prompt: str):
|
65 |
if not engine:
|
66 |
yield "Model is not ready. Please wait a few moments after startup."
|
@@ -82,5 +80,5 @@ with gradio_ui:
|
|
82 |
btn = gr.Button("Generate")
|
83 |
btn.click(fn=gradio_predict, inputs=inp, outputs=out)
|
84 |
|
85 |
-
# 4. Mount the Gradio UI onto the FastAPI app
|
86 |
app = gr.mount_gradio_app(app, gradio_ui, path="/")
|
|
|
12 |
engine = None
|
13 |
|
14 |
# --- Lifespan Manager for Model Loading ---
|
|
|
15 |
@asynccontextmanager
|
16 |
async def lifespan(app: FastAPI):
|
17 |
global engine
|
|
|
21 |
tokenizer="hf-internal-testing/llama-tokenizer",
|
22 |
tensor_parallel_size=1,
|
23 |
gpu_memory_utilization=0.90,
|
24 |
+
download_dir="/data/huggingface"
|
25 |
)
|
26 |
engine = AsyncLLMEngine.from_engine_args(engine_args)
|
27 |
print("Model loading complete.")
|
28 |
yield
|
|
|
29 |
|
30 |
+
# 1. Create the FastAPI app instance
|
31 |
app = FastAPI(lifespan=lifespan)
|
32 |
|
33 |
# --- API Data Models ---
|
|
|
58 |
"choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
|
59 |
}
|
60 |
|
61 |
+
# 3. Create the Gradio UI
|
62 |
async def gradio_predict(prompt: str):
|
63 |
if not engine:
|
64 |
yield "Model is not ready. Please wait a few moments after startup."
|
|
|
80 |
btn = gr.Button("Generate")
|
81 |
btn.click(fn=gradio_predict, inputs=inp, outputs=out)
|
82 |
|
83 |
+
# 4. Mount the Gradio UI onto the FastAPI app
|
84 |
app = gr.mount_gradio_app(app, gradio_ui, path="/")
|