Spaces:

Leon4gr45
/

docker_selfhosted

Runtime error

Leon4gr45 committed 20 days ago

Commit

12530ec

verified ·

1 Parent(s): 7ad47ac

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,7 +12,6 @@ MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Llama-3.1-8B-Instruct")
 engine = None
 # --- Lifespan Manager for Model Loading ---
-# This is the correct way to load a model on startup in FastAPI.
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global engine
@@ -22,14 +21,13 @@ async def lifespan(app: FastAPI):
         tokenizer="hf-internal-testing/llama-tokenizer",
         tensor_parallel_size=1,
         gpu_memory_utilization=0.90,
-        download_dir="/data/huggingface" # Cache directory inside the container
     )
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     print("Model loading complete.")
     yield
-    # Cleanup logic can be added here if needed
-# 1. Create the FastAPI app instance FIRST
 app = FastAPI(lifespan=lifespan)
 # --- API Data Models ---
@@ -60,7 +58,7 @@ async def chat_completions(request: ChatCompletionRequest):
         "choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
     }
-# 3. Create the Gradio UI in a separate object
 async def gradio_predict(prompt: str):
     if not engine:
         yield "Model is not ready. Please wait a few moments after startup."
@@ -82,5 +80,5 @@ with gradio_ui:
     btn = gr.Button("Generate")
     btn.click(fn=gradio_predict, inputs=inp, outputs=out)
-# 4. Mount the Gradio UI onto the FastAPI app at the root path
 app = gr.mount_gradio_app(app, gradio_ui, path="/")

 engine = None
 # --- Lifespan Manager for Model Loading ---
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global engine
         tokenizer="hf-internal-testing/llama-tokenizer",
         tensor_parallel_size=1,
         gpu_memory_utilization=0.90,
+        download_dir="/data/huggingface"
     )
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     print("Model loading complete.")
     yield
+# 1. Create the FastAPI app instance
 app = FastAPI(lifespan=lifespan)
 # --- API Data Models ---
         "choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
     }
+# 3. Create the Gradio UI
 async def gradio_predict(prompt: str):
     if not engine:
         yield "Model is not ready. Please wait a few moments after startup."
     btn = gr.Button("Generate")
     btn.click(fn=gradio_predict, inputs=inp, outputs=out)
+# 4. Mount the Gradio UI onto the FastAPI app
 app = gr.mount_gradio_app(app, gradio_ui, path="/")