Leon4gr45 committed on
Commit
12530ec
·
verified ·
1 Parent(s): 7ad47ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -6
app.py CHANGED
@@ -12,7 +12,6 @@ MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Llama-3.1-8B-Instruct")
12
  engine = None
13
 
14
  # --- Lifespan Manager for Model Loading ---
15
- # This is the correct way to load a model on startup in FastAPI.
16
  @asynccontextmanager
17
  async def lifespan(app: FastAPI):
18
  global engine
@@ -22,14 +21,13 @@ async def lifespan(app: FastAPI):
22
  tokenizer="hf-internal-testing/llama-tokenizer",
23
  tensor_parallel_size=1,
24
  gpu_memory_utilization=0.90,
25
- download_dir="/data/huggingface" # Cache directory inside the container
26
  )
27
  engine = AsyncLLMEngine.from_engine_args(engine_args)
28
  print("Model loading complete.")
29
  yield
30
- # Cleanup logic can be added here if needed
31
 
32
- # 1. Create the FastAPI app instance FIRST
33
  app = FastAPI(lifespan=lifespan)
34
 
35
  # --- API Data Models ---
@@ -60,7 +58,7 @@ async def chat_completions(request: ChatCompletionRequest):
60
  "choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
61
  }
62
 
63
- # 3. Create the Gradio UI in a separate object
64
  async def gradio_predict(prompt: str):
65
  if not engine:
66
  yield "Model is not ready. Please wait a few moments after startup."
@@ -82,5 +80,5 @@ with gradio_ui:
82
  btn = gr.Button("Generate")
83
  btn.click(fn=gradio_predict, inputs=inp, outputs=out)
84
 
85
- # 4. Mount the Gradio UI onto the FastAPI app at the root path
86
  app = gr.mount_gradio_app(app, gradio_ui, path="/")
 
12
  engine = None
13
 
14
  # --- Lifespan Manager for Model Loading ---
 
15
  @asynccontextmanager
16
  async def lifespan(app: FastAPI):
17
  global engine
 
21
  tokenizer="hf-internal-testing/llama-tokenizer",
22
  tensor_parallel_size=1,
23
  gpu_memory_utilization=0.90,
24
+ download_dir="/data/huggingface"
25
  )
26
  engine = AsyncLLMEngine.from_engine_args(engine_args)
27
  print("Model loading complete.")
28
  yield
 
29
 
30
+ # 1. Create the FastAPI app instance
31
  app = FastAPI(lifespan=lifespan)
32
 
33
  # --- API Data Models ---
 
58
  "choices": [{"message": {"role": "assistant", "content": final_output.outputs[0].text}}]
59
  }
60
 
61
+ # 3. Create the Gradio UI
62
  async def gradio_predict(prompt: str):
63
  if not engine:
64
  yield "Model is not ready. Please wait a few moments after startup."
 
80
  btn = gr.Button("Generate")
81
  btn.click(fn=gradio_predict, inputs=inp, outputs=out)
82
 
83
+ # 4. Mount the Gradio UI onto the FastAPI app
84
  app = gr.mount_gradio_app(app, gradio_ui, path="/")