# app.py
import base64
import json
import os
import uuid
from pathlib import Path

import gradio as gr
import torch
# === Your vision-LLM stack (imported from src/… as organised earlier) ===
from src.llm.chat import FunctionCallingChat # wrapper around Llama-3.2-1B
chatbot = FunctionCallingChat() # load once at start-up
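# Note: a call such as chatbot("Please detect objects in the image 'static/x.png'.")
# (hypothetical path) returns a dict whose 'raw_tool_call' and 'results' keys are
# consumed by `inference` below.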
# -------- helpers --------------------------------------------------------
def image_to_base64(image_path: str) -> str:
    """Read an image file and return its contents as a base64-encoded string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
def save_uploaded_image(pil_img) -> Path:
    """Persist uploaded PIL image to ./static/ and return the file path."""
    Path("static").mkdir(exist_ok=True)
    filename = f"upload_{uuid.uuid4().hex[:8]}.png"
    path = Path("static") / filename
    pil_img.save(path)
    return path
# -------- inference ------------------------------------------------------
def inference(pil_img, prompt, task):
    """
    Run the LLM tool-calling pipeline on an uploaded image.

    • pil_img : uploaded PIL image
    • prompt  : optional free-form request
    • task    : "Detection" | "Segmentation" | "Auto"

    Returns a formatted text block with the raw LLM tool call and the
    JSON-dumped tool results.
    """
    if pil_img is None:
        return "❗ Please upload an image first."

    img_path = save_uploaded_image(pil_img)

    # Build the user message for the LLM
    if task == "Detection":
        user_msg = f"Please detect objects in the image '{img_path}'."
    elif task == "Segmentation":
        user_msg = f"Please segment objects in the image '{img_path}'."
    else:  # Auto / custom prompt
        prompt = prompt.strip() or "Analyse this image."
        user_msg = f"{prompt} (image: '{img_path}')"

    # Run chat → tool call → tool execution
    out = chatbot(user_msg)

    txt = (
        "### 🔧 Raw tool-call\n"
        f"{out['raw_tool_call']}\n\n"
        "### 📦 Tool results\n"
        f"{json.dumps(out['results'], indent=2)}"
    )
    return txt
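# Example (a minimal sketch, not part of the app flow): calling `inference`
# directly from a Python shell for a quick smoke test. Assumes Pillow is
# installed and that a sample image exists at the given path (hypothetical file):
#
#   from PIL import Image
#   sample = Image.open("static/sample.jpg")   # hypothetical sample image
#   print(inference(sample, "", "Detection"))  # prints the tool-call and results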
# -------- UI (unchanged shell) ------------------------------------------
def create_header():
    with gr.Row():
        with gr.Column(scale=1):
            # Embed the logo inline as a base64 data URI
            logo_base64 = image_to_base64("static/aivn_logo.png")
            gr.HTML(
                f'<img src="data:image/png;base64,{logo_base64}" '
                'alt="logo" style="max-height: 80px;">'
            )
        with gr.Column(scale=4):
            gr.Markdown(
                """
                LLM-driven Detection & Segmentation
                🔍 Using Llama 3.2-1B + YOLO + SAM