# app.py
import base64
import json
import os
import uuid
from pathlib import Path

import gradio as gr
import torch
# === Your vision-LLM stack (imported from src/… as organised earlier) ===
from src.llm.chat import FunctionCallingChat # wrapper around Llama-3.2-1B
chatbot = FunctionCallingChat() # load once at start-up
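# Note: a call such as chatbot("Please detect objects in the image 'static/x.png'.")
# (hypothetical path) returns a dict whose 'raw_tool_call' and 'results' keys are
# consumed by `inference` below.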
# -------- helpers --------------------------------------------------------
def image_to_base64(image_path: str) -> str:
    """Read an image file and return its contents as a base64-encoded string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
def save_uploaded_image(pil_img) -> Path:
    """Persist uploaded PIL image to ./static/ and return the file path."""
    Path("static").mkdir(exist_ok=True)
    filename = f"upload_{uuid.uuid4().hex[:8]}.png"
    path = Path("static") / filename
    pil_img.save(path)
    return path
# -------- inference ------------------------------------------------------
def inference(pil_img, prompt, task):
    """
    Run the LLM tool-calling pipeline on an uploaded image.

    • pil_img : uploaded PIL image
    • prompt  : optional free-form request
    • task    : "Detection" | "Segmentation" | "Auto"

    Returns a formatted text block with the raw LLM tool call and the
    JSON-dumped tool results.
    """
    if pil_img is None:
        return "❗ Please upload an image first."

    img_path = save_uploaded_image(pil_img)

    # Build the user message for the LLM
    if task == "Detection":
        user_msg = f"Please detect objects in the image '{img_path}'."
    elif task == "Segmentation":
        user_msg = f"Please segment objects in the image '{img_path}'."
    else:  # Auto / custom prompt
        prompt = prompt.strip() or "Analyse this image."
        user_msg = f"{prompt} (image: '{img_path}')"

    # Run chat → tool call → tool execution
    out = chatbot(user_msg)

    txt = (
        "### 🔧 Raw tool-call\n"
        f"{out['raw_tool_call']}\n\n"
        "### 📦 Tool results\n"
        f"{json.dumps(out['results'], indent=2)}"
    )
    return txt
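# Example (a minimal sketch, not part of the app flow): calling `inference`
# directly from a Python shell for a quick smoke test. Assumes Pillow is
# installed and that a sample image exists at the given path (hypothetical file):
#
#   from PIL import Image
#   sample = Image.open("static/sample.jpg")   # hypothetical sample image
#   print(inference(sample, "", "Detection"))  # prints the tool-call and results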
# -------- UI (unchanged shell) ------------------------------------------
def create_header():
    with gr.Row():
        with gr.Column(scale=1):
            # Embed the logo inline as a base64 data URI
            logo_base64 = image_to_base64("static/aivn_logo.png")
            gr.HTML(
                f'<img src="data:image/png;base64,{logo_base64}" '
                'alt="logo" style="max-height: 80px;">'
            )
        with gr.Column(scale=4):
            gr.Markdown(
                """
                LLM-driven Detection & Segmentation
                🔍 Using Llama 3.2-1B + YOLO + SAM