first init
- .gitignore +12 -0
- app.py +134 -0
- requirements.txt +7 -0
- src/chat.py +0 -0
- src/llm/__init__.py +0 -0
- src/llm/chat.py +56 -0
- src/tools/__init__.py +36 -0
- src/tools/detection_model.py +26 -0
- src/tools/segmentation_model.py +14 -0
- static/aivn_logo.png +0 -0
.gitignore
ADDED
@@ -0,0 +1,12 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.pdb
+*.egg-info
+*.egg
+*.whl
+*.zip
+*.tar.gz
+
+weights/
app.py
ADDED
@@ -0,0 +1,134 @@
+# app.py
+import os, base64, json, uuid, torch, gradio as gr
+from pathlib import Path
+
+# === Your vision-LLM stack (imported from src/… as organised earlier) ===
+from src.llm.chat import FunctionCallingChat   # wrapper around Llama-3.2-1B
+chatbot = FunctionCallingChat()                # load once at start-up
+
+# -------- helpers --------------------------------------------------------
+def image_to_base64(image_path: str):
+    with open(image_path, "rb") as f:
+        return base64.b64encode(f.read()).decode("utf-8")
+
+
+def save_uploaded_image(pil_img) -> Path:
+    """Persist uploaded PIL image to ./static/ and return the file path."""
+    Path("static").mkdir(exist_ok=True)
+    filename = f"upload_{uuid.uuid4().hex[:8]}.png"
+    path = Path("static") / filename
+    pil_img.save(path)
+    return path
+
+
+# -------- inference ------------------------------------------------------
+def inference(pil_img, prompt, task):
+    """
+    • pil_img : uploaded PIL image
+    • prompt  : optional free-form request
+    • task    : "Detection" | "Segmentation" | "Auto"
+    Returns plain-text JSON with the LLM tool-call and its results.
+    """
+    if pil_img is None:
+        return "❗ Please upload an image first."
+
+    img_path = save_uploaded_image(pil_img)
+
+    # Build user message for the LLM
+    if task == "Detection":
+        user_msg = f"Please detect objects in the image '{img_path}'."
+    elif task == "Segmentation":
+        user_msg = f"Please segment objects in the image '{img_path}'."
+    else:  # Auto / custom
+        prompt = prompt.strip() or "Analyse this image."
+        user_msg = f"{prompt} (image: '{img_path}')"
+
+    # Run chat → tool calls → tool execution
+    out = chatbot(user_msg)
+    txt = (
+        "### 🔧 Raw tool-call\n"
+        f"{out['raw_tool_call']}\n\n"
+        "### 📦 Tool results\n"
+        f"{json.dumps(out['results'], indent=2)}"
+    )
+    return txt
+
+
+# -------- UI (unchanged shell) ------------------------------------------
+def create_header():
+    with gr.Row():
+        with gr.Column(scale=1):
+            logo_base64 = image_to_base64("static/aivn_logo.png")
+            gr.HTML(
+                f"""<img src="data:image/png;base64,{logo_base64}"
+                     alt="Logo"
+                     style="height:120px;width:auto;margin-right:20px;margin-bottom:20px;">"""
+            )
+        with gr.Column(scale=4):
+            gr.Markdown(
+                """
+                <div style="display:flex;justify-content:space-between;align-items:center;padding:0 15px;">
+                  <div>
+                    <h1 style="margin-bottom:0;">🖼️ Vision Tool-Calling Demo</h1>
+                    <p style="margin-top:0.5em;color:#666;">LLM-driven Detection & Segmentation</p>
+                  </div>
+                  <div style="text-align:right;border-left:2px solid #ddd;padding-left:20px;">
+                    <h3 style="margin:0;color:#2c3e50;">🚀 AIO2024 Module 10 Project 🤗</h3>
+                    <p style="margin:0;color:#7f8c8d;">🔍 Using Llama 3.2-1B + YOLO + SAM</p>
+                  </div>
+                </div>
+                """
+            )
+
+
+def create_footer():
+    footer_html = """
+    <style>
+      .sticky-footer{position:fixed;bottom:0;left:0;width:100%;background:white;
+                     padding:10px;box-shadow:0 -2px 10px rgba(0,0,0,0.1);z-index:1000;}
+      .content-wrap{padding-bottom:60px;}
+    </style>
+    <div class="sticky-footer">
+      <div style="text-align:center;font-size:14px;">
+        Created by <a href="https://vlai.work" target="_blank"
+           style="color:#007BFF;text-decoration:none;">VLAI</a> • AI VIETNAM
+      </div>
+    </div>
+    """
+    return gr.HTML(footer_html)
+
+
+custom_css = """
+.gradio-container {min-height:100vh;}
+.content-wrap {padding-bottom:60px;}
+.full-width-btn {width:100%!important;height:50px!important;font-size:18px!important;
+  margin-top:20px!important;background:linear-gradient(45deg,#FF6B6B,#4ECDC4)!important;
+  color:white!important;border:none!important;}
+.full-width-btn:hover {background:linear-gradient(45deg,#FF5252,#3CB4AC)!important;}
+"""
+
+with gr.Blocks(css=custom_css) as demo:
+    create_header()
+
+    with gr.Row(equal_height=True, variant="panel"):
+        with gr.Column(scale=3):
+            upload_image = gr.Image(label="Upload image", type="pil")
+            prompt_input = gr.Textbox(label="Optional prompt", placeholder="e.g. Detect cats only")
+            task_choice = gr.Radio(
+                ["Auto", "Detection", "Segmentation"], value="Auto", label="Task"
+            )
+            submit_btn = gr.Button("Run 🔧", elem_classes="full-width-btn")
+
+        with gr.Column(scale=4):
+            output_text = gr.Markdown(label="Result")
+
+    submit_btn.click(
+        inference,
+        inputs=[upload_image, prompt_input, task_choice],
+        outputs=output_text,
+    )
+
+    create_footer()
+
+if __name__ == "__main__":
+    demo.launch(allowed_paths=["static/aivn_logo.png", "static"])
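The inference path can be exercised without the Gradio UI. A minimal sketch, assuming the project root is on PYTHONPATH (the test image path is hypothetical, and importing app loads the LLM once as a side effect):

    from PIL import Image
    from app import inference   # module import builds the UI and loads the model

    img = Image.open("static/test_cat.png")   # hypothetical local image
    print(inference(img, prompt="", task="Detection"))
    # Prints a Markdown string: the raw tool-call, then the JSON tool results.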
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+ultralytics==8.3.130
+torch==2.6.0
+transformers==4.51.3
+matplotlib==3.10.3
+opencv-python==4.11.0.86
+gradio==5.29.0
+Pillow==11.2.1
src/chat.py
ADDED
File without changes
src/llm/__init__.py
ADDED
File without changes
src/llm/chat.py
ADDED
@@ -0,0 +1,56 @@
+import ast, torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+from ..tools import run_detection, run_segmentation, FUNCTION_SCHEMA
+
+TOOLS = {"run_detection": run_detection, "run_segmentation": run_segmentation}
+
+SYSTEM_PROMPT = """
+You are an expert in composing functions. You are given a question and a set of possible functions.
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+If none of the functions can be used, point it out. If the given question lacks the parameters required by a function,
+also point that out. You should only return the function call in the tool-call section.
+
+If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n
+You SHOULD NOT include any other text in the response.
+
+Here is a list of functions in JSON format that you can invoke.\n\n{functions}\n""".format(functions=FUNCTION_SCHEMA)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+class FunctionCallingChat:
+    def __init__(self, model_id: str = "meta-llama/Llama-3.2-1B-Instruct"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id, device_map=device, torch_dtype=torch.bfloat16
+        )
+
+    def __call__(self, user_msg: str) -> dict:
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": user_msg},
+        ]
+
+        generation_cfg = GenerationConfig(
+            max_new_tokens=512, temperature=0.5, top_p=0.95, do_sample=True
+        )
+
+        tokenized = self.tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True,
+            return_attention_mask=True, return_tensors="pt"
+        ).to(device)
+
+        output = self.model.generate(tokenized, generation_config=generation_cfg)
+        raw = self.tokenizer.decode(output[0], skip_special_tokens=True)
+        tool_calls_str = raw.split("assistant")[-1].strip()
+
+        try:
+            # The reply is a Python-style call list, e.g. [run_detection(image_path='…')].
+            # ast.literal_eval cannot evaluate Call expressions, so parse into AST nodes instead.
+            calls = ast.parse(tool_calls_str, mode="eval").body.elts
+        except Exception as e:
+            raise RuntimeError(f"Cannot parse tool call: {e}\nRaw: {tool_calls_str}")
+
+        results = []
+        for call in calls:
+            fn_name = call.func.id
+            kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
+            results.append(TOOLS[fn_name](**kwargs))
+        return {"raw_tool_call": tool_calls_str, "results": results}
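Since the model emits tool calls as a Python-style call list rather than JSON, the parsing step above leans on the ast module. A standalone sketch of just that step (the call string is illustrative):

    import ast

    tool_calls_str = "[run_detection(image_path='static/upload_ab12cd34.png', is_visualize=True)]"
    calls = ast.parse(tool_calls_str, mode="eval").body.elts   # one ast.Call per tool call
    for call in calls:
        fn_name = call.func.id                                 # 'run_detection'
        kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
        print(fn_name, kwargs)
    # run_detection {'image_path': 'static/upload_ab12cd34.png', 'is_visualize': True}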
src/tools/__init__.py
ADDED
@@ -0,0 +1,36 @@
+from .detection_model import run_detection
+from .segmentation_model import run_segmentation
+
+__all__ = ["run_detection", "run_segmentation"]
+
+FUNCTION_SCHEMA = [
+    {
+        "type": "function",
+        "function": {
+            "name": "run_detection",
+            "description": "Detect objects in an image and return bounding boxes and labels.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "image_path": {"type": "string", "description": "Local path to the image file."},
+                    "is_visualize": {"type": "boolean", "description": "If true, draw bboxes and save next to the image."}
+                },
+                "required": ["image_path"]
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "run_segmentation",
+            "description": "Segment objects in an image and return binary masks.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "image_path": {"type": "string", "description": "Local path to the image file."}
+                },
+                "required": ["image_path"]
+            },
+        },
+    },
+]
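FUNCTION_SCHEMA is interpolated verbatim into the system prompt in src/llm/chat.py, and the same names key the TOOLS dispatch table there, so each "name" string must match its Python function exactly. A quick consistency check (a sketch, run from the project root):

    from src.tools import FUNCTION_SCHEMA

    registered = {f["function"]["name"] for f in FUNCTION_SCHEMA}
    assert registered == {"run_detection", "run_segmentation"}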
src/tools/detection_model.py
ADDED
@@ -0,0 +1,26 @@
+from ultralytics import YOLO
+
+detection_model_id = "yolo11n.pt"
+detection_model = YOLO(detection_model_id)
+
+def run_detection(image_path: str, is_visualize: bool = False):
+    """YOLO11: return list of {box, label, score} for a single image."""
+    results = detection_model(image_path)
+    r = results[0]
+
+    detections = []
+    for box in r.boxes:
+        # box.xyxy is a 1×4 tensor, box.conf is a 1-element tensor, box.cls likewise
+        coords = box.xyxy.cpu().numpy().flatten().tolist()
+        score = float(box.conf.cpu().numpy().item())
+        cls_id = int(box.cls.cpu().numpy().item())
+        detections.append({
+            "box": coords,
+            "label": r.names[cls_id],
+            "score": score,
+        })
+
+    if is_visualize:
+        r.save()
+        r.show()
+
+    return {"detections": detections}
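A direct-call sketch showing the shape of the return value (the image path is hypothetical; boxes are xyxy pixel coordinates):

    from src.tools.detection_model import run_detection

    result = run_detection("static/upload_ab12cd34.png")
    # e.g. {"detections": [{"box": [34.2, 50.1, 210.8, 300.4],
    #                       "label": "cat", "score": 0.91}, ...]}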
src/tools/segmentation_model.py
ADDED
@@ -0,0 +1,14 @@
+from transformers import SamModel, SamProcessor
+from PIL import Image
+
+segmentation_model_id = "facebook/sam-vit-base"
+sam_processor = SamProcessor.from_pretrained(segmentation_model_id)
+sam_model = SamModel.from_pretrained(segmentation_model_id)
+
+def run_segmentation(image_path: str):
+    """SAM: return binary masks as nested lists"""
+    img = Image.open(image_path).convert("RGB")
+    inputs = sam_processor(images=img, return_tensors="pt")
+    outputs = sam_model(**inputs)
+
+    masks = outputs.pred_masks.squeeze(0).cpu().detach().numpy().tolist()
+    return {"masks": masks}
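Note that SamModel's pred_masks are low-resolution mask logits rather than binary masks. If binarized, full-resolution masks are needed, one option is the processor's documented post-processing helper; a sketch, continuing from the variables inside run_segmentation:

    # Upscale the logits to the original image size and threshold to booleans.
    masks = sam_processor.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu(),
    )
    binary = [m.numpy().tolist() for m in masks]   # nested lists of booleans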
static/aivn_logo.png
ADDED
(binary image: AIVN logo)