Commit d32faf0 · Parent(s): 3fe2480
[email protected] committed: ADD new app

Files changed:
- README.md +4 -4
- app.py +280 -0
- assets/google.png +0 -0
- prompt.py +143 -0
- requirements.txt +8 -0
- smolvlm_inference.py +23 -0
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
 title: Smol2Operator Demo
-emoji:
-colorFrom:
-colorTo:
+emoji: 🐢
+colorFrom: purple
+colorTo: green
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.44.1
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py ADDED
@@ -0,0 +1,280 @@
import re
from typing import Tuple, Optional

import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from smolvlm_inference import TransformersModel

from prompt import OS_SYSTEM_PROMPT

# --- Configuration ---
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

# --- Model and Processor Loading (Load once) ---
print(f"Loading model and processor for {MODEL_ID}...")
model = None
processor = None
model_loaded = False
load_error_message = ""


model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)


title = "Smol2Operator Demo"

description = """
This is a demo of the Smol2Operator model, designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""


SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT


def get_navigation_prompt(task, image, step=1):
    """
    Build the chat-format prompt for the navigation task.
    - task: The task to complete
    - image: The current screenshot of the web page
    - step: The current step of the task
    """
    system_prompt = SYSTEM_PROMPT
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_prompt},
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"},
            ],
        },
    ]


def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    return img


def parse_actions_from_response(response: str) -> list[str]:
    """Parse actions from the model response using a regex pattern."""
    pattern = r"<code>\n(.*?)\n</code>"
    matches = re.findall(pattern, response, re.DOTALL)
    return matches


def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract coordinates from action code for localization actions."""
    localization_actions = []

    # Patterns for different action types
    patterns = {
        'click': r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }

    for action_type, pattern in patterns.items():
        matches = re.finditer(pattern, action_code)
        for match in matches:
            if action_type == 'drag':
                # Drag has from and to coordinates
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type
                })
            else:
                # Single coordinate actions
                x_val = match.group(1)
                y_val = match.group(2) if match.group(2) else x_val  # Handle single coordinate case
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type
                    })

    return localization_actions


def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create an image with localization markers drawn on it."""
    if not coordinates:
        return None

    # Create a copy of the original image
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)

    # Get image dimensions
    width, height = img_copy.size

    # Try to load a font, fall back to default if not available
    font = ImageFont.load_default()

    # Color scheme for different actions
    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple'
    }

    for i, coord in enumerate(coordinates):
        # Convert normalized coordinates to pixel coordinates
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)

        # Get color for this action type
        color = colors.get(coord['type'], 'red')

        # Draw a circle at the coordinate
        circle_radius = 8
        draw.ellipse([
            pixel_x - circle_radius, pixel_y - circle_radius,
            pixel_x + circle_radius, pixel_y + circle_radius
        ], fill=color, outline='white', width=2)

        # Add text label
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        if font:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
        else:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color)

        # For drag actions, draw an arrow
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)

            # Draw arrow line
            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

            # Draw arrowhead
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2)**0.5
            if length > 0:
                dx_norm = dx / length
                dy_norm = dy / length

                # Arrowhead points
                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5

                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')

    return img_copy


# --- Gradio processing function ---
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    input_pil_image = array_to_image(input_numpy_image)
    assert isinstance(input_pil_image, Image.Image)

    prompt = get_navigation_prompt(task, input_pil_image)

    print("Prompt:")
    print(prompt)

    if model is None:
        raise ValueError("Model not loaded")

    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Parse actions from the response
    actions = parse_actions_from_response(navigation_str)

    # Extract coordinates from all actions
    all_coordinates = []
    for action_code in actions:
        coordinates = extract_coordinates_from_action(action_code)
        all_coordinates.extend(coordinates)

    # Create localized image if there are coordinates
    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(input_pil_image, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")

    return navigation_str, localized_image


# --- Load Example Data ---
example_1_image_path: str = "./assets/google.png"
example_1_image = Image.open(example_1_image_path)
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image_path: str = "./assets/huggingface.png"
example_2_image = Image.open(example_2_image_path)
example_2_task = "Find the most trending model."


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    # gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            input_image_component = gr.Image(label="Input UI Image", height=400)
            task_component = gr.Textbox(
                label="task",
                placeholder="e.g., Find the latest model by H Company",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Navigate", variant="primary")

        with gr.Column():
            localization_image_component = gr.Image(label="Action Localization", height=400)
            output_coords_component = gr.Textbox(label="Agent Output", lines=20)

    submit_button.click(navigate, [input_image_component, task_component], [output_coords_component, localization_image_component])

    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, localization_image_component],
        fn=navigate,
        cache_examples=True,
    )

demo.queue(api_open=False)
demo.launch(debug=True)
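To make the parsing pipeline above concrete, here is a minimal standalone sketch using the same regexes as parse_actions_from_response and extract_coordinates_from_action, applied to a made-up model response:

import re

# Hypothetical model output in the <think>/<code> format the system prompt requests
sample_response = (
    "<think>\nThe search field is near the top center, so I click it.\n</think>\n"
    "<code>\nclick(x=0.47, y=0.38)\n</code>"
)

# Same extraction pattern as parse_actions_from_response
actions = re.findall(r"<code>\n(.*?)\n</code>", sample_response, re.DOTALL)
print(actions)  # ['click(x=0.47, y=0.38)']

# Same click pattern as extract_coordinates_from_action
m = re.search(r"click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)", actions[0])
x, y = float(m.group(1)), float(m.group(2))
print(x, y)  # 0.47 0.38 -- normalized, so multiply by image width/height before drawing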
assets/google.png ADDED
prompt.py ADDED
@@ -0,0 +1,143 @@
OS_ACTIONS = """
def final_answer(answer: any) -> any:
    \"\"\"
    Provides a final answer to the given problem.
    Args:
        answer: The final answer to the problem
    \"\"\"

def move_mouse(self, x: float, y: float) -> str:
    \"\"\"
    Moves the mouse cursor to the specified coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a left-click at the specified normalized coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def double_click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a double-click at the specified normalized coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def type(text: str) -> str:
    \"\"\"
    Types the specified text at the current cursor position.
    Args:
        text: The text to type
    \"\"\"

def press(keys: str | list[str]) -> str:
    \"\"\"
    Presses a keyboard key or combination of keys
    Args:
        keys: The key or list of keys to press (e.g. "enter", "space", "backspace", "ctrl", etc.).
    \"\"\"

def navigate_back() -> str:
    \"\"\"
    Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
    \"\"\"

def drag(from_coord: list[float], to_coord: list[float]) -> str:
    \"\"\"
    Clicks at from_coord, drags the mouse to to_coord, then releases the click.
    Args:
        from_coord: [x, y] origin coordinates
        to_coord: [x, y] end coordinates
    \"\"\"

def scroll(direction: Literal["up", "down"] = "down", amount: int = 1) -> str:
    \"\"\"
    Uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
    Args:
        direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
        amount: The amount to scroll. A good amount is 1 or 2.
    \"\"\"

def wait(seconds: float) -> str:
    \"\"\"
    Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
    Args:
        seconds: Number of seconds to wait, generally 2 is enough.
    \"\"\"
"""

MOBILE_ACTIONS = """
def navigate_back() -> str:
    \"\"\"
    Returns to the home page
    \"\"\"

def open_app(app_name: str) -> str:
    \"\"\"
    Launches the specified application.
    Args:
        app_name: the name of the application to launch
    \"\"\"

def swipe(from_coord: list[float], to_coord: list[float]) -> str:
    \"\"\"
    Swipes from 'from_coord' to 'to_coord'
    Args:
        from_coord: origin coordinates
        to_coord: end coordinates
    \"\"\"

def long_press(x: int, y: int) -> str:
    \"\"\"
    Performs a long-press at the specified coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"
"""

OS_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and a screenshot of the screen. Complete the task using Python function calls.

For each step:
• First, use <think></think> to express the thought process guiding your next action and the reasoning behind it.
• Then, use <code></code> to perform the action. It will be executed in a stateful environment.

The following functions are exposed to the Python interpreter:
<code>
{OS_ACTIONS}
</code>

The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
"""

MOBILE_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and a screenshot of the screen. Complete the task using Python function calls.

For each step:
• First, use <think></think> to express the thought process guiding your next action and the reasoning behind it.
• Then, use <code></code> to perform the action. It will be executed in a stateful environment.

The following functions are exposed to the Python interpreter:
<code>

# OS ACTIONS

{OS_ACTIONS}

# MOBILE ACTIONS

{MOBILE_ACTIONS}
</code>

The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
"""
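For reference, a single agent step produced under these prompts is expected to take the following shape (illustrative content, not output from a real run; the function calls come from OS_ACTIONS above):

# Hypothetical <think>/<code> turn that app.py's parser would accept
example_turn = """<think>
The task asks for a search, so I click the search box, type the query, and submit.
</think>
<code>
click(x=0.47, y=0.38)
type("current UK Prime Minister")
press("enter")
</code>"""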
requirements.txt ADDED
@@ -0,0 +1,8 @@
numpy==2.3.3
Pillow==11.3.0
torch==2.8.0
torchvision==0.23.0
gradio==5.46.0
num2words==0.5.14
transformers==4.56.1
spaces==0.41.0
smolvlm_inference.py ADDED
@@ -0,0 +1,23 @@
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor


class TransformersModel:
    def __init__(self, model_id: str, to_device: str = "cuda"):
        self.model_id = model_id
        self.processor = AutoProcessor.from_pretrained(model_id)
        # Cap the longest image edge (3 * 384 px) to bound the number of vision tokens
        self.processor.image_processor.size = {"longest_edge": 3 * 384}
        self.model = AutoModelForImageTextToText.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(to_device)

    def generate(self, messages: list[dict], **kwargs):
        # Render and tokenize the chat-format messages (text + image) into model inputs
        inputs = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(self.model.device, dtype=torch.bfloat16)
        generated_ids = self.model.generate(**inputs, **kwargs)
        # Decode only the newly generated tokens, slicing off the prompt
        return self.processor.batch_decode(
            generated_ids[:, len(inputs["input_ids"][0]) :], skip_special_tokens=True
        )[0]
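A minimal usage sketch for this wrapper, mirroring how app.py drives it (assumes a CUDA device and a hypothetical local screenshot.png):

from PIL import Image

from smolvlm_inference import TransformersModel

model = TransformersModel(
    model_id="smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI",
    to_device="cuda:0",
)

# Chat-format messages matching what get_navigation_prompt in app.py builds
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": Image.open("screenshot.png")},  # hypothetical input file
            {"type": "text", "text": "Click the search bar."},
        ],
    },
]
print(model.generate(messages, max_new_tokens=500))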