import base64, os, json
from typing import Optional

import torch
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw

# On Hugging Face (ZeroGPU) Spaces, spaces.GPU requests a GPU for the decorated
# function; when the package is unavailable, the decorator becomes a no-op.
try:
    import spaces
    GPU_DECORATOR = spaces.GPU
except Exception:
    def GPU_DECORATOR(fn):
        return fn

from qwen_vl_utils import process_vision_info
from datasets import load_dataset
from transformers import AutoProcessor
from gui_actor.constants import chat_template
from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

MAX_PIXELS = 3200 * 1800


def resize_image(image, resize_to_pixels=MAX_PIXELS):
    """Rescale `image` so its total pixel count matches `resize_to_pixels`, preserving aspect ratio."""
    image_width, image_height = image.size
    if (resize_to_pixels is not None) and ((image_width * image_height) != resize_to_pixels):
        resize_ratio = (resize_to_pixels / (image_width * image_height)) ** 0.5
        image_width_resized, image_height_resized = int(image_width * resize_ratio), int(image_height * resize_ratio)
        image = image.resize((image_width_resized, image_height_resized))
    return image
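
# Worked example: a 3840x2160 screenshot has 8,294,400 pixels, which exceeds
# MAX_PIXELS (3200 * 1800 = 5,760,000). The resize ratio is
# sqrt(5,760,000 / 8,294,400) ~= 0.833, so the image is scaled down to 3200x1800.
# The function rescales whenever the area differs from the target (including
# upscaling), but process() below only calls it for oversized screenshots.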


def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
    """Mark a predicted location by drawing a semi-transparent red circle at `point` (pixel coordinates)."""
    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    x, y = point
    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        outline=color,
        width=5,
    )
    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    combined = combined.convert('RGB')
    return combined
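
# The marker is an outlined circle (width=5) rather than a filled dot; the alpha in
# `color` takes effect because the drawing happens on a transparent overlay that is
# then alpha-composited onto the screenshot.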


def get_attn_map(image, attn_scores, n_width, n_height):
    """Overlay the model's attention scores on `image` as a jet-colormap heatmap."""
    w, h = image.size
    scores = np.array(attn_scores[0]).reshape(n_height, n_width)
    # Min-max normalize the scores, then upsample the patch grid to the full image size.
    scores_norm = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
    score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST)
    colormap = plt.get_cmap('jet')
    colored_score_map = colormap(np.array(score_map) / 255.0)[:, :, :3]
    colored_overlay = Image.fromarray((colored_score_map * 255).astype(np.uint8))
    # Image.blend requires matching modes, so make sure the base image is RGB.
    blended = Image.blend(image.convert('RGB'), colored_overlay, alpha=0.3)
    return blended
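
# The attention scores are assumed to come back as one value per visual token,
# laid out on the (n_height x n_width) grid the vision backbone produced for this
# screenshot; the exact pixel area covered by each token (e.g. 28x28 for
# Qwen2.5-VL after patch merging) is a property of the backbone and is not
# verified here.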


def _pick_gpu_dtype() -> torch.dtype:
    if not torch.cuda.is_available():
        return torch.float32
    major, minor = torch.cuda.get_device_capability()
    return torch.bfloat16 if major >= 8 else torch.float16
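
# bfloat16 has native support on compute capability >= 8.0 (Ampere and newer);
# older GPUs such as the T4 (7.5) fall back to float16, and CPU-only runs use float32.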


model = None
tokenizer = None
data_processor = None


@GPU_DECORATOR
def load_model():
    """
    Allocates the GPU on Spaces and loads the model on the right device/dtype.
    Runs once at startup.
    """
    global model, tokenizer, data_processor

    model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dtype = _pick_gpu_dtype()

    if device.startswith("cuda"):
        torch.backends.cuda.matmul.allow_tf32 = True
    # The demo only runs inference, so gradients are disabled globally.
    torch.set_grad_enabled(False)

    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
    tokenizer = data_processor.tokenizer

    # "sdpa" is built into recent PyTorch and avoids a hard dependency on flash-attn.
    attn_impl = "sdpa"

    model_local = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
        model_name_or_path,
        torch_dtype=dtype,
        attn_implementation=attn_impl,
    ).eval()

    model_local.to(device)

    model = model_local
    return f"Loaded {model_name_or_path} on {device} with dtype={dtype} (attn={attn_impl})"


# Warm up at import time so the first request does not pay the loading cost.
_ = load_model()


@GPU_DECORATOR
@torch.inference_mode()
def process(image, instruction):
    """Ground `instruction` on `image`; return (annotated image, predicted coordinates, attention map)."""
    if image is None:
        # Nothing uploaded yet; return a readable message instead of crashing on image.size.
        return None, "Please upload a screenshot first.", None

    if model is None:
        _ = load_model()

    # Downscale very large screenshots to the MAX_PIXELS budget before building the prompt.
    w, h = image.size
    if w * h > MAX_PIXELS:
        image = resize_image(image)
        w, h = image.size

    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
                        "your task is to locate the screen element that corresponds to the instruction. "
                        "Output a PyAutoGUI action with a special token that points to the correct location."
                    ),
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": instruction},
            ],
        },
    ]

    try:
        pred = inference(
            conversation,
            model,
            tokenizer,
            data_processor,
            use_placeholder=True,
            topk=3,
        )
    except Exception as e:
        print("inference error:", e)
        return image, f"Error: {e}", None

    # topk_points are normalized (x, y) in [0, 1]; convert to pixels for drawing.
    px, py = pred["topk_points"][0]
    output_coord = f"({px:.4f}, {py:.4f})"
    img_with_point = draw_point(image, (px * w, py * h))

    n_width, n_height = pred["n_width"], pred["n_height"]
    attn_scores = pred["attn_scores"]
    att_map = get_attn_map(image, attn_scores, n_width, n_height)

    return img_with_point, output_coord, att_map
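
# The dict returned by gui_actor.inference.inference is assumed to carry
# "topk_points" (normalized (x, y) candidates), "n_width"/"n_height" (the visual-token
# grid size), and "attn_scores" (per-token pointer scores), which is exactly how
# process() consumes it above. A quick local smoke test (hypothetical path, not run
# on import) would be:
#   marked, coord, heatmap = process(Image.open("screenshot.png"), "Click the search box")
#   print(coord)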


title = "GUI-Actor"
header = """
<div align="center">
<h1 style="padding-bottom: 10px; padding-top: 10px;"><strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
<div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
<a href="https://microsoft.github.io/GUI-Actor/">Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">Github Repo</a><br/>
</div>
</div>
"""
theme = "soft"
css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
#anno-img .mask.active {opacity: 0.7}"""

with gr.Blocks(title=title, css=css, theme=theme) as demo:
    gr.Markdown(header)
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type='pil', label='Upload image')
            input_instruction = gr.Textbox(label='Instruction', placeholder='Type your (low-level) instruction here')
            submit_button = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_with_point = gr.Image(type='pil', label='Image with Point (red circle)')
            with gr.Accordion('Detailed prediction'):
                pred_xy = gr.Textbox(label='Predicted Coordinates', placeholder='(x, y)')
                att_map = gr.Image(type='pil', label='Attention Map')

    submit_button.click(
        fn=process,
        inputs=[input_image, input_instruction],
        outputs=[image_with_point, pred_xy, att_map],
        queue=True,
        api_name="predict",
    )
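
# Because the click handler is exposed with api_name="predict", the endpoint can also be
# reached programmatically. A rough sketch with gradio_client (the exact signature depends
# on the installed Gradio version, and "user/space-name" is a placeholder):
#   from gradio_client import Client, handle_file
#   client = Client("user/space-name")
#   result = client.predict(handle_file("screenshot.png"), "Click the search box", api_name="/predict")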


# Gradio's queue() and launch() signatures have changed across versions, so fall back
# to progressively simpler calls whenever a keyword argument is not recognized.
try:
    demo.queue(concurrency_count=1, max_size=4)
except TypeError:
    try:
        demo.queue(max_size=4)
    except TypeError:
        demo.queue()

try:
    demo.launch(share=False, max_threads=1, max_queue_size=4)
except TypeError:
    try:
        demo.launch(share=False, max_queue_size=4)
    except TypeError:
        demo.launch(share=False)