|
import base64, os |
|
|
|
import json |
|
import torch |
|
import gradio as gr |
|
from typing import Optional |
|
from PIL import Image, ImageDraw |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
from qwen_vl_utils import process_vision_info |
|
from datasets import load_dataset |
|
from transformers import AutoProcessor |
|
from gui_actor.constants import chat_template |
|
from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer |
|
from gui_actor.inference import inference |
|
|
|
MAX_PIXELS = 3200 * 1800 |
|
|
|
def resize_image(image, resize_to_pixels=MAX_PIXELS): |
|
image_width, image_height = image.size |
|
if (resize_to_pixels is not None) and ((image_width * image_height) != resize_to_pixels): |
|
resize_ratio = (resize_to_pixels / (image_width * image_height)) ** 0.5 |
|
image_width_resized, image_height_resized = int(image_width * resize_ratio), int(image_height * resize_ratio) |
|
image = image.resize((image_width_resized, image_height_resized)) |
|
return image |
|
|
|
|
|
@torch.inference_mode() |
|
def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)): |
|
overlay = Image.new('RGBA', image.size, (255, 255, 255, 0)) |
|
overlay_draw = ImageDraw.Draw(overlay) |
|
x, y = point |
|
overlay_draw.ellipse( |
|
[(x - radius, y - radius), (x + radius, y + radius)], |
|
outline=color, |
|
width=5 |
|
) |
|
image = image.convert('RGBA') |
|
combined = Image.alpha_composite(image, overlay) |
|
combined = combined.convert('RGB') |
|
return combined |
|
|
|
|
|
@torch.inference_mode() |
|
def get_attn_map(image, attn_scores, n_width, n_height): |
|
w, h = image.size |
|
scores = np.array(attn_scores[0]).reshape(n_height, n_width) |
|
|
|
scores_norm = (scores - scores.min()) / (scores.max() - scores.min()) |
|
|
|
score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST) |
|
|
|
colormap = plt.get_cmap('jet') |
|
colored_score_map = colormap(np.array(score_map) / 255.0) |
|
colored_score_map = (colored_score_map[:, :, :3] * 255).astype(np.uint8) |
|
colored_overlay = Image.fromarray(colored_score_map) |
|
|
|
|
|
blended = Image.blend(image, colored_overlay, alpha=0.3) |
|
return blended |
|
|
|
|
|
if torch.cuda.is_available(): |
|
|
|
model_name_or_path = "microsoft/GUI-Actor-7B-Qwen2.5-VL" |
|
data_processor = AutoProcessor.from_pretrained(model_name_or_path) |
|
tokenizer = data_processor.tokenizer |
|
model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained( |
|
model_name_or_path, |
|
torch_dtype=torch.bfloat16, |
|
device_map="cuda:0", |
|
attn_implementation="flash_attention_2" |
|
).eval() |
|
else: |
|
model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL" |
|
data_processor = AutoProcessor.from_pretrained(model_name_or_path) |
|
tokenizer = data_processor.tokenizer |
|
model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained( |
|
model_name_or_path, |
|
torch_dtype=torch.bfloat16, |
|
device_map="cpu" |
|
).eval() |
|
|
|
title = "GUI-Actor" |
|
header = """ |
|
<div align="center"> |
|
<h1 style="padding-bottom: 10px; padding-top: 10px;">π― <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1> |
|
<div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;"> |
|
Qianhui Wu*, Kanzhi Cheng*, Rui Yang*, Chaoyun Zhang, Jianwei Yang, Huiqiang Jiang, Jian Mu, Baolin Peng, Bo Qiao, Reuben Tan, Si Qin, Lars Liden<br> |
|
Qingwei Lin, Huan Zhang, Tong Zhang, Jianbing Zhang, Dongmei Zhang, Jianfeng Gao<br/> |
|
</div> |
|
<div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;"> |
|
<a href="https://microsoft.github.io/GUI-Actor/">π Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">π arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">π» Github Repo</a><br/> |
|
</div> |
|
</div> |
|
""" |
|
|
|
theme = "soft" |
|
css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;} |
|
#anno-img .mask.active {opacity: 0.7}""" |
|
|
|
|
|
@torch.inference_mode() |
|
def process(image, instruction): |
|
|
|
w, h = image.size |
|
if w * h > MAX_PIXELS: |
|
image = resize_image(image) |
|
|
|
conversation = [ |
|
{ |
|
"role": "system", |
|
"content": [ |
|
{ |
|
"type": "text", |
|
"text": "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>).", |
|
} |
|
] |
|
}, |
|
{ |
|
"role": "user", |
|
"content": [ |
|
{ |
|
"type": "image", |
|
"image": image, |
|
|
|
}, |
|
{ |
|
"type": "text", |
|
"text": instruction, |
|
}, |
|
], |
|
}, |
|
] |
|
|
|
try: |
|
pred = inference(conversation, model, tokenizer, data_processor, use_placeholder=True, topk=3) |
|
except Exception as e: |
|
print(e) |
|
return image, f"Error: {e}", None |
|
|
|
px, py = pred["topk_points"][0] |
|
output_coord = f"({px:.4f}, {py:.4f})" |
|
img_with_point = draw_point(image, (px * w, py * h)) |
|
|
|
n_width, n_height = pred["n_width"], pred["n_height"] |
|
attn_scores = pred["attn_scores"] |
|
att_map = get_attn_map(image, attn_scores, n_width, n_height) |
|
|
|
return img_with_point, output_coord, att_map |
|
|
|
|
|
with gr.Blocks(title=title, css=css) as demo: |
|
gr.Markdown(header) |
|
with gr.Row(): |
|
with gr.Column(): |
|
input_image = gr.Image( |
|
type='pil', label='Upload image') |
|
|
|
input_instruction = gr.Textbox(label='Instruction', placeholder='Text your (low-level) instruction here') |
|
submit_button = gr.Button( |
|
value='Submit', variant='primary') |
|
with gr.Column(): |
|
image_with_point = gr.Image(type='pil', label='Image with Point (red circle)') |
|
with gr.Accordion('Detailed prediction'): |
|
pred_xy = gr.Textbox(label='Predicted Coordinates', placeholder='(x, y)') |
|
att_map = gr.Image(type='pil', label='Attention Map') |
|
|
|
submit_button.click( |
|
fn=process, |
|
inputs=[ |
|
input_image, |
|
input_instruction |
|
], |
|
outputs=[image_with_point, pred_xy, att_map] |
|
) |
|
|
|
|
|
|
|
demo.queue().launch(share=False) |