import base64, os, json from typing import Optional import torch import gradio as gr import numpy as np import matplotlib.pyplot as plt from PIL import Image, ImageDraw # ---- Hugging Face Spaces GPU decorator (safe fallback when not on Spaces) ---- try: import spaces GPU_DECORATOR = spaces.GPU except Exception: def GPU_DECORATOR(fn): # no-op locally return fn from qwen_vl_utils import process_vision_info # noqa: F401 (kept for parity if used elsewhere) from datasets import load_dataset # noqa: F401 from transformers import AutoProcessor from gui_actor.constants import chat_template # noqa: F401 from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer from gui_actor.inference import inference MAX_PIXELS = 3200 * 1800 def resize_image(image, resize_to_pixels=MAX_PIXELS): image_width, image_height = image.size if (resize_to_pixels is not None) and ((image_width * image_height) != resize_to_pixels): resize_ratio = (resize_to_pixels / (image_width * image_height)) ** 0.5 image_width_resized, image_height_resized = int(image_width * resize_ratio), int(image_height * resize_ratio) image = image.resize((image_width_resized, image_height_resized)) return image @torch.inference_mode() def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)): overlay = Image.new('RGBA', image.size, (255, 255, 255, 0)) overlay_draw = ImageDraw.Draw(overlay) x, y = point overlay_draw.ellipse( [(x - radius, y - radius), (x + radius, y + radius)], outline=color, width=5 ) image = image.convert('RGBA') combined = Image.alpha_composite(image, overlay) combined = combined.convert('RGB') return combined @torch.inference_mode() def get_attn_map(image, attn_scores, n_width, n_height): w, h = image.size scores = np.array(attn_scores[0]).reshape(n_height, n_width) scores_norm = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8) score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST) colormap = plt.get_cmap('jet') colored_score_map = colormap(np.array(score_map) / 255.0)[:, :, :3] colored_overlay = Image.fromarray((colored_score_map * 255).astype(np.uint8)) blended = Image.blend(image, colored_overlay, alpha=0.3) return blended # ---------------------------- # Model/device init for Spaces # ---------------------------- def _pick_gpu_dtype() -> torch.dtype: if not torch.cuda.is_available(): return torch.float32 major, minor = torch.cuda.get_device_capability() # Ampere (8.x) / Hopper (9.x) support bf16 well return torch.bfloat16 if major >= 8 else torch.float16 # Global holders initialized in load_model() model = None tokenizer = None data_processor = None @GPU_DECORATOR # <-- This is what Spaces looks for at startup def load_model(): """ Allocates the GPU on Spaces and loads the model on the right device/dtype. Runs once at startup. """ global model, tokenizer, data_processor model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL" device = "cuda:0" if torch.cuda.is_available() else "cpu" dtype = _pick_gpu_dtype() # Enable some healthy defaults on GPU if device.startswith("cuda"): torch.backends.cuda.matmul.allow_tf32 = True torch.set_grad_enabled(False) data_processor = AutoProcessor.from_pretrained(model_name_or_path) tokenizer = data_processor.tokenizer # Use SDPA attention to avoid flash-attn dependency attn_impl = "sdpa" model_local = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained( model_name_or_path, torch_dtype=dtype, attn_implementation=attn_impl, ).eval() # Move to device explicitly (avoid accelerate unless you need sharding) model_local.to(device) model = model_local return f"Loaded {model_name_or_path} on {device} with dtype={dtype} (attn={attn_impl})" # Trigger model loading on import so Spaces allocates GPU immediately _ = load_model() @GPU_DECORATOR @torch.inference_mode() def process(image, instruction): # Safety: ensure model is loaded if model is None: _ = load_model() # Resize if needed w, h = image.size if w * h > MAX_PIXELS: image = resize_image(image) w, h = image.size conversation = [ { "role": "system", "content": [ { "type": "text", "text": ( "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, " "your task is to locate the screen element that corresponds to the instruction. " "Output a PyAutoGUI action with a special token that points to the correct location." ), } ], }, { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": instruction}, ], }, ] device = next(model.parameters()).device try: pred = inference( conversation, model, tokenizer, data_processor, use_placeholder=True, topk=3 ) except Exception as e: print("inference error:", e) return image, f"Error: {e}", None px, py = pred["topk_points"][0] output_coord = f"({px:.4f}, {py:.4f})" img_with_point = draw_point(image, (px * w, py * h)) n_width, n_height = pred["n_width"], pred["n_height"] attn_scores = pred["attn_scores"] att_map = get_attn_map(image, attn_scores, n_width, n_height) return img_with_point, output_coord, att_map # ---------------------------- # Gradio UI # ---------------------------- title = "GUI-Actor" header = """