# app.py import spaces import ast import torch from PIL import Image, ImageDraw import gradio as gr import base64 from io import BytesIO from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor from qwen_vl_utils import process_vision_info # include this file in your repo if not pip-installable _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained( "ByteDance-Seed/UI-TARS-1.5-7B", device_map="auto", torch_dtype=torch.float16 ) _PROCESSOR = AutoProcessor.from_pretrained( "ByteDance-Seed/UI-TARS-1.5-7B", size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28}, # sane res use_fast=True, ) model = _MODEL processor = _PROCESSOR def draw_point(image: Image.Image, point=None, radius: int = 5): """Overlay a red dot on the screenshot where the model clicked.""" img = image.copy() if point: x, y = point[0] * img.width, point[1] * img.height ImageDraw.Draw(img).ellipse( (x - radius, y - radius, x + radius, y + radius), fill="red" ) return img @spaces.GPU def navigate(screenshot, task: str): """Run one inference step on the GUI‑reasoning model. Args: screenshot (PIL.Image): Latest UI screenshot. task (str): Natural‑language task description history (list | str | None): Previous messages list. Accepts either an actual Python list (via gr.JSON) or a JSON/Python‑literal string. """ # ───────────────────── normalise history input ────────────────────────── messages=[] prompt_header = ( "You are a GUI agent. You are given a task and your action history, with screenshots." "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|<(x1, y1)>|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\nONLY OUTPUT THE CLICKS ACTIONS(CLICK, RIGHT SINGLE, LEFT DOUBLE)\n\n" f"## User Instruction\n{task}" ) current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]} messages.append(current) #New Comment 1 # ─────────────────────────── model forward ───────────────────────────── images, videos = process_vision_info(messages) text = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = processor( text=[text], images=images, videos=videos, padding=True, return_tensors="pt", ).to("cuda") generated = model.generate(**inputs, max_new_tokens=128) trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated) ] raw_out = processor.batch_decode( trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False )[0] # ─────── draw predicted click for quick visual verification (optional) ────── try: actions = ast.literal_eval(raw_out) for act in actions if isinstance(actions, list) else [actions]: pos = act.get("position") if pos and isinstance(pos, list) and len(pos) == 2: screenshot = draw_point(screenshot, pos) except Exception: # decoding failed → just return original screenshot pass return screenshot, raw_out, messages # ────────────────────────── Gradio interface ─────────────────────────────── demo = gr.Interface( fn=navigate, inputs=[ gr.Image(type="pil", label="Screenshot"), gr.Textbox( lines=1, placeholder="e.g. Search the weather for New York", label="Task", ) ], outputs=[ gr.Image(label="With Click Point"), gr.Textbox(label="Raw Action JSON"), gr.JSON(label="Updated Conversation History") ], title="UI-Tars Navigation Demo", ) demo.launch( server_name="0.0.0.0", server_port=7860, share=False, # or True if you need a public link ssr_mode=False, # turn off experimental SSR so the process blocks )