# app.py
import spaces
import ast
import torch
from PIL import Image, ImageDraw
import gradio as gr
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info # include this file in your repo if not pip-installable
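# Assumed runtime dependencies (a guess at requirements.txt): gradio, spaces,
# torch, transformers, accelerate, qwen-vl-utils, pillow.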
# ---- model & processor loaded once at startup ----
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "ByteDance-Seed/UI-TARS-1.5-7B",
    device_map="auto",
    torch_dtype=torch.float16,  # half precision keeps the 7B weights manageable
)
processor = AutoProcessor.from_pretrained(
    "ByteDance-Seed/UI-TARS-1.5-7B",
    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
    use_fast=True,
)
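# Note: the shortest/longest edge bounds above are pixel budgets expressed as
# multiples of the 28x28 vision patch, which roughly caps how many visual
# tokens a single screenshot can consume.
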
def draw_point(image: Image.Image, point=None, radius: int = 5):
"""Overlay a red dot on the screenshot where the model clicked."""
img = image.copy()
if point:
x, y = point[0] * img.width, point[1] * img.height
ImageDraw.Draw(img).ellipse(
(x - radius, y - radius, x + radius, y + radius), fill="red"
)
return img
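# Example: draw_point(img, [0.5, 0.25]) marks a spot halfway across and a quarter
# of the way down the image, since the coordinates are treated as normalized [0, 1].
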
@spaces.GPU
def navigate(screenshot, task: str, platform: str, history):
"""Run one inference step on the GUIβreasoning model.
Args:
screenshot (PIL.Image): Latest UI screenshot.
task (str): Naturalβlanguage task description.
platform (str): Either "web" or "phone" for prompt conditioning.
history (list | str | None): Previous messages list. Accepts either an
actual Python list (via gr.JSON) or a JSON/Pythonβliteral string.
"""
    # ---------------------- normalise history input ------------------------
    messages = []
    if isinstance(history, str):
        try:
            messages = ast.literal_eval(history)
        except Exception as exc:
            raise ValueError("`history` must be a JSON/Python list: " + str(exc))
    else:
        messages = history or []  # gr.JSON may deliver None if the field is cleared
    prompt_header = (
        "You are a GUI agent. You are given a task and your action history, with screenshots. "
        "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
        f"## User Instruction\n{task}"
    )
current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
messages.append(current)
    # ---------------------------- model forward ----------------------------
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    images, videos = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to("cuda")
    generated = model.generate(**inputs, max_new_tokens=128)
    trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
    ]
    raw_out = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
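    # The model is expected to reply in the prompt's "Thought: ... / Action: ..."
    # format, e.g. (illustrative values only):
    #   Thought: I should click the search field before typing the query.
    #   Action: click(start_box='<|box_start|>(412, 87)<|box_end|>')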
    # ------- draw predicted click for quick visual verification (optional) -------
    # Best-effort: raw_out is normally free-form "Thought/Action" text, so a point
    # is only drawn when the output happens to parse as a Python literal.
    try:
        actions = ast.literal_eval(raw_out)
        for act in (actions if isinstance(actions, list) else [actions]):
            pos = act.get("position")
            if pos and isinstance(pos, list) and len(pos) == 2:
                screenshot = draw_point(screenshot, pos)
    except Exception:
        # parsing failed: just return the original screenshot
        pass
    return screenshot, raw_out
# --------------------------- Gradio interface ------------------------------
demo = gr.Interface(
    fn=navigate,
    inputs=[
        gr.Image(type="pil", label="Screenshot"),
        gr.Textbox(
            lines=1,
            placeholder="e.g. Search the weather for New York",
            label="Task",
        ),
        gr.Dropdown(choices=["web", "phone"], value="web", label="Platform"),
        gr.JSON(label="Conversation History (list)", value=[]),
    ],
    outputs=[
        gr.Image(label="With Click Point"),
        gr.Textbox(label="Raw Action JSON"),
    ],
    title="UI-TARS Navigation Demo",
)
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,     # or True if you need a public link
    ssr_mode=False,  # turn off experimental SSR so the process blocks
)
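# Minimal local smoke test (hypothetical file name; assumes a CUDA device, since
# inputs are moved to "cuda" above), bypassing the Gradio UI:
#   from PIL import Image
#   shot = Image.open("screenshot.png")
#   annotated, raw = navigate(shot, "Search the weather for New York", "web", [])
#   print(raw)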