# app.py
import spaces
import ast
import torch
from PIL import Image, ImageDraw
import gradio as gr
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info # include this file in your repo if not pip-installable
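# Load the 7B UI-TARS checkpoint once at import time: float16 halves the memory
# footprint and device_map="auto" lets accelerate place the weights on the
# available GPU(s).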
_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"ByteDance-Seed/UI-TARS-1.5-7B",
device_map="auto",
torch_dtype=torch.float16
)
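# The size bounds passed to the processor are total-pixel budgets (expressed in
# multiples of Qwen2.5-VL's 28x28 patch area): screenshots are resized so their
# pixel count stays within these limits, which caps the number of vision tokens.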
_PROCESSOR = AutoProcessor.from_pretrained(
"ByteDance-Seed/UI-TARS-1.5-7B",
    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},  # lower / upper pixel budget
use_fast=True,
)
model = _MODEL
processor = _PROCESSOR
def draw_point(image: Image.Image, point=None, radius: int = 5):
    """Overlay a red dot where the model clicked; `point` is a normalised (x, y) pair in [0, 1]."""
img = image.copy()
if point:
x, y = point[0] * img.width, point[1] * img.height
ImageDraw.Draw(img).ellipse(
(x - radius, y - radius, x + radius, y + radius), fill="red"
)
return img
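
# Hedged sketch: the prompt below asks the model for actions such as
# click(start_box='<|box_start|>(x1, y1)<|box_end|>'), so the raw output is
# usually Thought/Action text rather than a Python literal. This helper pulls
# the first "(x, y)" pair out of that text. The coordinate scale is an
# assumption: depending on the checkpoint they may be pixels of the (resized)
# model input, so rescale them to [0, 1] fractions before calling draw_point().
import re

def extract_click(action_text: str):
    """Return the first (x, y) pair found in an Action string, or None."""
    match = re.search(r"\((\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\)", action_text)
    if match:
        return float(match.group(1)), float(match.group(2))
    return None
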
@spaces.GPU
def navigate(screenshot, task: str):
    """Run one inference step on the GUI-reasoning model.

    Args:
        screenshot (PIL.Image): Latest UI screenshot.
        task (str): Natural-language task description.

    Returns:
        tuple: (screenshot with the predicted click drawn, raw model output,
        the message list that was sent to the model).
    """
    # ───────────────────── build the single-turn message list ───────────────
    messages = []
    prompt_header = (
        "You are a GUI agent. You are given a task and your action history, with screenshots. "
        "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\n\n\n## Note\n- Use English in the `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in the `Thought` part.\n- ONLY OUTPUT CLICK ACTIONS (click, right_single, left_double).\n\n"
f"## User Instruction\n{task}"
)
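    # Single-turn conversation: the instruction prompt plus the current screenshot.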
    current = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt_header},
            {"type": "image_url", "image_url": screenshot},
        ],
    }
    messages.append(current)
# ─────────────────────────── model forward ─────────────────────────────
images, videos = process_vision_info(messages)
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(
text=[text],
images=images,
videos=videos,
padding=True,
return_tensors="pt",
).to("cuda")
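    # On ZeroGPU Spaces, @spaces.GPU attaches a CUDA device for the duration of
    # this call; 128 new tokens is plenty for a single Thought/Action step.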
generated = model.generate(**inputs, max_new_tokens=128)
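    # Slice the prompt tokens off each sequence so only the newly generated
    # action text is decoded.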
trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
]
raw_out = processor.batch_decode(
trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
# ─────── draw predicted click for quick visual verification (optional) ──────
try:
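        # ast.literal_eval only succeeds if the model returns a Python literal
        # (e.g. a list of {"position": [x, y]} dicts); the usual Thought/Action
        # text falls through to the except branch (see extract_click above for
        # a regex-based sketch).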
actions = ast.literal_eval(raw_out)
for act in actions if isinstance(actions, list) else [actions]:
pos = act.get("position")
if pos and isinstance(pos, list) and len(pos) == 2:
screenshot = draw_point(screenshot, pos)
except Exception:
        # decoding failed → just return original screenshot
pass
return screenshot, raw_out, messages
# ────────────────────────── Gradio interface ───────────────────────────────
demo = gr.Interface(
fn=navigate,
inputs=[
gr.Image(type="pil", label="Screenshot"),
gr.Textbox(
lines=1,
placeholder="e.g. Search the weather for New York",
label="Task",
)
],
outputs=[
gr.Image(label="With Click Point"),
gr.Textbox(label="Raw Action JSON"),
gr.JSON(label="Updated Conversation History")
],
    title="UI-TARS Navigation Demo",
)
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False, # or True if you need a public link
ssr_mode=False, # turn off experimental SSR so the process blocks
)