Spaces:

Stremly
/

uitars

Running

App Files Files Community

uitars / app1.py

Harshit0414

updates

995b558 about 2 months ago

raw

history blame contribute delete

4.35 kB

	"""
	Gradio demo for UI‑TARS 1.5‑7B (image‑text‑to‑text) on Hugging Face Spaces.
	Save this file as app.py and add a requirements.txt with the packages
	listed below. Then create a new Python Space, upload both files and
	commit — the Space will build and serve the app automatically.

	requirements.txt (suggested versions)
	-------------------------------------
	transformers>=4.49.0 # needs the "image-text-to-text" pipeline task and Qwen2.5-VL support
	accelerate>=0.29.0
	torch>=2.2
	sentencepiece # needed for many multilingual models
	bitsandbytes # optional: enables 4‑bit quantization if Space has GPU
	pillow
	gradio>=4.33
	"""

	from __future__ import annotations

	from typing import List, Dict, Any

	import gradio as gr
	from PIL import Image
	from transformers import pipeline
	import base64

	def load_model():
	    """Create the UI-TARS image-text-to-text pipeline.

	    Prints a progress notice, then builds a ``transformers`` pipeline for
	    the ``ByteDance-Seed/UI-TARS-1.5-7B`` checkpoint with
	    ``device_map="auto"`` and returns it.
	    """
	    model_id = "ByteDance-Seed/UI-TARS-1.5-7B"
	    print("Loading UI‑TARS 1.5‑7B… this may take a while the first time.")
	    # device_map="auto" lets the library decide where to place the weights.
	    tars_pipeline = pipeline(
	        "image-text-to-text",
	        model=model_id,
	        device_map="auto",
	    )
	    return tars_pipeline


	# Build the pipeline once when the module is imported; answer_question()
	# reuses this module-level object for every request.
	pipe = load_model()


	def _build_prompt(question: str) -> str:
	    """Return the UI-TARS GUI-agent prompt ending with the user's instruction.

	    The box markers are written as ``<|box_start|>`` / ``<|box_end|>``
	    without backslashes: ``\\|`` is not a valid Python escape and would leave
	    a literal backslash inside the marker text sent to the model.
	    """
	    instructions = (
	        "You are a GUI agent. You are given a task and your action history, "
	        "with screenshots. You need to perform the next action to complete "
	        "the task. \n\n"
	        "## Output Format\n```\nThought: ...\nAction: ...\n```\n\n"
	        "## Action Space\n\n"
	        "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
	        "left_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
	        "right_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
	        "drag(start_box='<|box_start|>(x1, y1)<|box_end|>', "
	        "end_box='<|box_start|>(x3, y3)<|box_end|>')\n"
	        "hotkey(key='')\n"
	        "type(content='') #If you want to submit your input, use \"\\n\" "
	        "at the end of `content`.\n"
	        "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', "
	        "direction='down or up or right or left')\n"
	        "wait() #Sleep for 5s and take a screenshot to check for any changes.\n"
	        "finished(content='xxx') # Use escape characters \\', \\\", and \\n "
	        "in content part to ensure we can parse the content in normal "
	        "python string format.\n"
	        "\n\n"
	        "## Note\n"
	        "- Use Chinese in `Thought` part.\n"
	        "- Write a small plan and finally summarize your next action "
	        "(with its target element) in one sentence in `Thought` part.\n\n"
	        "## User Instruction\n"
	    )
	    return instructions + question.strip()


	def _extract_generated_text(outputs: Any) -> str:
	    """Pull the generated answer out of whatever the pipeline returned."""
	    # A non-empty list is unwrapped to its first element; anything else
	    # (including an empty list) is inspected as-is instead of raising.
	    first = outputs[0] if isinstance(outputs, list) and outputs else outputs
	    if not (isinstance(first, dict) and "generated_text" in first):
	        return str(first)

	    generated = first["generated_text"]
	    if isinstance(generated, list) and generated:
	        # If a chat transcript comes back instead of a plain string, use the
	        # content of its last message (the model's reply).
	        last = generated[-1]
	        generated = last.get("content", "") if isinstance(last, dict) else last
	    return str(generated).strip()


	def answer_question(image: Image.Image, question: str) -> str:
	    """Run UI-TARS on an image and an instruction and return the model's reply.

	    Args:
	        image: The screenshot/picture as a PIL image, or ``None`` when the
	            user has not provided one.
	        question: The user's instruction; ``None`` or blank text is treated
	            as missing.

	    Returns:
	        The generated text with surrounding whitespace stripped, or a short
	        help message when either input is missing.
	    """
	    if image is None or not (question or "").strip():
	        return "Please supply both an image and a question."

	    # The PIL image is passed directly in the chat message. The previous
	    # base64 of ``image.tobytes()`` encoded raw pixel data rather than an
	    # image file, and a bare string under "image_url" is not a valid entry.
	    messages: List[Dict[str, Any]] = [
	        {
	            "role": "user",
	            "content": [{"type": "text", "text": _build_prompt(question)}],
	        },
	        {
	            "role": "user",
	            "content": [{"type": "image", "image": image}],
	        },
	    ]

	    # return_full_text=False asks for only the newly generated answer
	    # instead of the input conversation plus the answer.
	    outputs = pipe(text=messages, return_full_text=False)
	    return _extract_generated_text(outputs)


	# Input widgets: the picture to analyse and the user's question about it.
	_image_input = gr.Image(type="pil", label="Upload image")
	_question_input = gr.Textbox(
	    label="Ask a question about the image",
	    placeholder="e.g. What animal is on the candy?",
	)
	# Output widget showing the text returned by answer_question().
	_answer_output = gr.Textbox(label="UI‑TARS answer")

	# One sample (image URL, question) pair offered in the UI.
	_sample_rows = [
	    [
	        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
	        "What animal is on the candy?",
	    ],
	]

	# Gradio interface wiring the two inputs to answer_question(); example
	# caching is enabled and flagging is turned off.
	demo = gr.Interface(
	    fn=answer_question,
	    inputs=[_image_input, _question_input],
	    outputs=_answer_output,
	    title="UI‑TARS 1.5‑7B – Visual Q&A",
	    description=(
	        "Upload an image and ask a question. The UI‑TARS 1.5‑7B model will "
	        "answer based on the visual content. Runs completely on‑device in this Space."
	    ),
	    examples=_sample_rows,
	    cache_examples=True,
	    allow_flagging="never",
	)


	if __name__ == "__main__":
	    # Start the Gradio server only when this file is executed as a script
	    # (e.g. `python app.py`); merely importing the module does not launch it.
	    demo.launch()