Abs6187 committed on
Commit 01c1f7b · 1 Parent(s): f737f3c

Updated app.py

Files changed (1)
  1. app.py +92 -61
app.py CHANGED
@@ -1,76 +1,104 @@
+# app.py
 import spaces
+import ast
 import torch
 from PIL import Image, ImageDraw
 import gradio as gr
-import re
+import base64
+from io import BytesIO
+
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable
 
-@spaces.GPU
-def load_model_and_processor():
-    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        "ByteDance-Seed/UI-TARS-1.5-7B",
-        device_map="auto",
-        torch_dtype=torch.float16
-    )
-    processor = AutoProcessor.from_pretrained(
-        "ByteDance-Seed/UI-TARS-1.5-7B",
-        use_fast=True,
-    )
-    return model, processor
-
-model, processor = load_model_and_processor()
-
-def draw_point(image: Image.Image, point_str: str, radius: int = 10):
+_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "ByteDance-Seed/UI-TARS-1.5-7B",
+    device_map="auto",
+    torch_dtype=torch.float16
+)
+
+_PROCESSOR = AutoProcessor.from_pretrained(
+    "ByteDance-Seed/UI-TARS-1.5-7B",
+    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},  # sane res
+    use_fast=True,
+)
+
+model = _MODEL
+processor = _PROCESSOR
+
+
+def draw_point(image: Image.Image, point=None, radius: int = 5):
+    """Overlay a red dot on the screenshot where the model clicked."""
     img = image.copy()
-    try:
-        coord_regex = r'click\(.*?<\|box_start\|>\s*\((\s*[\d.]+)\s*,\s*([\d.]+)\s*\).*?\)'
-        match = re.search(coord_regex, point_str)
-
-        if match:
-            x_norm, y_norm = float(match.group(1)), float(match.group(2))
-            x = x_norm * img.width
-            y = y_norm * img.height
-
-            draw = ImageDraw.Draw(img)
-            draw.ellipse(
-                (x - radius, y - radius, x + radius, y + radius),
-                fill="red",
-                outline="white",
-                width=2
-            )
-    except Exception:
-        pass
+    if point:
+        x, y = point[0] * img.width, point[1] * img.height
+        ImageDraw.Draw(img).ellipse(
+            (x - radius, y - radius, x + radius, y + radius), fill="red"
+        )
     return img
 
+
 @spaces.GPU
-def navigate(screenshot: Image.Image, task: str):
-    if not screenshot or not task:
-        raise gr.Error("Please provide both a screenshot and a task.")
+def navigate(screenshot, task: str):
+    """Run one inference step on the GUI-reasoning model.
+
+    Args:
+        screenshot (PIL.Image): Latest UI screenshot.
+        task (str): Natural-language task description
+        history (list | str | None): Previous messages list. Accepts either an
+            actual Python list (via gr.JSON) or a JSON/Python-literal string.
+    """
+
+    # ───────────────────── normalise history input ──────────────────────────
+
+    messages = []
 
     prompt_header = (
         "You are a GUI agent. You are given a task and your action history, with screenshots."
         "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|<(x1, y1)>|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. Always use 'win' instead of 'meta' key\n\n"
         f"## User Instruction\n{task}"
     )
-
-    messages = [{"role": "user", "content": [{"type": "text", "text": prompt_header}, {"type": "image", "image": screenshot}]}]
-
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    inputs = processor(text=[text], images=[screenshot], return_tensors="pt").to(model.device)
-
-    generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
-
-    response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
+    current = {"role": "user", "content": [{"type": "text", "text": prompt_header}, {"type": "image_url", "image_url": screenshot}]}
+
+    messages.append(current)
+
+    #New Comment 1
+    # ─────────────────────────── model forward ─────────────────────────────
+
+    images, videos = process_vision_info(messages)
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(
+        text=[text],
+        images=images,
+        videos=videos,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+
+    generated = model.generate(**inputs, max_new_tokens=128)
+    trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
+    ]
+    raw_out = processor.batch_decode(
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+
+    # ─────── draw predicted click for quick visual verification (optional) ──────
     try:
-        action_text = response.split('[/INST]')[-1].strip()
-    except IndexError:
-        action_text = response
-
-    output_image = draw_point(screenshot, action_text)
-
-    return output_image, action_text
+        actions = ast.literal_eval(raw_out)
+        for act in actions if isinstance(actions, list) else [actions]:
+            pos = act.get("position")
+            if pos and isinstance(pos, list) and len(pos) == 2:
+                screenshot = draw_point(screenshot, pos)
+    except Exception:
+        # decoding failed → just return original screenshot
+        pass
+
+    return screenshot, raw_out, messages
 
+# ────────────────────────── Gradio interface ───────────────────────────────
+
 demo = gr.Interface(
     fn=navigate,
@@ -84,12 +112,15 @@ demo = gr.Interface(
     ],
     outputs=[
         gr.Image(label="With Click Point"),
-        gr.Textbox(label="Raw Action Output"),
+        gr.Textbox(label="Raw Action JSON"),
+        gr.JSON(label="Updated Conversation History")
     ],
     title="UI-Tars Navigation Demo",
-    description="Upload a UI screenshot, describe a task, and see the AI-predicted next action. This model helps automate GUI interactions.",
-    allow_flagging="never",
 )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch(
+    server_name="0.0.0.0",
+    server_port=7860,
+    share=False,     # or True if you need a public link
+    ssr_mode=False,  # turn off experimental SSR so the process blocks
+)
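
For reference, the prompt above still instructs the model to answer in the textual action format, click(start_box='<|box_start|>(x1, y1)<|box_end|>'), which the removed draw_point parsed with a regex and scaled as normalized fractions of the image size, while the new code expects ast.literal_eval to yield dicts with a "position" list. Below is a minimal sketch of a parser for that textual format under those assumptions; parse_click_point is a hypothetical helper that is not part of this commit, and whether the emitted coordinates are normalized or absolute pixels depends on the checkpoint.

import re
from typing import Optional, Tuple

# Hypothetical helper (not in the commit): recover the first click coordinate from
# the textual action string defined in the prompt, e.g.
# "Action: click(start_box='<|box_start|>(0.32, 0.57)<|box_end|>')".
_CLICK_RE = re.compile(
    r"click\(start_box='(?:<\|box_start\|>)?\(\s*([\d.]+)\s*,\s*([\d.]+)\s*\)"
)

def parse_click_point(action_text: str) -> Optional[Tuple[float, float]]:
    """Return (x, y) from a click(...) action, or None when no click is present."""
    match = _CLICK_RE.search(action_text)
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))

if __name__ == "__main__":
    sample = "Thought: ...\nAction: click(start_box='<|box_start|>(0.32, 0.57)<|box_end|>')"
    print(parse_click_point(sample))  # (0.32, 0.57)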