Abs6187 committed
Commit 6c894b7 · 1 Parent(s): bd4d774

Updated app.py

Files changed (1)
  1. app.py +80 -53
app.py CHANGED
@@ -1,58 +1,85 @@
-import ast
-import base64
-from io import BytesIO
-
 import spaces
+import ast
 import torch
 from PIL import Image, ImageDraw
 import gradio as gr
+import base64
+from io import BytesIO
+
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info  # keep this file in repo

-# ─── Model & Processor ──────────────────────────────────────────────────────
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
     device_map="auto",
     torch_dtype=torch.float16
 )
-processor = AutoProcessor.from_pretrained(
+
+_PROCESSOR = AutoProcessor.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
     size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
     use_fast=True,
 )

-# ─── Helpers ────────────────────────────────────────────────────────────────
-def draw_point(img: Image.Image, pos=None, r: int = 5):
-    if pos:
-        x, y = pos[0] * img.width, pos[1] * img.height
-        ImageDraw.Draw(img).ellipse((x - r, y - r, x + r, y + r), fill="red")
+model = _MODEL
+processor = _PROCESSOR
+
+
+def draw_point(image: Image.Image, point=None, radius: int = 5):
+    """Overlay a red dot on the screenshot where the model clicked."""
+    img = image.copy()
+    if point:
+        x, y = point[0] * img.width, point[1] * img.height
+        ImageDraw.Draw(img).ellipse(
+            (x - radius, y - radius, x + radius, y + radius), fill="red"
+        )
     return img

-# ─── Core Inference ─────────────────────────────────────────────────────────
+
 @spaces.GPU
 def navigate(screenshot, task: str):
+    """Run one inference step on the GUI-reasoning model."""
+    messages = []
+
     prompt_header = (
-        "You are an expert GUI agent.\n"
-        "Given the task and previous screenshots, decide only the next action.\n\n"
-        "=== TASK ===\n"
-        f"{task}\n"
-        "==============\n\n"
-        "Respond strictly as JSON: {\"Thought\": \"…\", \"Action\": {…}}\n"
-        "Valid actions: click, left_double, right_single, drag, hotkey, type, scroll, wait, finished.\n"
-        "Use English for Thought. Summarize your plan in one sentence."
+        "🤖 **GUI Agent Instructions**\n\n"
+        "You're an intelligent agent solving UI tasks through:\n"
+        "1. Visual understanding of screenshots\n"
+        "2. Natural language task interpretation\n"
+        "3. Action sequence generation\n\n"
+        "## Action Reference\n"
+        "```\n"
+        "| Action Type | Syntax |\n"
+        "|--------------------|-------------------------|\n"
+        "| Click Button | click(start_box='(x,y)') |\n"
+        "| Double-Click | left_double(start_box='(x,y)') |\n"
+        "| Drag Element | drag(start_box='(x1,y1)', end_box='(x2,y2)') |\n"
+        "| Hotkey Input | hotkey(key='Ctrl+A') |\n"
+        "| Text Input | type(content='Search term') |\n"
+        "| Scroll Action | scroll(start_box='(x,y)', direction='down') |\n"
+        "| Wait & Recheck | wait() |\n"
+        "| Task Completion | finished(content='Result') |\n"
+        "```"
+        "\n**Note:**\n"
+        "1. Use 'win' key instead of 'meta' in hotkey commands\n"
+        "2. Include position coordinates in all spatial actions\n"
+        "3. Keep 'Thought' concise - max 3 sentence strategy\n"
+        f"\n**Task:**\n{task}"
     )

-    messages = [{
+    current = {
         "role": "user",
         "content": [
             {"type": "text", "text": prompt_header},
             {"type": "image_url", "image_url": screenshot}
         ]
-    }]
+    }
+    messages.append(current)

     images, videos = process_vision_info(messages)
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
     inputs = processor(
         text=[text],
         images=images,
@@ -61,45 +88,45 @@ def navigate(screenshot, task: str):
         return_tensors="pt",
     ).to("cuda")

-    gen_ids = model.generate(**inputs, max_new_tokens=128)
+    generated = model.generate(**inputs, max_new_tokens=128)
+    trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)
+    ]
     raw_out = processor.batch_decode(
-        [out[len(inp):] for inp, out in zip(inputs.input_ids, gen_ids)],
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]

     try:
-        for act in ast.literal_eval(raw_out) if isinstance(ast.literal_eval(raw_out), list) else [ast.literal_eval(raw_out)]:
-            p = act.get("position")
-            if p and len(p) == 2:
-                screenshot = draw_point(screenshot, p)
+        actions = ast.literal_eval(raw_out)
+        for act in (actions if isinstance(actions, list) else [actions]):
+            pos = act.get("position")
+            if pos and isinstance(pos, list) and len(pos) == 2:
+                screenshot = draw_point(screenshot, pos)
     except Exception:
         pass

     return screenshot, raw_out, messages

-# ─── UI ─────────────────────────────────────────────────────────────────────
-with gr.Blocks(title="UI-Tars Navigation Demo", theme=gr.themes.Soft()) as demo:
-    with gr.Row():
-        with gr.Column(scale=3):
-            in_img = gr.Image(type="pil", height=380, show_label=False)
-            in_task = gr.Textbox(
-                placeholder="Describe what you want to do (e.g. “Open Gmail and compose a message”)",
-                lines=1,
-                label="Task"
-            )
-            run_btn = gr.Button("Run", variant="primary")
-        with gr.Column(scale=4):
-            out_img = gr.Image(label="With Click Point", height=380)
-            out_raw = gr.Textbox(label="Raw Action JSON", interactive=False)
-    with gr.Accordion("Conversation History", open=False):
-        out_hist = gr.JSON()
-
-    run_btn.click(navigate, inputs=[in_img, in_task], outputs=[out_img, out_raw, out_hist])
+
+demo = gr.Blocks()
+with gr.Row():
+    gr.Image(type="pil", label="🖼️ Screenshot Input").style(width=400)
+    gr.Textbox(
+        lines=1,
+        placeholder="e.g., 'Book a flight to Paris'",
+        label="🔍 Task Description"
+    ).style(width=400)
+
+with gr.Row():
+    gr.Image(label="📍 Click Point Visualization").style(width=400)
+    gr.Textbox(label="📝 Action Response").style(width=400)
+
+with gr.Row():
+    gr.JSON(label="📜 Conversation History").style(width=800)

 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
     share=False,
-    ssr_mode=False,
+    ssr_mode=False
 )
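For reference, the added UI section creates its rows at module level rather than inside the `gr.Blocks()` context, relies on `.style(width=...)` (a Gradio 3.x method that later releases dropped), and no longer binds a button to `navigate`. Below is a minimal rewiring sketch, not part of this commit, assuming a current Gradio 4.x/5.x API and the `navigate` function defined in app.py above; component labels and sizes are illustrative.

```python
# Hypothetical sketch (not from this commit): build the layout inside the
# Blocks context, pass sizing as constructor arguments instead of .style(),
# and wire a button to the existing `navigate` function from app.py.
import gradio as gr

with gr.Blocks(title="UI-TARS Navigation Demo") as demo:
    with gr.Row():
        in_img = gr.Image(type="pil", label="Screenshot Input", height=380)
        in_task = gr.Textbox(
            lines=1,
            placeholder="e.g., 'Book a flight to Paris'",
            label="Task Description",
        )
    run_btn = gr.Button("Run", variant="primary")
    with gr.Row():
        out_img = gr.Image(label="Click Point Visualization", height=380)
        out_raw = gr.Textbox(label="Action Response")
    out_hist = gr.JSON(label="Conversation History")

    # navigate(screenshot, task) returns (annotated image, raw action text, messages),
    # so the three outputs line up with its return values.
    run_btn.click(navigate, inputs=[in_img, in_task], outputs=[out_img, out_raw, out_hist])

demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
```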