Abs6187 committed on
Commit bd4d774 · 1 Parent(s): 01c1f7b

Updated app.py

Files changed (1)
  1. app.py  +61 -82
app.py CHANGED
@@ -1,74 +1,58 @@
-# app.py
-import spaces
 import ast
-import torch
-from PIL import Image, ImageDraw
-import gradio as gr
 import base64
 from io import BytesIO
 
+import spaces
+import torch
+from PIL import Image, ImageDraw
+import gradio as gr
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable
+from qwen_vl_utils import process_vision_info  # keep this file in repo
 
-_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+# ─── Model & Processor ──────────────────────────────────────────────────────
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
     device_map="auto",
     torch_dtype=torch.float16
 )
-
-_PROCESSOR = AutoProcessor.from_pretrained(
+processor = AutoProcessor.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
-    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},  # sane res
+    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
     use_fast=True,
 )
 
-model = _MODEL
-processor = _PROCESSOR
-
-
-def draw_point(image: Image.Image, point=None, radius: int = 5):
-    """Overlay a red dot on the screenshot where the model clicked."""
-    img = image.copy()
-    if point:
-        x, y = point[0] * img.width, point[1] * img.height
-        ImageDraw.Draw(img).ellipse(
-            (x - radius, y - radius, x + radius, y + radius), fill="red"
-        )
+# ─── Helpers ────────────────────────────────────────────────────────────────
+def draw_point(img: Image.Image, pos=None, r: int = 5):
+    if pos:
+        x, y = pos[0] * img.width, pos[1] * img.height
+        ImageDraw.Draw(img).ellipse((x - r, y - r, x + r, y + r), fill="red")
     return img
 
-
+# ─── Core Inference ─────────────────────────────────────────────────────────
 @spaces.GPU
 def navigate(screenshot, task: str):
-    """Run one inference step on the GUI-reasoning model.
-
-    Args:
-        screenshot (PIL.Image): Latest UI screenshot.
-        task (str): Natural-language task description
-        history (list | str | None): Previous messages list. Accepts either an
-            actual Python list (via gr.JSON) or a JSON/Python-literal string.
-    """
-
-
-    # ───────────────────── normalise history input ──────────────────────────
-
-    messages=[]
-
     prompt_header = (
-        "You are a GUI agent. You are given a task and your action history, with screenshots."
-        "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. Always use 'win' instead of 'meta' key\n\n"
-        f"## User Instruction\n{task}"
-    )
-    current = {"role": "user", "content": [{"type": "text", "text": prompt_header}, {"type": "image_url", "image_url": screenshot}]}
-
-    messages.append(current)
+        "You are an expert GUI agent.\n"
+        "Given the task and previous screenshots, decide only the next action.\n\n"
+        "=== TASK ===\n"
+        f"{task}\n"
+        "==============\n\n"
+        "Respond strictly as JSON: {\"Thought\": \"…\", \"Action\": {…}}\n"
+        "Valid actions: click, left_double, right_single, drag, hotkey, type, scroll, wait, finished.\n"
+        "Use English for Thought. Summarize your plan in one sentence."
+    )
 
-    # New Comment 1
-    # ─────────────────────────── model forward ─────────────────────────────
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": prompt_header},
+            {"type": "image_url", "image_url": screenshot}
+        ]
+    }]
 
     images, videos = process_vision_info(messages)
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
     inputs = processor(
         text=[text],
         images=images,
@@ -77,50 +61,45 @@ def navigate(screenshot, task: str):
         return_tensors="pt",
     ).to("cuda")
 
-    generated = model.generate(**inputs, max_new_tokens=128)
-    trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)
-    ]
+    gen_ids = model.generate(**inputs, max_new_tokens=128)
     raw_out = processor.batch_decode(
-        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        [out[len(inp):] for inp, out in zip(inputs.input_ids, gen_ids)],
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
     )[0]
 
-    # ─────── draw predicted click for quick visual verification (optional) ──────
    try:
-        actions = ast.literal_eval(raw_out)
-        for act in actions if isinstance(actions, list) else [actions]:
-            pos = act.get("position")
-            if pos and isinstance(pos, list) and len(pos) == 2:
-                screenshot = draw_point(screenshot, pos)
+        for act in ast.literal_eval(raw_out) if isinstance(ast.literal_eval(raw_out), list) else [ast.literal_eval(raw_out)]:
+            p = act.get("position")
+            if p and len(p) == 2:
+                screenshot = draw_point(screenshot, p)
     except Exception:
-        # decoding failed → just return original screenshot
        pass
 
     return screenshot, raw_out, messages
 
-# ────────────────────────── Gradio interface ───────────────────────────────
-
-demo = gr.Interface(
-    fn=navigate,
-    inputs=[
-        gr.Image(type="pil", label="Screenshot"),
-        gr.Textbox(
-            lines=1,
-            placeholder="e.g. Search the weather for New York",
-            label="Task",
-        )
-    ],
-    outputs=[
-        gr.Image(label="With Click Point"),
-        gr.Textbox(label="Raw Action JSON"),
-        gr.JSON(label="Updated Conversation History")
-    ],
-    title="UI-Tars Navigation Demo",
-)
+# ─── UI ─────────────────────────────────────────────────────────────────────
+with gr.Blocks(title="UI-Tars Navigation Demo", theme=gr.themes.Soft()) as demo:
+    with gr.Row():
+        with gr.Column(scale=3):
+            in_img = gr.Image(type="pil", height=380, show_label=False)
+            in_task = gr.Textbox(
+                placeholder="Describe what you want to do (e.g. “Open Gmail and compose a message”)",
+                lines=1,
+                label="Task"
+            )
+            run_btn = gr.Button("Run", variant="primary")
+        with gr.Column(scale=4):
+            out_img = gr.Image(label="With Click Point", height=380)
+            out_raw = gr.Textbox(label="Raw Action JSON", interactive=False)
+            with gr.Accordion("Conversation History", open=False):
+                out_hist = gr.JSON()
+
+    run_btn.click(navigate, inputs=[in_img, in_task], outputs=[out_img, out_raw, out_hist])
 
 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
-    share=False,  # or True if you need a public link
-    ssr_mode=False,  # turn off experimental SSR so the process blocks
+    share=False,
+    ssr_mode=False,
 )
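
For quick local testing of the updated handler, the sketch below shows one way a caller might interpret the raw model output returned by navigate. This is an illustrative assumption, not part of the commit: the new prompt asks for a JSON object of the form {"Thought": ..., "Action": {...}}, while draw_point expects a normalized (x, y) pair in [0, 1]. The helper names parse_action and to_pixels are hypothetical; the parser first tries json.loads (matching the prompt's JSON instruction) and falls back to ast.literal_eval (matching the parsing already done in app.py).

# Illustrative sketch only, not part of app.py. Assumes the model follows the new
# prompt and returns e.g. {"Thought": "...", "Action": {"type": "click", "position": [x, y]}}.
import ast
import json

def parse_action(raw_out: str) -> dict:
    """Best-effort parse of the model's raw text into a dict."""
    try:
        return json.loads(raw_out)        # strict JSON, as the new prompt requests
    except json.JSONDecodeError:
        return ast.literal_eval(raw_out)  # fallback: Python-literal style output

def to_pixels(position, width: int, height: int) -> tuple[int, int]:
    """Map a normalized (x, y) in [0, 1] to pixel coordinates, mirroring draw_point."""
    x, y = position
    return int(x * width), int(y * height)

if __name__ == "__main__":
    raw = '{"Thought": "Click the search box.", "Action": {"type": "click", "position": [0.42, 0.11]}}'
    action = parse_action(raw)["Action"]
    print(to_pixels(action["position"], 1920, 1080))  # -> (806, 118)

If the model instead emits the older "Thought: ... / Action: click(start_box=...)" string format, this sketch would need a different parser; it only covers the JSON shape requested by the updated prompt.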