Mungert committed on
Commit fba038c · verified · 1 Parent(s): 9942ab7

Update app.py

Files changed (1)
  1. app.py +126 -85
app.py CHANGED
@@ -1,16 +1,24 @@
- import base64, os
- # import spaces
- import json
+ import base64, os, json
+ from typing import Optional
+
  import torch
  import gradio as gr
- from typing import Optional
- from PIL import Image, ImageDraw
  import numpy as np
  import matplotlib.pyplot as plt
- from qwen_vl_utils import process_vision_info
- from datasets import load_dataset
+ from PIL import Image, ImageDraw
+
+ # ---- Hugging Face Spaces GPU decorator (safe fallback when not on Spaces) ----
+ try:
+     import spaces
+     GPU_DECORATOR = spaces.GPU
+ except Exception:
+     def GPU_DECORATOR(fn):  # no-op locally
+         return fn
+
+ from qwen_vl_utils import process_vision_info  # noqa: F401 (kept for parity if used elsewhere)
+ from datasets import load_dataset  # noqa: F401
  from transformers import AutoProcessor
+ from gui_actor.constants import chat_template  # noqa: F401
  from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
  from gui_actor.inference import inference

@@ -24,7 +32,6 @@ def resize_image(image, resize_to_pixels=MAX_PIXELS):
      image = image.resize((image_width_resized, image_height_resized))
      return image

- # @spaces.GPU
  @torch.inference_mode()
  def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
      overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
@@ -33,79 +40,91 @@ def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)
      overlay_draw.ellipse(
          [(x - radius, y - radius), (x + radius, y + radius)],
          outline=color,
-         width=5 # Adjust thickness as needed
+         width=5
      )
      image = image.convert('RGBA')
      combined = Image.alpha_composite(image, overlay)
      combined = combined.convert('RGB')
      return combined

- # @spaces.GPU
  @torch.inference_mode()
  def get_attn_map(image, attn_scores, n_width, n_height):
      w, h = image.size
      scores = np.array(attn_scores[0]).reshape(n_height, n_width)
-
-     scores_norm = (scores - scores.min()) / (scores.max() - scores.min())
-     # Resize score map to match image size
-     score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST) # BILINEAR)
-     # Apply colormap
+     scores_norm = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
+     score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST)
      colormap = plt.get_cmap('jet')
-     colored_score_map = colormap(np.array(score_map) / 255.0) # returns RGBA
-     colored_score_map = (colored_score_map[:, :, :3] * 255).astype(np.uint8)
-     colored_overlay = Image.fromarray(colored_score_map)
-
-     # Blend with original image
+     colored_score_map = colormap(np.array(score_map) / 255.0)[:, :, :3]
+     colored_overlay = Image.fromarray((colored_score_map * 255).astype(np.uint8))
      blended = Image.blend(image, colored_overlay, alpha=0.3)
      return blended

- # load model
- if torch.cuda.is_available():
-     # os.system('pip install flash-attn --no-build-isolation')
-     model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
-     data_processor = AutoProcessor.from_pretrained(model_name_or_path)
-     tokenizer = data_processor.tokenizer
-     model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
-         model_name_or_path,
-         torch_dtype=torch.bfloat16,
-         device_map="cuda:0",
-         attn_implementation="flash_attention_2"
-     ).eval()
- else:
+ # ----------------------------
+ # Model/device init for Spaces
+ # ----------------------------
+ def _pick_gpu_dtype() -> torch.dtype:
+     if not torch.cuda.is_available():
+         return torch.float32
+     major, minor = torch.cuda.get_device_capability()
+     # Ampere (8.x) / Hopper (9.x) support bf16 well
+     return torch.bfloat16 if major >= 8 else torch.float16
+
+ # Global holders initialized in load_model()
+ model = None
+ tokenizer = None
+ data_processor = None
+
+ @GPU_DECORATOR  # <-- This is what Spaces looks for at startup
+ def load_model():
+     """
+     Allocates the GPU on Spaces and loads the model on the right device/dtype.
+     Runs once at startup.
+     """
+     global model, tokenizer, data_processor
+
      model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
+
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     dtype = _pick_gpu_dtype()
+
+     # Enable some healthy defaults on GPU
+     if device.startswith("cuda"):
+         torch.backends.cuda.matmul.allow_tf32 = True
+     torch.set_grad_enabled(False)
+
      data_processor = AutoProcessor.from_pretrained(model_name_or_path)
      tokenizer = data_processor.tokenizer
-     model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
+
+     # Use SDPA attention to avoid flash-attn dependency
+     attn_impl = "sdpa"
+
+     model_local = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
          model_name_or_path,
-         torch_dtype=torch.bfloat16,
-         device_map="cpu"
+         torch_dtype=dtype,
+         attn_implementation=attn_impl,
      ).eval()

- title = "GUI-Actor"
- header = """
- <div align="center">
- <h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
- <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
- Qianhui Wu*, Kanzhi Cheng*, Rui Yang*, Chaoyun Zhang, Jianwei Yang, Huiqiang Jiang, Jian Mu, Baolin Peng, Bo Qiao, Reuben Tan, Si Qin, Lars Liden<br>
- Qingwei Lin, Huan Zhang, Tong Zhang, Jianbing Zhang, Dongmei Zhang, Jianfeng Gao<br/>
- </div>
- <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
- <a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
- </div>
- </div>
- """
+     # Move to device explicitly (avoid accelerate unless you need sharding)
+     model_local.to(device)

- theme = "soft"
- css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
- #anno-img .mask.active {opacity: 0.7}"""
+     model = model_local
+     return f"Loaded {model_name_or_path} on {device} with dtype={dtype} (attn={attn_impl})"
+
+ # Trigger model loading on import so Spaces allocates GPU immediately
+ _ = load_model()

- # @spaces.GPU
+ @GPU_DECORATOR
  @torch.inference_mode()
  def process(image, instruction):
-     # resize image
+     # Safety: ensure model is loaded
+     if model is None:
+         _ = load_model()
+
+     # Resize if needed
      w, h = image.size
      if w * h > MAX_PIXELS:
          image = resize_image(image)
+         w, h = image.size

      conversation = [
          {
@@ -113,32 +132,39 @@ def process(image, instruction):
              "content": [
                  {
                      "type": "text",
-                     "text": "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>).",
+                     "text": (
+                         "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
+                         "your task is to locate the screen element that corresponds to the instruction. "
+                         "Output a PyAutoGUI action with a special token that points to the correct location."
+                     ),
                  }
-             ]
+             ],
          },
          {
              "role": "user",
              "content": [
-                 {
-                     "type": "image",
-                     "image": image, # PIL.Image.Image or str to path
-                     # "image_url": "https://xxxxx.png" or "https://xxxxx.jpg" or "file://xxxxx.png" or "data:image/png;base64,xxxxxxxx", will be split by "base64,"
-                 },
-                 {
-                     "type": "text",
-                     "text": instruction,
-                 },
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": instruction},
              ],
          },
      ]

+     device = next(model.parameters()).device
+
      try:
-         pred = inference(conversation, model, tokenizer, data_processor, use_placeholder=True, topk=3)
+         pred = inference(
+             conversation,
+             model,
+             tokenizer,
+             data_processor,
+             use_placeholder=True,
+             topk=3,
+             device=str(device),
+         )
      except Exception as e:
-         print(e)
+         print("inference error:", e)
          return image, f"Error: {e}", None
-
+
      px, py = pred["topk_points"][0]
      output_coord = f"({px:.4f}, {py:.4f})"
      img_with_point = draw_point(image, (px * w, py * h))
@@ -146,20 +172,37 @@
      n_width, n_height = pred["n_width"], pred["n_height"]
      attn_scores = pred["attn_scores"]
      att_map = get_attn_map(image, attn_scores, n_width, n_height)
-
+
      return img_with_point, output_coord, att_map


- with gr.Blocks(title=title, css=css) as demo:
+ # ----------------------------
+ # Gradio UI
+ # ----------------------------
+ title = "GUI-Actor"
+ header = """
+ <div align="center">
+ <h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
+ <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
+ Qianhui Wu*, Kanzhi Cheng*, Rui Yang*, Chaoyun Zhang, Jianwei Yang, Huiqiang Jiang, Jian Mu, Baolin Peng, Bo Qiao, Reuben Tan, Si Qin, Lars Liden<br>
+ Qingwei Lin, Huan Zhang, Tong Zhang, Jianbing Zhang, Dongmei Zhang, Jianfeng Gao<br/>
+ </div>
+ <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
+ <a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
+ </div>
+ </div>
+ """
+ theme = "soft"
+ css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
+ #anno-img .mask.active {opacity: 0.7}"""
+
+ with gr.Blocks(title=title, css=css, theme=theme) as demo:
      gr.Markdown(header)
      with gr.Row():
          with gr.Column():
-             input_image = gr.Image(
-                 type='pil', label='Upload image')
-             # text box
-             input_instruction = gr.Textbox(label='Instruction', placeholder='Text your (low-level) instruction here')
-             submit_button = gr.Button(
-                 value='Submit', variant='primary')
+             input_image = gr.Image(type='pil', label='Upload image')
+             input_instruction = gr.Textbox(label='Instruction', placeholder='Type your (low-level) instruction here')
+             submit_button = gr.Button(value='Submit', variant='primary')
          with gr.Column():
              image_with_point = gr.Image(type='pil', label='Image with Point (red circle)')
              with gr.Accordion('Detailed prediction'):
@@ -168,13 +211,11 @@ with gr.Blocks(title=title, css=css) as demo:

      submit_button.click(
          fn=process,
-         inputs=[
-             input_image,
-             input_instruction
-         ],
-         outputs=[image_with_point, pred_xy, att_map]
+         inputs=[input_image, input_instruction],
+         outputs=[image_with_point, pred_xy, att_map],
+         queue=True,
+         api_name="predict",
      )

- # demo.launch(debug=False, show_error=True, share=True)
- # demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
- demo.queue().launch(share=False)
+ # On Spaces, queue is required to get GPU scheduling; set a modest concurrency
+ demo.queue(concurrency_count=1, max_size=8).launch(share=False)
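The try/except import at the top of the new app.py makes the Spaces GPU decorator optional. A minimal standalone sketch of that pattern (not part of the commit; the add() function is only an illustration) shows that a decorated function behaves identically whether or not the spaces package is installed:

try:
    import spaces
    GPU_DECORATOR = spaces.GPU
except Exception:
    def GPU_DECORATOR(fn):  # no-op outside Hugging Face Spaces
        return fn

@GPU_DECORATOR
def add(a: int, b: int) -> int:
    return a + b

assert add(2, 3) == 5  # same result locally and on Spaces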
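Because the click handler is registered with api_name="predict", the running Space also exposes a named API endpoint. A hedged client-side sketch (the Space id and screenshot path are placeholders; older gradio_client releases accept a plain file path instead of handle_file):

from gradio_client import Client, handle_file

client = Client("<owner>/<space-name>")  # placeholder Space id
img_with_point, coords, attn_map = client.predict(
    handle_file("screenshot.png"),   # input_image
    "Click the Submit button",       # input_instruction
    api_name="/predict",
)
print(coords)  # e.g. "(0.4123, 0.6789)"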