Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -22,25 +22,6 @@ processor = AutoProcessor.from_pretrained(
|
|
22 |
use_fast=True,
|
23 |
)
|
24 |
|
25 |
-
def load_base64_to_pil(base64_string):
|
26 |
-
"""
|
27 |
-
Loads a Base64 encoded image string into a PIL Image object.
|
28 |
-
|
29 |
-
Args:
|
30 |
-
base64_string (str): The Base64 encoded string of the image.
|
31 |
-
|
32 |
-
Returns:
|
33 |
-
PIL.Image.Image: The loaded PIL Image object.
|
34 |
-
"""
|
35 |
-
# If the Base64 string includes a data URI prefix (e.g., "data:image/png;base64,"),
|
36 |
-
# you should strip it before decoding.
|
37 |
-
if ',' in base64_string:
|
38 |
-
base64_string = base64_string.split(',')[1]
|
39 |
-
|
40 |
-
decoded_bytes = base64.b64decode(base64_string)
|
41 |
-
image_stream = BytesIO(decoded_bytes)
|
42 |
-
pil_image = Image.open(image_stream)
|
43 |
-
return pil_image
|
44 |
|
45 |
def draw_point(image: Image.Image, point=None, radius: int = 5):
|
46 |
"""Overlay a red dot on the screenshot where the model clicked."""
|
@@ -76,13 +57,6 @@ def navigate(screenshot, task: str, platform: str, history):
|
|
76 |
else:
|
77 |
messages = history
|
78 |
|
79 |
-
for message in messages:
|
80 |
-
if message['role'] == 'user' and isinstance(message.get('content'), list):
|
81 |
-
for item in message['content']:
|
82 |
-
if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
|
83 |
-
# This is a base64 string, convert it to a PIL image
|
84 |
-
item['image_url'] = load_base64_to_pil(item['image_url'])
|
85 |
-
|
86 |
prompt_header = (
|
87 |
"You are a GUI agent. You are given a task and your action history, with screenshots."
|
88 |
"You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
|
@@ -97,6 +71,7 @@ def navigate(screenshot, task: str, platform: str, history):
|
|
97 |
messages, tokenize=False, add_generation_prompt=True
|
98 |
)
|
99 |
images, videos = process_vision_info(messages)
|
|
|
100 |
inputs = processor(
|
101 |
text=[text],
|
102 |
images=images,
|
|
|
22 |
use_fast=True,
|
23 |
)
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def draw_point(image: Image.Image, point=None, radius: int = 5):
|
27 |
"""Overlay a red dot on the screenshot where the model clicked."""
|
|
|
57 |
else:
|
58 |
messages = history
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
prompt_header = (
|
61 |
"You are a GUI agent. You are given a task and your action history, with screenshots."
|
62 |
"You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
|
|
|
71 |
messages, tokenize=False, add_generation_prompt=True
|
72 |
)
|
73 |
images, videos = process_vision_info(messages)
|
74 |
+
print(images)
|
75 |
inputs = processor(
|
76 |
text=[text],
|
77 |
images=images,
|