Stremly committed on
Commit
49fd159
·
verified ·
1 Parent(s): 1c7800e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -26
app.py CHANGED
@@ -22,25 +22,6 @@ processor = AutoProcessor.from_pretrained(
22
  use_fast=True,
23
  )
24
 
25
- def load_base64_to_pil(base64_string):
26
- """
27
- Loads a Base64 encoded image string into a PIL Image object.
28
-
29
- Args:
30
- base64_string (str): The Base64 encoded string of the image.
31
-
32
- Returns:
33
- PIL.Image.Image: The loaded PIL Image object.
34
- """
35
- # If the Base64 string includes a data URI prefix (e.g., "data:image/png;base64,"),
36
- # you should strip it before decoding.
37
- if ',' in base64_string:
38
- base64_string = base64_string.split(',')[1]
39
-
40
- decoded_bytes = base64.b64decode(base64_string)
41
- image_stream = BytesIO(decoded_bytes)
42
- pil_image = Image.open(image_stream)
43
- return pil_image
44
 
45
  def draw_point(image: Image.Image, point=None, radius: int = 5):
46
  """Overlay a red dot on the screenshot where the model clicked."""
@@ -76,13 +57,6 @@ def navigate(screenshot, task: str, platform: str, history):
76
  else:
77
  messages = history
78
 
79
- for message in messages:
80
- if message['role'] == 'user' and isinstance(message.get('content'), list):
81
- for item in message['content']:
82
- if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
83
- # This is a base64 string, convert it to a PIL image
84
- item['image_url'] = load_base64_to_pil(item['image_url'])
85
-
86
  prompt_header = (
87
  "You are a GUI agent. You are given a task and your action history, with screenshots."
88
  "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
@@ -97,6 +71,7 @@ def navigate(screenshot, task: str, platform: str, history):
97
  messages, tokenize=False, add_generation_prompt=True
98
  )
99
  images, videos = process_vision_info(messages)
 
100
  inputs = processor(
101
  text=[text],
102
  images=images,
 
22
  use_fast=True,
23
  )
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def draw_point(image: Image.Image, point=None, radius: int = 5):
27
  """Overlay a red dot on the screenshot where the model clicked."""
 
57
  else:
58
  messages = history
59
 
 
 
 
 
 
 
 
60
  prompt_header = (
61
  "You are a GUI agent. You are given a task and your action history, with screenshots."
62
  "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
 
71
  messages, tokenize=False, add_generation_prompt=True
72
  )
73
  images, videos = process_vision_info(messages)
74
+ print(images)
75
  inputs = processor(
76
  text=[text],
77
  images=images,