Spaces:

Stremly
/

uitars

Running

App Files Community

Stremly committed on Jun 27

Commit

49fd159

verified ·

1 Parent(s): 1c7800e

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -26

app.py CHANGED Viewed

@@ -22,25 +22,6 @@ processor = AutoProcessor.from_pretrained(
     use_fast=True,
 )
-def load_base64_to_pil(base64_string):
-    """
-    Loads a Base64 encoded image string into a PIL Image object.
-    Args:
-        base64_string (str): The Base64 encoded string of the image.
-    Returns:
-        PIL.Image.Image: The loaded PIL Image object.
-    """
-    # If the Base64 string includes a data URI prefix (e.g., "data:image/png;base64,"),
-    # you should strip it before decoding.
-    if ',' in base64_string:
-        base64_string = base64_string.split(',')[1]
-    decoded_bytes = base64.b64decode(base64_string)
-    image_stream = BytesIO(decoded_bytes)
-    pil_image = Image.open(image_stream)
-    return pil_image
 def draw_point(image: Image.Image, point=None, radius: int = 5):
     """Overlay a red dot on the screenshot where the model clicked."""
@@ -76,13 +57,6 @@ def navigate(screenshot, task: str, platform: str, history):
     else:
         messages = history
-    for message in messages:
-        if message['role'] == 'user' and isinstance(message.get('content'), list):
-            for item in message['content']:
-                if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
-                    # This is a base64 string, convert it to a PIL image
-                    item['image_url'] = load_base64_to_pil(item['image_url'])
     prompt_header = (
             "You are a GUI agent. You are given a task and your action history, with screenshots."
             "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
@@ -97,6 +71,7 @@ def navigate(screenshot, task: str, platform: str, history):
         messages, tokenize=False, add_generation_prompt=True
     )
     images, videos = process_vision_info(messages)
     inputs = processor(
         text=[text],
         images=images,

     use_fast=True,
 )
 def draw_point(image: Image.Image, point=None, radius: int = 5):
     """Overlay a red dot on the screenshot where the model clicked."""
     else:
         messages = history
     prompt_header = (
             "You are a GUI agent. You are given a task and your action history, with screenshots."
             "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
         messages, tokenize=False, add_generation_prompt=True
     )
     images, videos = process_vision_info(messages)
+    print(images)
     inputs = processor(
         text=[text],
         images=images,