Stremly committed on
Commit
9a0d6b1
·
verified ·
1 Parent(s): 15c1569

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -63
app.py CHANGED
@@ -62,72 +62,72 @@ def navigate(screenshot, task: str, platform: str, history):
62
 
63
 
64
  # ───────────────────── normalise history input ──────────────────────────
65
- try:
66
- messages=[]
67
-
68
- if isinstance(history, str):
69
- try:
70
- messages= ast.literal_eval(history)
71
- except Exception as exc:
72
- raise ValueError("`history` must be a JSON/Python list: " + str(exc))
73
- else:
74
- messages = history
75
-
76
- prompt_header = (
77
- "You are a GUI agent. You are given a task and your action history, with screenshots."
78
- "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
79
- f"## User Instruction\n{task}"
80
- )
81
- current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
82
-
83
- messages.append(current)
84
-
85
-
86
- # ─────────────────────────── model forward ─────────────────────────────
87
-
88
- images, videos = process_vision_info(messages)
89
- i=0
90
- for message in messages:
91
- if message['role'] == 'user' and isinstance(message.get('content'), list):
92
- for item in message['content']:
93
- if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
94
- item['image_url'] = images[i]
95
- i+=1
96
-
97
- text = processor.apply_chat_template(
98
- messages, tokenize=False, add_generation_prompt=True
99
  )
100
- print("\nimages\n:",images)
101
- print("\ntext\n",text)
102
- print("\nmessages\n",messages)
103
- inputs = processor(
104
- text=[text],
105
- images=images,
106
- videos=videos,
107
- padding=True,
108
- return_tensors="pt",
109
- ).to("cuda")
110
-
111
- generated = model.generate(**inputs, max_new_tokens=128)
112
- trimmed = [
113
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
114
- ]
115
- raw_out = processor.batch_decode(
116
- trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
117
- )[0]
118
 
119
- # ─────── draw predicted click for quick visual verification (optional) ──────
120
- try:
121
- actions = ast.literal_eval(raw_out)
122
- for act in actions if isinstance(actions, list) else [actions]:
123
- pos = act.get("position")
124
- if pos and isinstance(pos, list) and len(pos) == 2:
125
- screenshot = draw_point(screenshot, pos)
126
- except Exception:
127
- # decoding failed → just return original screenshot
128
- pass
129
 
130
- return screenshot, raw_out, messages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # ────────────────────────── Gradio interface ───────────────────────────────
133
 
 
62
 
63
 
64
  # ───────────────────── normalise history input ──────────────────────────
65
+
66
+ messages=[]
67
+
68
+ if isinstance(history, str):
69
+ try:
70
+ messages= ast.literal_eval(history)
71
+ except Exception as exc:
72
+ raise ValueError("`history` must be a JSON/Python list: " + str(exc))
73
+ else:
74
+ messages = history
75
+
76
+ prompt_header = (
77
+ "You are a GUI agent. You are given a task and your action history, with screenshots."
78
+ "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
79
+ f"## User Instruction\n{task}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  )
81
+ current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
82
+
83
+ messages.append(current)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+
86
+ # ─────────────────────────── model forward ─────────────────────────────
 
 
 
 
 
 
 
 
87
 
88
+ images, videos = process_vision_info(messages)
89
+ i=0
90
+ for message in messages:
91
+ if message['role'] == 'user' and isinstance(message.get('content'), list):
92
+ for item in message['content']:
93
+ if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
94
+ item['image_url'] = images[i]
95
+ i+=1
96
+
97
+ text = processor.apply_chat_template(
98
+ messages, tokenize=False, add_generation_prompt=True
99
+ )
100
+ print("\nimages\n:",images)
101
+ print("\ntext\n",text)
102
+ print("\nmessages\n",messages)
103
+ inputs = processor(
104
+ text=[text],
105
+ images=images,
106
+ videos=videos,
107
+ padding=True,
108
+ return_tensors="pt",
109
+ ).to("cuda")
110
+
111
+ generated = model.generate(**inputs, max_new_tokens=128)
112
+ trimmed = [
113
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
114
+ ]
115
+ raw_out = processor.batch_decode(
116
+ trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
117
+ )[0]
118
+
119
+ # ─────── draw predicted click for quick visual verification (optional) ──────
120
+ try:
121
+ actions = ast.literal_eval(raw_out)
122
+ for act in actions if isinstance(actions, list) else [actions]:
123
+ pos = act.get("position")
124
+ if pos and isinstance(pos, list) and len(pos) == 2:
125
+ screenshot = draw_point(screenshot, pos)
126
+ except Exception:
127
+ # decoding failed → just return original screenshot
128
+ pass
129
+
130
+ return screenshot, raw_out, messages
131
 
132
  # ────────────────────────── Gradio interface ───────────────────────────────
133