Duibonduil committed on
Commit
ea7486e
·
verified ·
1 Parent(s): fa5256f

Upload 3 files

Browse files
examples/tools/browsers/action/actions.py ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+ # Copyright (c) 2025 inclusionAI.
3
+ import os
4
+ import traceback
5
+
6
+ import asyncio
7
+ import time
8
+ from typing import Tuple, Any
9
+
10
+ from examples.tools.tool_action import BrowserAction
11
+ from aworld.core.tool.action_factory import ActionFactory
12
+ from aworld.core.common import ActionModel, ActionResult, Observation
13
+ from examples.tools.browsers.util.dom import DOMElementNode
14
+ from aworld.logs.util import logger
15
+ from examples.tools.browsers.action.utils import DomUtil
16
+ from aworld.core.tool.action import ExecutableAction
17
+ from aworld.utils import import_packages
18
+ from aworld.models.llm import get_llm_model, call_llm_model
19
+
20
+
21
def get_page(**kwargs):
    """Pick the page object an action should operate on.

    When a non-None ``tool`` keyword is supplied, its ``page`` attribute is
    returned; otherwise the ``page`` keyword is returned (None when absent).
    """
    owner = kwargs.get("tool")
    return kwargs.get('page') if owner is None else owner.page
28
+
29
+
30
def get_browser(**kwargs):
    """Pick the browser context an action should operate on.

    When a non-None ``tool`` keyword is supplied, its ``context`` attribute is
    returned; otherwise the ``browser`` keyword is returned (None when absent).
    """
    owner = kwargs.get("tool")
    return kwargs.get('browser') if owner is None else owner.context
37
+
38
+
39
@ActionFactory.register(name=BrowserAction.GO_TO_URL.value.name,
                        desc=BrowserAction.GO_TO_URL.value.desc,
                        tool_name="browser")
class GotoUrl(ExecutableAction):
    """Browser action that navigates the current page to ``params["url"]``."""

    @staticmethod
    def _requested_url(action: ActionModel):
        """Return the ``url`` param, tolerating a missing/None params mapping."""
        return (action.params or {}).get("url")

    @staticmethod
    def _resolve_url(url: str) -> str:
        """Turn a scheme-less relative path into a ``file://`` URL under the cwd.

        Strings that contain ``://`` or that start with ``/`` are returned
        unchanged.
        """
        if '://' not in url and not url.startswith('/'):
            return "file://" + os.path.join(os.getcwd(), url)
        return url

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Navigate synchronously.

        Returns:
            ``(ActionResult, page)``; the result content is ``"no page"`` or
            ``"empty url"`` when navigation could not be attempted.
        """
        logger.info(f"exec {BrowserAction.GO_TO_URL.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.GO_TO_URL.name} page is none")
            return ActionResult(content="no page", keep=True), page

        url = self._requested_url(action)
        if not url:
            logger.warning("empty url, go to nothing.")
            return ActionResult(content="empty url", keep=True), page

        url = self._resolve_url(url)
        page.goto(url)
        page.wait_for_load_state()
        msg = f'Navigated to {url}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act` with the same return contract."""
        logger.info(f"exec {BrowserAction.GO_TO_URL.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.GO_TO_URL.name} page is none")
            return ActionResult(content="no page", keep=True), page

        url = self._requested_url(action)
        if not url:
            logger.warning("empty url, go to nothing.")
            return ActionResult(content="empty url", keep=True), page

        url = self._resolve_url(url)
        await page.goto(url)
        await page.wait_for_load_state()
        msg = f'Navigated to {url}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page
88
+
89
+
90
@ActionFactory.register(name=BrowserAction.INPUT_TEXT.value.name,
                        desc=BrowserAction.INPUT_TEXT.value.desc,
                        tool_name="browser")
class InputText(ExecutableAction):
    """Browser action that types ``params["text"]`` into the element at ``params["index"]``."""

    @staticmethod
    def _parse_request(action: ActionModel, **kwargs):
        """Validate the request and return ``(index, text, element_node)``.

        ``index`` defaults to 0 in both the sync and async paths and may be an
        int or a numeric string.

        Raises:
            RuntimeError: no observation was passed, or the index is not in
                ``observation.dom_tree.element_map``.
            ValueError: the ``text`` param is missing or empty.
        """
        params = action.params or {}
        # compatible with int and str datatype
        index = int(params.get("index", 0))
        text = params.get("text", "")

        ob: Observation = kwargs.get("observation")
        if not ob or index not in ob.dom_tree.element_map:
            raise RuntimeError(f'Element index {index} does not exist')
        if not text:
            raise ValueError('No input to the page')
        return index, text, ob.dom_tree.element_map[index]

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Type the text synchronously and return ``(ActionResult, page)``."""
        logger.info(f"exec {BrowserAction.INPUT_TEXT.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.INPUT_TEXT.name} page is none")
            return ActionResult(content="input text no page", keep=True), page

        index, text, element_node = self._parse_request(action, **kwargs)
        self.input_to_element(text, page, element_node)
        msg = f'Input {text} into index {index}'
        logger.info(f"action {msg}")
        logger.debug(f'Element xpath: {element_node.xpath}')
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.INPUT_TEXT.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.INPUT_TEXT.name} page is none")
            return ActionResult(content="input text no page", keep=True), page

        index, text, element_node = self._parse_request(action, **kwargs)
        await self.async_input_to_element(text, page, element_node)
        msg = f'Input {text} into index {index}'
        logger.info(f"action {msg}")
        logger.debug(f'Element xpath: {element_node.xpath}')
        return ActionResult(content=msg, keep=True), page

    def input_to_element(self, input: str, page, element_node: DOMElementNode):
        """Locate ``element_node`` on ``page`` and enter ``input`` into it.

        Content-editable elements are cleared and typed into; everything else
        is filled directly.

        Raises:
            RuntimeError: the element could not be located or the input failed;
                the underlying exception is attached as ``__cause__``.
        """
        try:
            element_handle = DomUtil.get_locate_element(page, element_node)
            if element_handle is None:
                raise RuntimeError(f'Element: {repr(element_node)} not found')

            # Readiness checks are best effort: a failure here must not block the input attempt.
            try:
                element_handle.wait_for_element_state('stable', timeout=1000)
                element_handle.scroll_into_view_if_needed(timeout=1000)
            except Exception as ready_err:
                logger.debug(f'Element readiness check skipped: {ready_err}')

            # Different handling for contenteditable vs input fields
            is_contenteditable = element_handle.get_property('isContentEditable')
            if is_contenteditable.json_value():
                element_handle.evaluate('el => el.textContent = ""')
                element_handle.type(input, delay=5)
            else:
                element_handle.fill(input)
        except Exception as e:
            logger.warning(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
            raise RuntimeError(f'Failed to input text into index {element_node.highlight_index}') from e

    async def async_input_to_element(self, input: str, page, element_node: DOMElementNode):
        """Awaitable counterpart of :meth:`input_to_element`."""
        try:
            element_handle = await DomUtil.async_get_locate_element(page, element_node)
            if element_handle is None:
                raise RuntimeError(f'Element: {repr(element_node)} not found')

            # Readiness checks are best effort: a failure here must not block the input attempt.
            try:
                await element_handle.wait_for_element_state('stable', timeout=1000)
                await element_handle.scroll_into_view_if_needed(timeout=1000)
            except Exception as ready_err:
                logger.debug(f'Element readiness check skipped: {ready_err}')

            # Different handling for contenteditable vs input fields
            is_contenteditable = await element_handle.get_property('isContentEditable')
            if await is_contenteditable.json_value():
                await element_handle.evaluate('el => el.textContent = ""')
                await element_handle.type(input, delay=5)
            else:
                await element_handle.fill(input)
        except Exception as e:
            logger.warning(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
            raise RuntimeError(f'Failed to input text into index {element_node.highlight_index}') from e
204
+
205
+
206
@ActionFactory.register(name=BrowserAction.CLICK_ELEMENT.value.name,
                        desc=BrowserAction.CLICK_ELEMENT.value.desc,
                        tool_name="browser")
class ClickElement(ExecutableAction):
    """Browser action that clicks the element at ``params["index"]``.

    If the click opens an additional tab, that tab is brought to the front and
    returned as the new current page.
    """

    def __init__(self):
        # Hand the optional dependencies this action relies on to import_packages.
        import_packages(['playwright', 'markdownify'])

    @staticmethod
    def _lookup_element(action: ActionModel, **kwargs):
        """Return ``(index, element_node)`` for the requested element.

        Raises:
            RuntimeError: no observation was passed, or the index is not in
                ``observation.dom_tree.element_map``.
        """
        # compatible with int and str datatype
        index = int(action.params.get("index"))
        ob: Observation = kwargs.get("observation")
        if not ob or index not in ob.dom_tree.element_map:
            raise RuntimeError(f'Element index {index} does not exist')
        return index, ob.dom_tree.element_map[index]

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Click synchronously and return ``(ActionResult, page)``.

        A failed click is reported through ``ActionResult(error=...)`` rather
        than raised.
        """
        from playwright.sync_api import BrowserContext

        logger.info(f"exec {BrowserAction.CLICK_ELEMENT.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.CLICK_ELEMENT.name} page is none")
            return ActionResult(content="click element no page", keep=True), page

        browser: BrowserContext = get_browser(**kwargs)
        if browser is None:
            logger.warning(f"{BrowserAction.CLICK_ELEMENT.name} browser context is none")
            return ActionResult(content="none browser context", keep=True), page

        index, element_node = self._lookup_element(action, **kwargs)
        pages = len(browser.pages)

        try:
            DomUtil.click_element(page, element_node, browser=browser)
            # Only report the click once it has actually been performed.
            msg = f'Clicked button with index {index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}'
            logger.info(msg)
            logger.debug(f'Element xpath: {element_node.xpath}')
            if len(browser.pages) > pages:
                new_tab_msg = 'Open the new tab'
                msg += f' - {new_tab_msg}'
                logger.info(new_tab_msg)
                page = browser.pages[-1]
                page.bring_to_front()
                page.wait_for_load_state(timeout=60000)
            return ActionResult(content=msg, keep=True), page
        except Exception as e:
            logger.warning(f'Element not clickable with index {index} - most likely the page changed')
            return ActionResult(error=str(e)), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.CLICK_ELEMENT.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.CLICK_ELEMENT.name} page is none")
            return ActionResult(content="click element no page", keep=True), page

        browser = get_browser(**kwargs)
        if browser is None:
            logger.warning(f"{BrowserAction.CLICK_ELEMENT.name} browser context is none")
            return ActionResult(content="none browser context", keep=True), page

        index, element_node = self._lookup_element(action, **kwargs)
        pages = len(browser.pages)

        try:
            await DomUtil.async_click_element(page, element_node, browser=browser)
            msg = f'Clicked button with index {index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}'
            logger.info(msg)
            logger.debug(f'Element xpath: {element_node.xpath}')
            if len(browser.pages) > pages:
                new_tab_msg = 'Open the new tab'
                msg += f' - {new_tab_msg}'
                logger.info(new_tab_msg)
                page = browser.pages[-1]
                await page.bring_to_front()
                await page.wait_for_load_state(timeout=60000)
            return ActionResult(content=msg, keep=True), page
        except Exception as e:
            logger.warning(f'Element not clickable with index {index} - most likely the page changed')
            return ActionResult(error=str(e)), page
296
+
297
+
298
# Base search URLs keyed by engine name; the query text is appended to the value.
# Both the default ("") and "google" keys currently resolve to Bing. A Google
# endpoint ("https://www.google.com/search?udm=14&q=") is kept only as a
# disabled alternative.
SEARCH_ENGINE = dict.fromkeys(("", "google"), "https://www.bing.com/search?q=")
303
+
304
+
305
@ActionFactory.register(name=BrowserAction.SEARCH.value.name,
                        desc=BrowserAction.SEARCH.value.desc,
                        tool_name="browser")
class Search(ExecutableAction):
    """Browser action that opens a search-results page for ``params["query"]``.

    ``params["engine"]`` selects the base URL from ``SEARCH_ENGINE``; unknown
    engines fall back to the default entry.
    """

    @staticmethod
    def _plan(action: ActionModel):
        """Return ``(base_url, query, target_url)``; ``target_url`` is None without a query."""
        from urllib.parse import quote_plus

        params = action.params if action.params else {}
        # Unknown engine names fall back to the default entry instead of yielding None.
        base_url = SEARCH_ENGINE.get(params.get("engine", ""), SEARCH_ENGINE[""])
        query = params.get("query")
        if not query:
            return base_url, query, None
        # Encode the query so characters such as '&', '#' or spaces stay inside the q= value.
        return base_url, query, f'{base_url}{quote_plus(str(query))}'

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Run the search synchronously and return ``(ActionResult, page)``."""
        logger.info(f"exec {BrowserAction.SEARCH.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SEARCH.name} page is none")
            return ActionResult(content="search no page", keep=True), page

        url, query, target = self._plan(action)
        if target is None:
            logger.warning("empty query, search nothing.")
            return ActionResult(content="empty query", keep=True), page

        page.goto(target)
        page.wait_for_load_state()
        msg = f'Searched for "{query}" in {url}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.SEARCH.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SEARCH.name} page is none")
            return ActionResult(content="search no page", keep=True), page

        url, query, target = self._plan(action)
        if target is None:
            logger.warning("empty query, search nothing.")
            return ActionResult(content="empty query", keep=True), page

        await page.goto(target)
        await page.wait_for_load_state()
        msg = f'Searched for "{query}" in {url}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page
342
+
343
+
344
@ActionFactory.register(name=BrowserAction.SEARCH_GOOGLE.value.name,
                        desc=BrowserAction.SEARCH_GOOGLE.value.desc,
                        tool_name="browser")
class SearchGoogle(ExecutableAction):
    """Browser action that searches ``params["query"]`` via the default ``SEARCH_ENGINE`` entry.

    NOTE(review): the result message says "Google", but the URL comes from
    ``SEARCH_ENGINE[""]`` — confirm the wording is still intended.
    """

    @staticmethod
    def _plan(action: ActionModel):
        """Return ``(query, target_url)``; ``target_url`` is None without a query."""
        from urllib.parse import quote_plus

        query = (action.params or {}).get("query")
        if not query:
            return query, None
        # Encode the query so characters such as '&', '#' or spaces stay inside the q= value.
        return query, f'{SEARCH_ENGINE.get("")}{quote_plus(str(query))}'

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Run the search synchronously and return ``(ActionResult, page)``."""
        logger.info(f"exec {BrowserAction.SEARCH_GOOGLE.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SEARCH_GOOGLE.name} page is none")
            return ActionResult(content="search no page", keep=True), page

        query, target = self._plan(action)
        if target is None:
            logger.warning("empty query, search nothing.")
            return ActionResult(content="empty query", keep=True), page

        page.goto(target)
        page.wait_for_load_state()
        msg = f'Searched for "{query}" in Google'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.SEARCH_GOOGLE.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SEARCH_GOOGLE.name} page is none")
            return ActionResult(content="search no page", keep=True), page

        query, target = self._plan(action)
        if target is None:
            logger.warning("empty query, search nothing.")
            return ActionResult(content="empty query", keep=True), page

        await page.goto(target)
        await page.wait_for_load_state()
        msg = f'Searched for "{query}" in Google'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page
375
+
376
+
377
@ActionFactory.register(name=BrowserAction.NEW_TAB.value.name,
                        desc=BrowserAction.NEW_TAB.value.desc,
                        tool_name="browser")
class NewTab(ExecutableAction):
    """Browser action that opens a new tab, optionally navigating it to ``params["url"]``.

    Both the sync and async paths return the newly opened page as the second
    tuple element.
    """

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Open the tab synchronously and return ``(ActionResult, new_page)``."""
        logger.info(f"exec {BrowserAction.NEW_TAB.value.name} action")
        browser = get_browser(**kwargs)
        if browser is None:
            logger.warning(f"{BrowserAction.NEW_TAB.name} browser context is none")
            return ActionResult(content="new tab no browser context", keep=True), get_page(**kwargs)

        url = (action.params or {}).get("url")
        new_page = browser.new_page()
        new_page.wait_for_load_state()

        if url:
            new_page.goto(url)
            DomUtil.wait_for_stable_network(new_page)

        msg = f'Opened new tab with {url}'
        logger.debug(msg)
        return ActionResult(content=msg, keep=True), new_page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.NEW_TAB.value.name} action")
        browser = get_browser(**kwargs)
        if browser is None:
            logger.warning(f"{BrowserAction.NEW_TAB.name} browser context is none")
            return ActionResult(content="new tab no browser context", keep=True), get_page(**kwargs)

        url = (action.params or {}).get("url")
        new_page = await browser.new_page()
        await new_page.wait_for_load_state()

        if url:
            await new_page.goto(url)
            # NOTE(review): this helper is called without await, exactly as before;
            # confirm whether DomUtil offers an awaitable variant for async pages.
            DomUtil.wait_for_stable_network(new_page)

        msg = f'Opened new tab with {url}'
        logger.debug(msg)
        # Hand back the tab that was just opened, matching the sync path.
        return ActionResult(content=msg, keep=True), new_page
410
+
411
+
412
@ActionFactory.register(name=BrowserAction.GO_BACK.value.name,
                        desc=BrowserAction.GO_BACK.value.desc,
                        tool_name="browser")
class GoBack(ExecutableAction):
    """Browser action that navigates the current page one step back in history."""

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Go back synchronously and return ``(ActionResult, page)``."""
        logger.info(f"exec {BrowserAction.GO_BACK.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.GO_BACK.name} page is none")
            # Report the action that actually failed (this message used to say "search").
            return ActionResult(content="go back no page", keep=True), page

        page.go_back()
        msg = 'Navigated back'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.GO_BACK.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.GO_BACK.name} page is none")
            return ActionResult(content="go back no page", keep=True), page

        await page.go_back()
        msg = 'Navigated back'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page
439
+
440
+
441
@ActionFactory.register(name=BrowserAction.EXTRACT_CONTENT.value.name,
                        desc=BrowserAction.EXTRACT_CONTENT.value.desc,
                        tool_name="browser")
class ExtractContent(ExecutableAction):
    """Browser action that converts the page to markdown and asks an LLM to extract ``params["goal"]``.

    Recognised keyword arguments: ``llm_config``,
    ``max_extract_content_input_tokens`` and
    ``max_extract_content_output_tokens``. Both limits are compared against
    character counts (``len``). When no LLM is configured or the LLM call
    fails, the (possibly truncated) markdown itself is returned.
    """

    _PROMPT = ('Your task is to extract the content of the page. You will be given a page and a goal '
               'and you should extract all relevant information around this goal from the page. '
               'If the goal is vague, summarize the page. Respond in json format. '
               'Extraction goal: {goal}, Page: {page}')
    _PROMPT_WITH_LIMIT = (_PROMPT + ' \n\n#The length of the returned result must be less than '
                                    '{max_extract_content_output_tokens} characters.')

    @staticmethod
    def _load_llm(llm_config):
        """Return an LLM client when ``llm_config`` carries an API key, else None."""
        if llm_config and llm_config.llm_api_key:
            return get_llm_model(llm_config)
        return None

    @staticmethod
    def _truncate(content: str, limit):
        """Cut ``content`` to ``limit`` characters when a limit is set and exceeded."""
        if limit and len(content) > limit:
            logger.warning(
                f"Content length ({len(content)}) exceeds max input tokens ({limit}). Truncating content.")
            return content[:limit]
        return content

    @staticmethod
    def _raw_result(content: str) -> ActionResult:
        """Build the fallback result that carries the page markdown itself."""
        msg = f'Extracted from page\n: {content}\n'
        logger.info(msg)
        return ActionResult(content=msg)

    def _extract(self, llm, llm_config, goal, content: str, max_output) -> ActionResult:
        """Ask the LLM for the extraction, retrying once with a length-limited prompt.

        Falls back to :meth:`_raw_result` when ``llm`` is None or any step of
        the LLM round trip raises.
        """
        if llm is None:
            # Explicit fallback instead of relying on a swallowed NameError further down.
            logger.warning('No usable llm_config for content extraction, returning raw page content.')
            return self._raw_result(content)

        from langchain_core.prompts import PromptTemplate

        def ask(template_text, variables, **values):
            template = PromptTemplate(input_variables=variables, template=template_text)
            messages = [{'role': 'user', 'content': template.format(**values)}]
            output = call_llm_model(llm,
                                    messages=messages,
                                    model=llm_config.llm_model_name,
                                    temperature=llm_config.llm_temperature)
            return output.content

        try:
            result_content = ask(self._PROMPT, ['goal', 'page'], goal=goal, page=content)

            # Check if output exceeds the limit and retry with the length-limited prompt if needed
            if max_output and len(result_content) > max_output:
                logger.warning(
                    f"Output exceeds maximum length ({len(result_content)} > {max_output}). Retrying with limited prompt.")
                result_content = ask(self._PROMPT_WITH_LIMIT,
                                     ['goal', 'page', 'max_extract_content_output_tokens'],
                                     goal=goal,
                                     page=content,
                                     max_extract_content_output_tokens=max_output)

            msg = f'Extracted from page\n: {result_content}\n'
            logger.info(msg)
            return ActionResult(content=msg, keep=True)
        except Exception as e:
            logger.warning(f'Error extracting content: {e}')
            return self._raw_result(content)

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Extract content synchronously and return ``(ActionResult, page)``."""
        import markdownify

        logger.info(f"exec {BrowserAction.EXTRACT_CONTENT.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.EXTRACT_CONTENT.name} page is none")
            return ActionResult(content="extract content no page", keep=True), page

        goal = (action.params or {}).get("goal")
        llm_config = kwargs.get("llm_config")
        llm = self._load_llm(llm_config)
        content = self._truncate(markdownify.markdownify(page.content()),
                                 kwargs.get("max_extract_content_input_tokens"))
        result = self._extract(llm, llm_config, goal, content,
                               kwargs.get("max_extract_content_output_tokens"))
        return result, page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`.

        NOTE(review): the LLM round trip still goes through the synchronous
        ``call_llm_model``, as before — confirm whether an awaitable variant
        should be used here.
        """
        import markdownify

        logger.info(f"exec {BrowserAction.EXTRACT_CONTENT.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.EXTRACT_CONTENT.name} page is none")
            return ActionResult(content="extract content no page", keep=True), page

        goal = (action.params or {}).get("goal")
        llm_config = kwargs.get("llm_config")
        llm = self._load_llm(llm_config)
        content = self._truncate(markdownify.markdownify(await page.content()),
                                 kwargs.get("max_extract_content_input_tokens"))
        result = self._extract(llm, llm_config, goal, content,
                               kwargs.get("max_extract_content_output_tokens"))
        return result, page
577
+
578
+
579
@ActionFactory.register(name=BrowserAction.SCROLL_DOWN.value.name,
                        desc=BrowserAction.SCROLL_DOWN.value.desc,
                        tool_name="browser")
class ScrollDown(ExecutableAction):
    """Browser action that scrolls the page down by ``params["amount"]`` pixels, or one viewport height."""

    @staticmethod
    def _plan(action: ActionModel):
        """Return ``(script, distance_text)`` for the requested scroll."""
        requested = action.params.get("amount")
        if not requested:
            return 'window.scrollBy(0, window.innerHeight);', 'one page'
        pixels = int(requested)
        distance = f'{pixels} pixels' if pixels else 'one page'
        return f'window.scrollBy(0, {pixels});', distance

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Scroll synchronously and return ``(ActionResult, page)``."""
        logger.info(f"exec {BrowserAction.SCROLL_DOWN.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SCROLL_DOWN.name} page is none")
            return ActionResult(content="scroll no page", keep=True), page

        script, distance = self._plan(action)
        page.evaluate(script)
        msg = f'Scrolled down the page by {distance}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.SCROLL_DOWN.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SCROLL_DOWN.name} page is none")
            return ActionResult(content="scroll no page", keep=True), page

        script, distance = self._plan(action)
        await page.evaluate(script)
        msg = f'Scrolled down the page by {distance}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page
620
+
621
+
622
@ActionFactory.register(name=BrowserAction.SCROLL_UP.value.name,
                        desc=BrowserAction.SCROLL_UP.value.desc,
                        tool_name="browser")
class ScrollUp(ExecutableAction):
    """Browser action that scrolls the page up by ``params["amount"]`` pixels, or one viewport height."""

    @staticmethod
    def _plan(action: ActionModel):
        """Return ``(script, distance_text)`` for the requested upward scroll."""
        requested = (action.params or {}).get("amount")
        if not requested:
            return 'window.scrollBy(0, -window.innerHeight);', 'one page'
        pixels = int(requested)
        distance = f'{pixels} pixels' if pixels else 'one page'
        # Negate in Python: writing "-{pixels}" would emit "--5" for a negative amount,
        # which is not valid JavaScript.
        return f'window.scrollBy(0, {-pixels});', distance

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Scroll synchronously and return ``(ActionResult, page)``."""
        logger.info(f"exec {BrowserAction.SCROLL_UP.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SCROLL_UP.name} page is none")
            return ActionResult(content="scroll no page", keep=True), page

        script, distance = self._plan(action)
        page.evaluate(script)
        msg = f'Scrolled up the page by {distance}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.SCROLL_UP.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SCROLL_UP.name} page is none")
            return ActionResult(content="scroll no page", keep=True), page

        script, distance = self._plan(action)
        await page.evaluate(script)
        msg = f'Scrolled up the page by {distance}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page
663
+
664
+
665
@ActionFactory.register(name=BrowserAction.WAIT.value.name,
                        desc=BrowserAction.WAIT.value.desc,
                        tool_name="browser")
class Wait(ExecutableAction):
    """Browser action that pauses for ``params["seconds"]`` (or ``params["duration"]``) whole seconds."""

    @staticmethod
    def _seconds(action: ActionModel) -> int:
        """Return the requested pause as a non-negative int; missing values mean 0."""
        params = action.params or {}
        seconds = params.get("seconds")
        if not seconds:
            seconds = params.get("duration", 0)
        # Clamp at zero: time.sleep raises ValueError for negative durations.
        return max(int(seconds or 0), 0)

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Block the calling thread for the requested time and return ``(ActionResult, page)``."""
        seconds = self._seconds(action)
        msg = f'Waiting for {seconds} seconds'
        logger.info(msg)
        time.sleep(seconds)
        # Resolve the page the same way the other actions do, so a page supplied
        # through ``tool`` is not replaced by None.
        return ActionResult(content=msg, keep=True), get_page(**kwargs)

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`, sleeping via ``asyncio.sleep``."""
        seconds = self._seconds(action)
        msg = f'Waiting for {seconds} seconds'
        logger.info(msg)
        await asyncio.sleep(seconds)
        return ActionResult(content=msg, keep=True), get_page(**kwargs)
688
+
689
+
690
@ActionFactory.register(name=BrowserAction.SWITCH_TAB.value.name,
                        desc=BrowserAction.SWITCH_TAB.value.desc,
                        tool_name="browser")
class SwitchTab(ExecutableAction):
    """Browser action that brings the tab at ``params["page_id"]`` to the front."""

    @staticmethod
    def _select_page(action: ActionModel, browser):
        """Return ``(page_id, page)`` for the requested tab.

        ``page_id`` defaults to 0 and indexes ``browser.pages``; negative
        values index from the end, as with any Python sequence.

        Raises:
            RuntimeError: ``page_id`` is outside the range of open tabs.
        """
        page_id = int((action.params or {}).get("page_id", 0))
        pages = browser.pages
        # Check both ends so an out-of-range negative id reports the same error
        # as a too-large one instead of a bare IndexError.
        if not -len(pages) <= page_id < len(pages):
            raise RuntimeError(f'No tab found with page_id: {page_id}')
        return page_id, pages[page_id]

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Switch tabs synchronously and return ``(ActionResult, page)``."""
        logger.info(f"exec {BrowserAction.SWITCH_TAB.value.name} action")
        browser = get_browser(**kwargs)
        if browser is None:
            logger.warning(f"{BrowserAction.SWITCH_TAB.name} browser context is none")
            return ActionResult(content="switch tab no browser context", keep=True), get_page(**kwargs)

        page_id, page = self._select_page(action, browser)
        page.bring_to_front()
        page.wait_for_load_state()
        msg = f'Switched to tab {page_id}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Awaitable counterpart of :meth:`act`."""
        logger.info(f"exec {BrowserAction.SWITCH_TAB.value.name} action")
        browser = get_browser(**kwargs)
        if browser is None:
            logger.warning(f"{BrowserAction.SWITCH_TAB.name} browser context is none")
            return ActionResult(content="switch tab no browser context", keep=True), get_page(**kwargs)

        page_id, page = self._select_page(action, browser)
        await page.bring_to_front()
        await page.wait_for_load_state()
        msg = f'Switched to tab {page_id}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), page
735
+
736
+
737
@ActionFactory.register(name=BrowserAction.SEND_KEYS.value.name,
                        desc=BrowserAction.SEND_KEYS.value.desc,
                        tool_name="browser")
class SendKeys(ExecutableAction):
    """Press the key or key combination given in the ``keys`` parameter."""

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Press ``keys`` through the synchronous keyboard API.

        Returns:
            The action result and the page. A missing page or missing ``keys``
            produces a result instead of an exception.

        Raises:
            Exception: Whatever ``page.keyboard.press`` raises, after logging.
        """
        logger.info(f"exec {BrowserAction.SEND_KEYS.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SEND_KEYS.name} page is none")
            # The message used to say "scroll no page", copied from the scroll action.
            return ActionResult(content="send keys no page", keep=True), page

        keys = action.params.get("keys")
        if not keys:
            return ActionResult(success=False, content="no keys", keep=True), page

        try:
            page.keyboard.press(keys)
        except Exception:
            logger.warning(f"{keys} press fail. \n{traceback.format_exc()}")
            raise
        return ActionResult(content=f"Sent keys: {keys}", keep=True), page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Press ``keys`` through the asynchronous keyboard API.

        Returns:
            The action result and the page. A missing page or missing ``keys``
            produces a result instead of an exception.

        Raises:
            Exception: Whatever ``page.keyboard.press`` raises, after logging.
        """
        logger.info(f"exec {BrowserAction.SEND_KEYS.value.name} action")
        page = get_page(**kwargs)
        if page is None:
            logger.warning(f"{BrowserAction.SEND_KEYS.name} page is none")
            # The message used to say "scroll no page", copied from the scroll action.
            return ActionResult(content="send keys no page", keep=True), page

        keys = action.params.get("keys")
        if not keys:
            return ActionResult(success=False, content="no keys", keep=True), page

        try:
            await page.keyboard.press(keys)
        except Exception:
            logger.warning(f"{keys} press fail. \n{traceback.format_exc()}")
            raise

        return ActionResult(content=f"Sent keys: {keys}", keep=True), page
777
+
778
+
779
@ActionFactory.register(name=BrowserAction.WRITE_TO_FILE.value.name,
                        desc=BrowserAction.WRITE_TO_FILE.value.desc,
                        tool_name="browser")
class WriteToFile(ExecutableAction):
    """Write the ``content`` parameter, plus a trailing newline, to a file."""

    # Used when no path parameter is supplied or the supplied value is empty.
    _DEFAULT_FILE_PATH = "tmp_result.md"
    # Accepted parameter names for the target path, in priority order.
    _PATH_KEYS = ("file_path", "file_name", "filename")

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Write ``content`` to the requested file.

        Parameters read from ``action.params``:
            file_path / file_name / filename: Target path; the first key
                present wins, and an empty or ``None`` value falls back to
                ``tmp_result.md``.
            content: Text to write; ``None`` is treated as an empty string.
            mode: ``open`` mode, ``"a"`` (append) by default.

        Returns:
            The action result and the current page. Failures are reported in
            the result (``error`` set) rather than raised.
        """
        params = action.params
        file_path = self._DEFAULT_FILE_PATH
        for key in self._PATH_KEYS:
            if key in params:
                file_path = params.get(key) or self._DEFAULT_FILE_PATH
                break

        content = params.get("content", "")
        content = "" if content is None else str(content)
        mode = params.get("mode", "a")  # Default to append mode

        # NOTE(review): the path and mode come straight from the action
        # parameters and are not validated; confirm that writing to arbitrary
        # locations is acceptable for this tool.
        abs_file_path = file_path
        try:
            # Resolved inside the try so a bad path type is reported, not raised.
            abs_file_path = os.path.abspath(file_path)
            with open(abs_file_path, mode, encoding='utf-8') as f:
                f.write(content + '\n')
        except Exception as e:
            error_msg = f'Failed to write to file {abs_file_path}: {str(e)}'
            logger.error(error_msg)
            return ActionResult(content=error_msg, keep=True, error=error_msg), get_page(**kwargs)

        msg = f'Successfully wrote content to {abs_file_path}'
        logger.info(msg)
        return ActionResult(content=msg, keep=True), get_page(**kwargs)

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Delegate to :meth:`act`; the write itself is performed synchronously."""
        return self.act(action, **kwargs)
812
+
813
+
814
@ActionFactory.register(name=BrowserAction.DONE.value.name,
                        desc=BrowserAction.DONE.value.desc,
                        tool_name="browser")
class Done(ExecutableAction):
    """Mark the task as finished without touching the page."""

    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Return a successful ``is_done`` result together with the current page."""
        logger.info(f"exec {BrowserAction.DONE.value.name} action")
        finished = ActionResult(is_done=True, success=True, content="done", keep=True)
        current_page = get_page(**kwargs)
        return finished, current_page

    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
        """Async twin of :meth:`act`; nothing is awaited."""
        logger.info(f"exec {BrowserAction.DONE.value.name} action")
        finished = ActionResult(is_done=True, success=True, content="done", keep=True)
        current_page = get_page(**kwargs)
        return finished, current_page
examples/tools/browsers/action/executor.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+ # Copyright (c) 2025 inclusionAI.
3
+
4
+ from typing import Tuple, List, Any
5
+
6
+ from aworld.core.tool.action_factory import ActionFactory
7
+ from aworld.core.common import ActionModel, ActionResult, Observation
8
+ from aworld.logs.util import logger
9
+ from aworld.core.tool.base import Tool, ToolActionExecutor
10
+
11
+
12
class BrowserToolActionExecutor(ToolActionExecutor):
    """Runs browser actions registered in ``ActionFactory`` against the tool's page."""

    def __init__(self, tool: Tool = None):
        super().__init__(tool)

    def execute_action(self, actions: List[ActionModel], **kwargs) -> Tuple[
        List[ActionResult], Any]:
        """Execute the specified browser action sequence by agent policy.

        Args:
            actions: Tool action sequence.

        Returns:
            The action result list and the page returned by the last action
            (the tool's current page when ``actions`` is empty).

        Raises:
            ValueError: If an action is not registered.
        """
        action_results = []
        page = self.tool.page
        for action in actions:
            action_result, page = self._exec(action, **kwargs)
            action_results.append(action_result)
        return action_results, page

    async def async_execute_action(self, actions: List[ActionModel], **kwargs) -> Tuple[
        List[ActionResult], Any]:
        """Asynchronously execute the specified browser action sequence.

        Args:
            actions: Tool action sequence.

        Returns:
            The action result list and the page returned by the last action
            (the tool's current page when ``actions`` is empty).

        Raises:
            ValueError: If an action is not registered.
        """
        action_results = []
        page = self.tool.page
        for action in actions:
            action_result, page = await self._async_exec(action, **kwargs)
            action_results.append(action_result)
        return action_results, page

    @staticmethod
    def _resolve_action_name(action_model: ActionModel) -> str:
        """Return the ``ActionFactory`` key for ``action_model``.

        The plain action name is tried first, then the name prefixed with the
        tool name. Both the sync and async paths share this lookup; previously
        only the async path tried the prefixed form.

        Raises:
            ValueError: If neither form is registered.
        """
        action_name = action_model.action_name
        if action_name in ActionFactory:
            return action_name

        tool_name = getattr(action_model, "tool_name", None)
        if tool_name:
            qualified_name = tool_name + action_name
            if qualified_name in ActionFactory:
                return qualified_name

        raise ValueError(f'Action {action_name} not found')

    def _exec(self, action_model: ActionModel, **kwargs):
        """Run one action synchronously and return ``(action_result, page)``."""
        action_name = self._resolve_action_name(action_model)
        action = ActionFactory(action_name)
        action_result, page = action.act(action_model, page=self.tool.page, browser=self.tool.context, **kwargs)
        logger.info(f"{action_name} execute finished")
        return action_result, page

    async def _async_exec(self, action_model: ActionModel, **kwargs):
        """Run one action asynchronously and return ``(action_result, page)``."""
        action_name = self._resolve_action_name(action_model)
        action = ActionFactory(action_name)
        action_result, page = await action.async_act(action_model, page=self.tool.page, browser=self.tool.context, **kwargs)
        logger.info(f"{action_name} execute finished")
        return action_result, page
examples/tools/browsers/action/utils.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+ # Copyright (c) 2025 inclusionAI.
3
+
4
+ import re
5
+ import time
6
+ import traceback
7
+ from typing import Optional
8
+
9
+ from examples.tools.browsers.util.dom import DOMElementNode
10
+ from aworld.logs.util import logger
11
+ from aworld.utils import import_package
12
+
13
+
14
class DomUtil:
    """Playwright helpers: locate DOM nodes, click them, wait for the network
    to settle, and build CSS selectors from ``DOMElementNode`` objects."""

    # Class names that can be appended to a selector as ``.name`` without escaping.
    _VALID_CLASS_NAME = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_-]*$')

    # Attributes considered stable enough to be used in a selector.
    _SAFE_ATTRIBUTES = frozenset({
        'id',
        # Standard HTML attributes
        'name',
        'type',
        'placeholder',
        # Accessibility attributes
        'aria-label',
        'aria-labelledby',
        'aria-describedby',
        'role',
        # Common form attributes
        'for',
        'autocomplete',
        'required',
        'readonly',
        # Media attributes
        'alt',
        'title',
        'src',
        # Link attributes
        'href',
        'target',
    })

    # Extra attributes allowed only when ``include_dynamic_attributes`` is set.
    _DYNAMIC_ATTRIBUTES = frozenset({
        'data-id',
        'data-qa',
        'data-cy',
        'data-testid',
    })

    def __init__(self):
        # NOTE(review): import_package is a project helper; presumably it makes
        # sure Playwright is importable before the helpers are used - confirm.
        import_package("playwright")

    @staticmethod
    async def async_click_element(page, element_node: DOMElementNode, **kwargs) -> Optional[str]:
        """Click ``element_node`` on ``page`` using the async Playwright API.

        The element is clicked at the centre of its bounding box while waiting
        for a new page on the ``browser`` context. When the centre cannot be
        computed, the click falls back to a text selector (if the element has
        text and a browser context was passed) or to a direct element click.

        Args:
            page: Page (or frame) that contains the element.
            element_node: DOM node to click.
            **kwargs: ``browser`` - the browser context used to detect a newly
                opened page.

        Returns:
            Always ``None``; a newly opened page is awaited but not handed back.

        Raises:
            Exception: If the element cannot be located or the click fails.
        """
        from playwright.async_api import ElementHandle as AElementHandle, BrowserContext as ABrowserContext

        try:
            element_handle: AElementHandle = await DomUtil.async_get_locate_element(page, element_node)
            if element_handle is None:
                raise Exception(f'Element: {repr(element_node)} not found')

            browser: ABrowserContext = kwargs.get('browser')
            bound = await element_handle.bounding_box()
            try:
                # todo: iframe.
                center = (bound['x'] + bound['width'] / 2, bound['y'] + bound['height'] / 2)
            except Exception:
                # No usable bounding box (e.g. it is None); use the fallbacks below.
                center = None

            if center is not None:
                try:
                    # A missing browser context or no new page ends up in the
                    # except below and is only logged.
                    async with browser.expect_page() as new_page_info:
                        await page.mouse.click(*center)
                    # Fixed: this used to issue a second mouse click instead of
                    # switching to the new page as the sync variant does.
                    page = await new_page_info.value
                    await page.wait_for_load_state()
                except Exception:
                    # ``Exception`` rather than a bare except, so task
                    # cancellation is not swallowed.
                    logger.warning(traceback.format_exc())
                return None

            logger.info(f"click {element_handle}!!")
            # Fixed: the coroutine used to be interpolated without being awaited.
            text = await element_handle.text_content()
            if text and browser:
                try:
                    async with browser.expect_page() as new_page_info:
                        await page.click(f"text={text}")
                    page = await new_page_info.value
                    await page.wait_for_load_state()
                except Exception:
                    logger.warning(traceback.format_exc())
            else:
                await element_handle.click()
                await page.wait_for_load_state()
        except Exception as e:
            logger.error(traceback.format_exc())
            raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}') from e

    @staticmethod
    def click_element(page, element_node: DOMElementNode, **kwargs) -> Optional[str]:
        """Click ``element_node`` on ``page`` using the sync Playwright API.

        Same strategy as :meth:`async_click_element`: click the centre of the
        bounding box while expecting a new page, otherwise fall back to a text
        selector or a direct element click.

        Args:
            page: Page (or frame) that contains the element.
            element_node: DOM node to click.
            **kwargs: ``browser`` - the browser context used to detect a newly
                opened page.

        Returns:
            Always ``None``; a newly opened page is awaited but not handed back.

        Raises:
            Exception: If the element cannot be located or the click fails.
        """
        from playwright.sync_api import ElementHandle, BrowserContext

        try:
            element_handle: ElementHandle = DomUtil.get_locate_element(page, element_node)
            if element_handle is None:
                raise Exception(f'Element: {repr(element_node)} not found')

            browser: BrowserContext = kwargs.get('browser')
            bound = element_handle.bounding_box()
            try:
                # todo: iframe.
                center = (bound['x'] + bound['width'] / 2, bound['y'] + bound['height'] / 2)
            except Exception:
                # No usable bounding box (e.g. it is None); use the fallbacks below.
                center = None

            if center is not None:
                try:
                    # A missing browser context or no new page ends up in the
                    # except below and is only logged.
                    with browser.expect_page() as new_page_info:
                        page.mouse.click(*center)
                    page = new_page_info.value
                    page.wait_for_load_state()
                except Exception:
                    logger.warning(traceback.format_exc())
                return None

            logger.info(f"click {element_handle}!!")
            text = element_handle.text_content()
            if text and browser:
                try:
                    with browser.expect_page() as new_page_info:
                        page.click(f"text={text}")
                    page = new_page_info.value
                    page.wait_for_load_state()
                except Exception:
                    logger.warning(traceback.format_exc())
            else:
                element_handle.click()
                page.wait_for_load_state()
        except Exception as e:
            logger.error(traceback.format_exc())
            raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}') from e

    @staticmethod
    def _frame_and_selector(current_frame, element: DOMElementNode):
        """Descend through the iframe ancestors of ``element``, outermost first.

        Returns:
            The innermost frame (or the unchanged ``current_frame`` when the
            element has no iframe ancestor) and the CSS selector of ``element``.
        """
        ancestors = []
        node = element
        while node.parent is not None:
            node = node.parent
            ancestors.append(node)

        # ``ancestors`` runs from the direct parent up to the root; walk it backwards.
        for ancestor in reversed(ancestors):
            if ancestor.tag_name == 'iframe':
                frame_selector = DomUtil._enhanced_css_selector_for_element(
                    ancestor,
                    include_dynamic_attributes=True,
                )
                current_frame = current_frame.frame_locator(frame_selector)

        css_selector = DomUtil._enhanced_css_selector_for_element(
            element, include_dynamic_attributes=True
        )
        return current_frame, css_selector

    @staticmethod
    async def async_get_locate_element(current_frame, element: DOMElementNode):
        """Find the element handle for ``element`` with the async Playwright API.

        Returns:
            The element handle, or ``None`` when nothing matches or the lookup
            raises (the error is logged).
        """
        from playwright.async_api import FrameLocator as AFrameLocator

        current_frame, css_selector = DomUtil._frame_and_selector(current_frame, element)

        try:
            if isinstance(current_frame, AFrameLocator):
                return await current_frame.locator(css_selector).element_handle()

            element_handle = await current_frame.query_selector(css_selector)
            if element_handle:
                # Try to scroll into view if hidden
                await element_handle.scroll_into_view_if_needed()
                return element_handle
            return None
        except Exception as e:
            logger.error(f'Failed to locate element: {str(e)}')
            return None

    @staticmethod
    def get_locate_element(current_frame, element: DOMElementNode):
        """Find the element handle for ``element`` with the sync Playwright API.

        Returns:
            The element handle, or ``None`` when nothing matches or the lookup
            raises (the error is logged).
        """
        from playwright.sync_api import FrameLocator

        current_frame, css_selector = DomUtil._frame_and_selector(current_frame, element)

        try:
            if isinstance(current_frame, FrameLocator):
                return current_frame.locator(css_selector).element_handle()

            element_handle = current_frame.query_selector(css_selector)
            if element_handle:
                # Try to scroll into view if hidden
                element_handle.scroll_into_view_if_needed()
                return element_handle
            return None
        except Exception as e:
            logger.error(f'Failed to locate element: {str(e)}')
            return None

    @staticmethod
    def wait_for_stable_network(page, **kwargs):
        """Poll until no tracked request is pending and the network has been idle.

        Requests are tracked through the page's ``request``/``response`` events;
        only page-relevant resource types are counted, and analytics, ads, chat
        widgets, streaming and similar URLs are ignored.

        Args:
            page: Page whose network activity is observed.
            **kwargs: ``idle_wait_time`` - seconds without activity required
                (default 0.5); ``max_wait_time`` - upper bound in seconds for
                the whole wait (default 5).
        """
        idle_wait_time = kwargs.get('idle_wait_time', 0.5)
        max_wait_time = kwargs.get('max_wait_time', 5)

        pending_requests = set()
        last_activity = time.time()

        # Define relevant resource types and content types
        RELEVANT_RESOURCE_TYPES = {
            'document',
            'stylesheet',
            'image',
            'font',
            'script',
            'iframe',
        }

        RELEVANT_CONTENT_TYPES = {
            'text/html',
            'text/css',
            'application/javascript',
            'image/',
            'font/',
            'application/json',
        }

        # Additional patterns to filter out
        IGNORED_URL_PATTERNS = {
            # Analytics and tracking
            'analytics',
            'tracking',
            'telemetry',
            'beacon',
            'metrics',
            # Ad-related
            'doubleclick',
            'adsystem',
            'adserver',
            'advertising',
            # Social media widgets
            'facebook.com/plugins',
            'platform.twitter',
            'linkedin.com/embed',
            # Live chat and support
            'livechat',
            'zendesk',
            'intercom',
            'crisp.chat',
            'hotjar',
            # Push notifications
            'push-notifications',
            'onesignal',
            'pushwoosh',
            # Background sync/heartbeat
            'heartbeat',
            'ping',
            'alive',
            # WebRTC and streaming
            'webrtc',
            'rtmp://',
            'wss://',
            # Common CDNs for dynamic content
            'cloudfront.net',
            'fastly.net',
        }

        STREAMING_CONTENT_MARKERS = (
            'streaming',
            'video',
            'audio',
            'webm',
            'mp4',
            'event-stream',
            'websocket',
            'protobuf',
        )

        def on_request(request):
            nonlocal last_activity

            # Allow-list by resource type. This already excludes websocket,
            # media, eventsource, manifest and other, so the separate check
            # that used to follow could never trigger and was dropped.
            if request.resource_type not in RELEVANT_RESOURCE_TYPES:
                return

            # Filter out by URL patterns
            url = request.url.lower()
            if any(pattern in url for pattern in IGNORED_URL_PATTERNS):
                return

            # Filter out data URLs and blob URLs
            if url.startswith(('data:', 'blob:')):
                return

            # Filter out prefetches and audio/video fetches
            headers = request.headers
            if headers.get('purpose') == 'prefetch' or headers.get('sec-fetch-dest') in [
                'video',
                'audio',
            ]:
                return

            pending_requests.add(request)
            last_activity = time.time()

        def on_response(response):
            nonlocal last_activity

            request = response.request
            if request not in pending_requests:
                return

            # Every path below stops tracking the request; only a relevant,
            # reasonably sized response also counts as fresh activity.
            pending_requests.discard(request)

            content_type = response.headers.get('content-type', '').lower()

            # Skip if content type indicates streaming or real-time data
            if any(marker in content_type for marker in STREAMING_CONTENT_MARKERS):
                return

            # Only process relevant content types
            if not any(ct in content_type for ct in RELEVANT_CONTENT_TYPES):
                return

            # Skip if response is too large (likely not essential for page load).
            # A malformed header is ignored instead of raising inside the handler.
            content_length = response.headers.get('content-length')
            if content_length and content_length.strip().isdigit() \
                    and int(content_length) > 5 * 1024 * 1024:  # 5MB
                return

            last_activity = time.time()

        # Attach event listeners
        page.on('request', on_request)
        page.on('response', on_response)

        try:
            start_time = time.time()
            while True:
                # NOTE(review): if ``page`` is a sync-API Playwright page,
                # time.sleep may keep the handlers above from running during
                # the wait - confirm whether page.wait_for_timeout is needed.
                time.sleep(0.1)
                now = time.time()

                if not pending_requests and (now - last_activity) >= idle_wait_time:
                    break
                if now - start_time > max_wait_time:
                    logger.debug(
                        f'Network timeout after {max_wait_time}s with {len(pending_requests)} '
                        f'pending requests: {[r.url for r in pending_requests]}'
                    )
                    break
        finally:
            # Clean up event listeners
            page.remove_listener('request', on_request)
            page.remove_listener('response', on_response)

        logger.debug(f'Network stabilized for {idle_wait_time} seconds')

    @staticmethod
    def _enhanced_css_selector_for_element(element: DOMElementNode, include_dynamic_attributes: bool = True) -> str:
        """Creates a CSS selector for a DOM element, handling various edge cases and special characters.

        Args:
            element: The DOM element to create a selector for
            include_dynamic_attributes: When true, valid class names and the
                ``data-*`` test attributes are added to the selector as well

        Returns:
            A valid CSS selector string; if anything fails while building it,
            a ``tag[highlight_index='N']`` selector is returned instead
        """
        try:
            # Get base selector from XPath
            css_selector = DomUtil._convert_simple_xpath_to_css_selector(element.xpath)

            # Handle class attributes
            if 'class' in element.attributes and element.attributes['class'] and include_dynamic_attributes:
                for class_name in element.attributes['class'].split():
                    # Class names that would need escaping are skipped
                    if DomUtil._VALID_CLASS_NAME.match(class_name):
                        css_selector += f'.{class_name}'

            allowed_attributes = DomUtil._SAFE_ATTRIBUTES
            if include_dynamic_attributes:
                allowed_attributes = allowed_attributes | DomUtil._DYNAMIC_ATTRIBUTES

            # Handle other attributes
            for attribute, value in element.attributes.items():
                if attribute == 'class' or attribute not in allowed_attributes:
                    continue

                # Escape special characters in attribute names
                safe_attribute = attribute.replace(':', r'\:')

                # Handle different value cases
                if value == '':
                    css_selector += f'[{safe_attribute}]'
                elif any(char in value for char in '"\'<>`\n\r\t'):
                    # Use contains for values with special characters
                    # Regex-substitute *any* whitespace with a single space, then strip.
                    collapsed_value = re.sub(r'\s+', ' ', value).strip()
                    # Escape embedded double-quotes.
                    safe_value = collapsed_value.replace('"', '\\"')
                    css_selector += f'[{safe_attribute}*="{safe_value}"]'
                else:
                    css_selector += f'[{safe_attribute}="{value}"]'

            return css_selector

        except Exception:
            # Fallback to a more basic selector if something goes wrong
            tag_name = element.tag_name or '*'
            return f"{tag_name}[highlight_index='{element.highlight_index}']"

    @staticmethod
    def _convert_simple_xpath_to_css_selector(xpath: str) -> str:
        """Converts simple XPath expressions to CSS selectors.

        Steps are joined with the child combinator. ``[n]`` becomes
        ``:nth-of-type(n)``, ``[last()]`` becomes ``:last-of-type`` and
        ``[position()>1]`` becomes ``:nth-of-type(n+2)``; other predicates are
        dropped. An empty or missing XPath yields an empty string.
        """
        if not xpath:
            return ''

        css_parts = []
        # Leading slashes are removed; empty steps (from ``//``) are skipped.
        for part in xpath.lstrip('/').split('/'):
            if not part:
                continue

            if '[' not in part:
                css_parts.append(part)
                continue

            # Handle index notation [n], possibly several predicates in a row
            bracket_at = part.find('[')
            base_part = part[:bracket_at]
            predicates = [p.strip('[]') for p in part[bracket_at:].split(']')[:-1]]

            for predicate in predicates:
                if predicate.isdigit():
                    try:
                        # XPath and :nth-of-type are both 1-based.
                        base_part += f':nth-of-type({int(predicate)})'
                    except ValueError:
                        # Digit characters that int() cannot parse are ignored.
                        continue
                elif predicate == 'last()':
                    base_part += ':last-of-type'
                elif 'position()' in predicate and '>1' in predicate:
                    base_part += ':nth-of-type(n+2)'

            css_parts.append(base_part)

        return ' > '.join(css_parts)