Final_Assignment_AWorld

Sleeping

App Files Files Community

Duibonduil commited on Jun 30

Commit

ea7486e

verified ·

1 Parent(s): fa5256f

Upload 3 files

Browse files

Files changed (3) hide show

examples/tools/browsers/action/actions.py +824 -0
examples/tools/browsers/action/executor.py +70 -0
examples/tools/browsers/action/utils.py +507 -0

examples/tools/browsers/action/actions.py ADDED Viewed

	@@ -0,0 +1,824 @@

+# coding: utf-8
+# Copyright (c) 2025 inclusionAI.
+import os
+import traceback
+import asyncio
+import time
+from typing import Tuple, Any
+from examples.tools.tool_action import BrowserAction
+from aworld.core.tool.action_factory import ActionFactory
+from aworld.core.common import ActionModel, ActionResult, Observation
+from examples.tools.browsers.util.dom import DOMElementNode
+from aworld.logs.util import logger
+from examples.tools.browsers.action.utils import DomUtil
+from aworld.core.tool.action import ExecutableAction
+from aworld.utils import import_packages
+from aworld.models.llm import get_llm_model, call_llm_model
+def get_page(**kwargs):
+    tool = kwargs.get("tool")
+    if tool is None:
+        page = kwargs.get('page')
+    else:
+        page = tool.page
+    return page
+def get_browser(**kwargs):
+    tool = kwargs.get("tool")
+    if tool is None:
+        page = kwargs.get('browser')
+    else:
+        page = tool.context
+    return page
+@ActionFactory.register(name=BrowserAction.GO_TO_URL.value.name,
+                        desc=BrowserAction.GO_TO_URL.value.desc,
+                        tool_name="browser")
+class GotoUrl(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.GO_TO_URL.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.GO_TO_URL.name} page is none")
+            return ActionResult(content="no page", keep=True), page
+        params = action.params
+        url = params.get("url")
+        if not url:
+            logger.warning("empty url, go to nothing.")
+            return ActionResult(content="empty url", keep=True), page
+        items = url.split('://')
+        if len(items) == 1:
+            if items[0][0] != '/':
+                url = "file://" + os.path.join(os.getcwd(), url)
+        page.goto(url)
+        page.wait_for_load_state()
+        msg = f'Navigated to {url}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.GO_TO_URL.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.GO_TO_URL.name} page is none")
+            return ActionResult(content="no page", keep=True), page
+        url = action.params.get("url")
+        if not url:
+            logger.warning("empty url, go to nothing.")
+            return ActionResult(content="empty url", keep=True), page
+        items = url.split('://')
+        if len(items) == 1:
+            if items[0][0] != '/':
+                url = "file://" + os.path.join(os.getcwd(), url)
+        await page.goto(url)
+        await page.wait_for_load_state()
+        msg = f'Navigated to {url}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+@ActionFactory.register(name=BrowserAction.INPUT_TEXT.value.name,
+                        desc=BrowserAction.INPUT_TEXT.value.desc,
+                        tool_name="browser")
+class InputText(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.INPUT_TEXT.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.INPUT_TEXT.name} page is none")
+            return ActionResult(content="input text no page", keep=True), page
+        params = action.params
+        index = params.get("index", 0)
+        # compatible with int and str datatype
+        index = int(index)
+        input = params.get("text", "")
+        ob: Observation = kwargs.get("observation")
+        if not ob or index not in ob.dom_tree.element_map:
+            raise RuntimeError(f'Element index {index} does not exist')
+        if not input:
+            raise ValueError(f'No input to the page')
+        element_node = ob.dom_tree.element_map[index]
+        self.input_to_element(input, page, element_node)
+        msg = f'Input {input} into index {index}'
+        logger.info(f"action {msg}")
+        logger.debug(f'Element xpath: {element_node.xpath}')
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.INPUT_TEXT.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.INPUT_TEXT.name} page is none")
+            return ActionResult(content="input text no page", keep=True), page
+        params = action.params
+        index = params.get("index")
+        # compatible with int and str datatype
+        index = int(index)
+        input = params.get("text", "")
+        ob: Observation = kwargs.get("observation")
+        if not ob or index not in ob.dom_tree.element_map:
+            raise RuntimeError(f'Element index {index} does not exist')
+        if not input:
+            raise ValueError(f'No input to the page')
+        element_node = ob.dom_tree.element_map[index]
+        await self.async_input_to_element(input, page, element_node)
+        msg = f'Input {input} into index {index}'
+        logger.info(f"action {msg}")
+        logger.debug(f'Element xpath: {element_node.xpath}')
+        return ActionResult(content=msg, keep=True), page
+    def input_to_element(self, input: str, page, element_node: DOMElementNode):
+        try:
+            # Highlight before typing
+            # if element_node.highlight_index is not None:
+            # 	await self._update_state(focus_element=element_node.highlight_index)
+            element_handle = DomUtil.get_locate_element(page, element_node)
+            if element_handle is None:
+                raise RuntimeError(f'Element: {repr(element_node)} not found')
+            # Ensure element is ready for input
+            try:
+                element_handle.wait_for_element_state('stable', timeout=1000)
+                element_handle.scroll_into_view_if_needed(timeout=1000)
+            except Exception:
+                pass
+            # Get element properties to determine input method
+            is_contenteditable = element_handle.get_property('isContentEditable')
+            # Different handling for contenteditable vs input fields
+            if is_contenteditable.json_value():
+                element_handle.evaluate('el => el.textContent = ""')
+                element_handle.type(input, delay=5)
+            else:
+                element_handle.fill(input)
+        except Exception as e:
+            logger.warning(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
+            raise RuntimeError(f'Failed to input text into index {element_node.highlight_index}')
+    async def async_input_to_element(self, input: str, page, element_node: DOMElementNode):
+        try:
+            element_handle = await DomUtil.async_get_locate_element(page, element_node)
+            if element_handle is None:
+                raise RuntimeError(f'Element: {repr(element_node)} not found')
+            # Ensure element is ready for input
+            try:
+                await element_handle.wait_for_element_state('stable', timeout=1000)
+                await element_handle.scroll_into_view_if_needed(timeout=1000)
+            except Exception:
+                pass
+            # Get element properties to determine input method
+            is_contenteditable = await element_handle.get_property('isContentEditable')
+            # Different handling for contenteditable vs input fields
+            if await is_contenteditable.json_value():
+                await element_handle.evaluate('el => el.textContent = ""')
+                await element_handle.type(input, delay=5)
+            else:
+                await element_handle.fill(input)
+        except Exception as e:
+            logger.warning(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
+            raise RuntimeError(f'Failed to input text into index {element_node.highlight_index}')
+@ActionFactory.register(name=BrowserAction.CLICK_ELEMENT.value.name,
+                        desc=BrowserAction.CLICK_ELEMENT.value.desc,
+                        tool_name="browser")
+class ClickElement(ExecutableAction):
+    def __init__(self):
+        import_packages(['playwright', 'markdownify'])
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        from playwright.sync_api import BrowserContext
+        logger.info(f"exec {BrowserAction.CLICK_ELEMENT.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.CLICK_ELEMENT.name} page is none")
+            return ActionResult(content="input text no page", keep=True), page
+        browser: BrowserContext = get_browser(**kwargs)
+        if browser is None:
+            logger.warning(f"{BrowserAction.CLICK_ELEMENT.name} browser context is none")
+            return ActionResult(content="none browser context", keep=True), page
+        index = action.params.get("index")
+        # compatible with int and str datatype
+        index = int(index)
+        ob: Observation = kwargs.get("observation")
+        if not ob or index not in ob.dom_tree.element_map:
+            raise RuntimeError(f'Element index {index} does not exist')
+        if not input:
+            raise ValueError(f'No input to the page')
+        element_node = ob.dom_tree.element_map[index]
+        try:
+            pages = len(browser.pages)
+            msg = f'Clicked button with index {index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}'
+            logger.info(msg)
+            DomUtil.click_element(page, element_node, browser=browser)
+            logger.debug(f'Element xpath: {element_node.xpath}')
+            if len(browser.pages) > pages:
+                new_tab_msg = 'Open the new tab'
+                msg += f' - {new_tab_msg}'
+                logger.info(new_tab_msg)
+                page = browser.pages[-1]
+                page.bring_to_front()
+                page.wait_for_load_state(timeout=60000)
+            return ActionResult(content=msg, keep=True), page
+        except Exception as e:
+            logger.warning(f'Element not clickable with index {index} - most likely the page changed')
+            return ActionResult(error=str(e)), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.CLICK_ELEMENT.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warn(f"{BrowserAction.CLICK_ELEMENT.name} page is none")
+            return ActionResult(content="input text no page", keep=True), page
+        browser = get_browser(**kwargs)
+        if browser is None:
+            logger.warning(f"{BrowserAction.CLICK_ELEMENT.name} browser context is none")
+            return ActionResult(content="none browser context", keep=True), page
+        index = action.params.get("index")
+        # compatible with int and str datatype
+        index = int(index)
+        ob: Observation = kwargs.get("observation")
+        if not ob or index not in ob.dom_tree.element_map:
+            raise RuntimeError(f'Element index {index} does not exist')
+        if not input:
+            raise ValueError(f'No input to the page')
+        element_node = ob.dom_tree.element_map[index]
+        pages = len(browser.pages)
+        try:
+            await DomUtil.async_click_element(page, element_node, browser=browser)
+            msg = f'Clicked button with index {index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}'
+            logger.info(msg)
+            logger.debug(f'Element xpath: {element_node.xpath}')
+            if len(browser.pages) > pages:
+                new_tab_msg = 'Open the new tab'
+                msg += f' - {new_tab_msg}'
+                logger.info(new_tab_msg)
+                page = browser.pages[-1]
+                await page.bring_to_front()
+                await page.wait_for_load_state(timeout=60000)
+            return ActionResult(content=msg, keep=True), page
+        except Exception as e:
+            logger.warning(f'Element not clickable with index {index} - most likely the page changed')
+            return ActionResult(error=str(e)), page
+# SEARCH_ENGINE = {"": "https://www.google.com/search?udm=14&q=",
+#                  "google": "https://www.google.com/search?udm=14&q="}
+SEARCH_ENGINE = {"": "https://www.bing.com/search?q=",
+                 "google": "https://www.bing.com/search?q="}
+@ActionFactory.register(name=BrowserAction.SEARCH.value.name,
+                        desc=BrowserAction.SEARCH.value.desc,
+                        tool_name="browser")
+class Search(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SEARCH.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SEARCH.name} page is none")
+            return ActionResult(content="search no page", keep=True), page
+        params = action.params if action.params else {}
+        engine = params.get("engine", "")
+        url = SEARCH_ENGINE.get(engine)
+        query = params.get("query")
+        page.goto(f'{url}{query}')
+        page.wait_for_load_state()
+        msg = f'Searched for "{query}" in {url}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SEARCH.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SEARCH.name} page is none")
+            return ActionResult(content="search no page", keep=True), page
+        params = action.params if action.params else {}
+        engine = params.get("engine", "")
+        url = SEARCH_ENGINE.get(engine)
+        query = params.get("query")
+        await page.goto(f'{url}{query}')
+        await page.wait_for_load_state()
+        msg = f'Searched for "{query}" in {url}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+@ActionFactory.register(name=BrowserAction.SEARCH_GOOGLE.value.name,
+                        desc=BrowserAction.SEARCH_GOOGLE.value.desc,
+                        tool_name="browser")
+class SearchGoogle(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SEARCH_GOOGLE.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SEARCH_GOOGLE.name} page is none")
+            return ActionResult(content="search no page", keep=True), page
+        query = action.params.get("query")
+        page.goto(f'{SEARCH_ENGINE.get("")}{query}')
+        page.wait_for_load_state()
+        msg = f'Searched for "{query}" in Google'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SEARCH_GOOGLE.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SEARCH_GOOGLE.name} page is none")
+            return ActionResult(content="search no page", keep=True), page
+        query = action.params.get("query")
+        await page.goto(f'{SEARCH_ENGINE.get("")}{query}')
+        await page.wait_for_load_state()
+        msg = f'Searched for "{query}" in Google'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+@ActionFactory.register(name=BrowserAction.NEW_TAB.value.name,
+                        desc=BrowserAction.NEW_TAB.value.desc,
+                        tool_name="browser")
+class NewTab(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.NEW_TAB.value.name} action")
+        browser = get_browser(**kwargs)
+        url = action.params.get("url")
+        new_page = browser.new_page()
+        new_page.wait_for_load_state()
+        if url:
+            new_page.goto(url)
+            DomUtil.wait_for_stable_network(new_page)
+        msg = f'Opened new tab with {url}'
+        logger.debug(msg)
+        return ActionResult(content=msg, keep=True), new_page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.NEW_TAB.value.name} action")
+        browser = get_browser(**kwargs)
+        url = action.params.get("url")
+        new_page = await browser.new_page()
+        await new_page.wait_for_load_state()
+        if url:
+            await new_page.goto(url)
+            DomUtil.wait_for_stable_network(new_page)
+        msg = f'Opened new tab with {url}'
+        logger.debug(msg)
+        return ActionResult(content=msg, keep=True), get_page(**kwargs)
+@ActionFactory.register(name=BrowserAction.GO_BACK.value.name,
+                        desc=BrowserAction.GO_BACK.value.desc,
+                        tool_name="browser")
+class GoBack(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.GO_BACK.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.GO_BACK.name} page is none")
+            return ActionResult(content="search no page", keep=True), page
+        page.go_back()
+        msg = 'Navigated back'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.GO_BACK.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.GO_BACK.name} page is none")
+            return ActionResult(content="search no page", keep=True), page
+        await page.go_back()
+        msg = 'Navigated back'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+@ActionFactory.register(name=BrowserAction.EXTRACT_CONTENT.value.name,
+                        desc=BrowserAction.EXTRACT_CONTENT.value.desc,
+                        tool_name="browser")
+class ExtractContent(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        import markdownify
+        from langchain_core.prompts import PromptTemplate
+        logger.info(f"exec {BrowserAction.EXTRACT_CONTENT.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.EXTRACT_CONTENT.name} page is none")
+            return ActionResult(content="extract content no page", keep=True), page
+        goal = action.params.get("goal")
+        llm_config = kwargs.get("llm_config")
+        if llm_config and llm_config.llm_api_key:
+            llm = get_llm_model(llm_config)
+        max_extract_content_output_tokens = kwargs.get("max_extract_content_output_tokens")
+        max_extract_content_input_tokens = kwargs.get("max_extract_content_input_tokens")
+        content = markdownify.markdownify(page.content())
+        # Truncate content if it exceeds max input tokens
+        if max_extract_content_input_tokens and len(content) > max_extract_content_input_tokens:
+            logger.warning(
+                f"Content length ({len(content)}) exceeds max input tokens ({max_extract_content_input_tokens}). Truncating content.")
+            content = content[:max_extract_content_input_tokens]
+        prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
+        prompt_with_outputlimit = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page} \n\n#The length of the returned result must be less than {max_extract_content_output_tokens} characters.'
+        template = PromptTemplate(input_variables=['goal', 'page'], template=prompt)
+        messages = [{'role': 'user', 'content': template.format(goal=goal, page=content)}]
+        try:
+            output = call_llm_model(llm,
+                                    messages=messages,
+                                    model=llm_config.llm_model_name,
+                                    temperature=llm_config.llm_temperature)
+            result_content = output.content
+            # Check if output exceeds the token limit and retry with length-limited prompt if needed
+            if max_extract_content_output_tokens and len(result_content) > max_extract_content_output_tokens:
+                logger.warning(
+                    f"Output exceeds maximum length ({len(result_content)} > {max_extract_content_output_tokens}). Retrying with limited prompt.")
+                template_with_limit = PromptTemplate(
+                    input_variables=['goal', 'page', 'max_extract_content_output_tokens'],
+                    template=prompt_with_outputlimit
+                )
+                messages = [{'role': 'user', 'content': template_with_limit.format(
+                    goal=goal,
+                    page=content,
+                    max_extract_content_output_tokens=max_extract_content_output_tokens,
+                    max_tokens=max_extract_content_output_tokens
+                )}]
+                # extract content with length limit
+                output = call_llm_model(llm,
+                                        messages=messages,
+                                        model=llm_config.llm_model_name,
+                                        temperature=llm_config.llm_temperature)
+                result_content = output.content
+            msg = f'Extracted from page\n: {result_content}\n'
+            logger.info(msg)
+            return ActionResult(content=msg, keep=True), page
+        except Exception as e:
+            logger.debug(f'Error extracting content: {e}')
+            msg = f'Extracted from page\n: {content}\n'
+            logger.info(msg)
+            return ActionResult(content=msg), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        import markdownify
+        from langchain_core.prompts import PromptTemplate
+        logger.info(f"exec {BrowserAction.EXTRACT_CONTENT.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.EXTRACT_CONTENT.name} page is none")
+            return ActionResult(content="extract content no page", keep=True), page
+        goal = action.params.get("goal")
+        llm_config = kwargs.get("llm_config")
+        if llm_config and llm_config.llm_api_key:
+            llm = get_llm_model(llm_config)
+        content = markdownify.markdownify(await page.content())
+        max_extract_content_output_tokens = kwargs.get("max_extract_content_output_tokens")
+        max_extract_content_input_tokens = kwargs.get("max_extract_content_input_tokens")
+        # Truncate content if it exceeds max input tokens
+        if max_extract_content_input_tokens and len(content) > max_extract_content_input_tokens:
+            logger.warning(
+                f"Content length ({len(content)}) exceeds max input tokens ({max_extract_content_input_tokens}). Truncating content.")
+            content = content[:max_extract_content_input_tokens]
+        prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
+        prompt_with_outputlimit = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page} \n\n#The length of the returned result must be less than {max_extract_content_output_tokens} characters.'
+        template = PromptTemplate(input_variables=['goal', 'page'], template=prompt)
+        messages = [{'role': 'user', 'content': template.format(goal=goal, page=content)}]
+        try:
+            output = call_llm_model(llm,
+                                    messages=messages,
+                                    model=llm_config.llm_model_name,
+                                    temperature=llm_config.llm_temperature)
+            result_content = output.content
+            # Check if output exceeds the token limit and retry with length-limited prompt if needed
+            if max_extract_content_output_tokens and len(result_content) > max_extract_content_output_tokens:
+                logger.info(
+                    f"Output exceeds maximum length ({len(result_content)} > {max_extract_content_output_tokens}). Retrying with limited prompt.")
+                template_with_limit = PromptTemplate(
+                    input_variables=['goal', 'page', 'max_extract_content_output_tokens'],
+                    template=prompt_with_outputlimit
+                )
+                messages = [{'role': 'user', 'content': template_with_limit.format(
+                    goal=goal,
+                    page=content,
+                    max_extract_content_output_tokens=max_extract_content_output_tokens,
+                    max_tokens=max_extract_content_output_tokens
+                )}]
+                # extract content with length limit
+                output = call_llm_model(llm,
+                                        messages=messages,
+                                        model=llm_config.llm_model_name,
+                                        temperature=llm_config.llm_temperature)
+                result_content = output.content
+            msg = f'Extracted from page\n: {result_content}\n'
+            logger.info(msg)
+            return ActionResult(content=msg, keep=True), page
+        except Exception as e:
+            logger.debug(f'Error extracting content: {e}')
+            msg = f'Extracted from page\n: {content}\n'
+            logger.info(msg)
+            return ActionResult(content=msg), page
+@ActionFactory.register(name=BrowserAction.SCROLL_DOWN.value.name,
+                        desc=BrowserAction.SCROLL_DOWN.value.desc,
+                        tool_name="browser")
+class ScrollDown(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SCROLL_DOWN.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SCROLL_DOWN.name} page is none")
+            return ActionResult(content="scroll no page", keep=True), page
+        amount = action.params.get("amount")
+        if not amount:
+            page.evaluate('window.scrollBy(0, window.innerHeight);')
+        else:
+            amount = int(amount)
+            page.evaluate(f'window.scrollBy(0, {amount});')
+        amount = f'{amount} pixels' if amount else 'one page'
+        msg = f'Scrolled down the page by {amount}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SCROLL_DOWN.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SCROLL_DOWN.name} page is none")
+            return ActionResult(content="scroll no page", keep=True), page
+        amount = action.params.get("amount")
+        if not amount:
+            await page.evaluate('window.scrollBy(0, window.innerHeight);')
+        else:
+            amount = int(amount)
+            await page.evaluate(f'window.scrollBy(0, {amount});')
+        amount = f'{amount} pixels' if amount else 'one page'
+        msg = f'Scrolled down the page by {amount}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+@ActionFactory.register(name=BrowserAction.SCROLL_UP.value.name,
+                        desc=BrowserAction.SCROLL_UP.value.desc,
+                        tool_name="browser")
+class ScrollUp(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SCROLL_UP.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SCROLL_UP.name} page is none")
+            return ActionResult(content="scroll no page", keep=True), page
+        amount = action.params.get("amount")
+        if not amount:
+            page.evaluate('window.scrollBy(0, -window.innerHeight);')
+        else:
+            amount = int(amount)
+            page.evaluate(f'window.scrollBy(0, -{amount});')
+        amount = f'{amount} pixels' if amount else 'one page'
+        msg = f'Scrolled down the page by {amount}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SCROLL_UP.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SCROLL_UP.name} page is none")
+            return ActionResult(content="scroll no page", keep=True), page
+        amount = action.params.get("amount")
+        if not amount:
+            await page.evaluate('window.scrollBy(0, -window.innerHeight);')
+        else:
+            amount = int(amount)
+            await page.evaluate(f'window.scrollBy(0, -{amount});')
+        amount = f'{amount} pixels' if amount else 'one page'
+        msg = f'Scrolled down the page by {amount}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+@ActionFactory.register(name=BrowserAction.WAIT.value.name,
+                        desc=BrowserAction.WAIT.value.desc,
+                        tool_name="browser")
+class Wait(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        seconds = action.params.get("seconds")
+        if not seconds:
+            seconds = action.params.get("duration", 0)
+        seconds = int(seconds)
+        msg = f'Waiting for {seconds} seconds'
+        logger.info(msg)
+        time.sleep(seconds)
+        return ActionResult(content=msg, keep=True), kwargs.get('page')
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        seconds = action.params.get("seconds")
+        if not seconds:
+            seconds = action.params.get("duration", 0)
+        seconds = int(seconds)
+        msg = f'Waiting for {seconds} seconds'
+        logger.info(msg)
+        await asyncio.sleep(seconds)
+        return ActionResult(content=msg, keep=True), kwargs.get('page')
+@ActionFactory.register(name=BrowserAction.SWITCH_TAB.value.name,
+                        desc=BrowserAction.SWITCH_TAB.value.desc,
+                        tool_name="browser")
+class SwitchTab(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SWITCH_TAB.value.name} action")
+        browser = get_browser(**kwargs)
+        if browser is None:
+            logger.warning(f"{BrowserAction.SWITCH_TAB.name} browser context is none")
+            return ActionResult(content="switch tab no browser context", keep=True), get_page(**kwargs)
+        page_id = action.params.get("page_id", 0)
+        page_id = int(page_id)
+        pages = browser.pages
+        if page_id >= len(pages):
+            raise RuntimeError(f'No tab found with page_id: {page_id}')
+        page = pages[page_id]
+        page.bring_to_front()
+        page.wait_for_load_state()
+        msg = f'Switched to tab {page_id}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SWITCH_TAB.value.name} action")
+        browser = get_browser(**kwargs)
+        if browser is None:
+            logger.warning(f"{BrowserAction.SWITCH_TAB.name} browser context is none")
+            return ActionResult(content="switch tab no browser context", keep=True), get_page(**kwargs)
+        page_id = action.params.get("page_id", 0)
+        page_id = int(page_id)
+        pages = browser.pages
+        if page_id >= len(pages):
+            raise RuntimeError(f'No tab found with page_id: {page_id}')
+        page = pages[page_id]
+        await page.bring_to_front()
+        await page.wait_for_load_state()
+        msg = f'Switched to tab {page_id}'
+        logger.info(msg)
+        return ActionResult(content=msg, keep=True), page
+@ActionFactory.register(name=BrowserAction.SEND_KEYS.value.name,
+                        desc=BrowserAction.SEND_KEYS.value.desc,
+                        tool_name="browser")
+class SendKeys(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SEND_KEYS.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SEND_KEYS.name} page is none")
+            return ActionResult(content="scroll no page", keep=True), page
+        keys = action.params.get("keys")
+        if not keys:
+            return ActionResult(success=False, content="no keys", keep=True), page
+        try:
+            page.keyboard.press(keys)
+        except Exception as e:
+            logger.warning(f"{keys} press fail. \n{traceback.format_exc()}")
+            raise e
+        return ActionResult(content=f"Sent keys: {keys}", keep=True), page
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.SEND_KEYS.value.name} action")
+        page = get_page(**kwargs)
+        if page is None:
+            logger.warning(f"{BrowserAction.SEND_KEYS.name} page is none")
+            return ActionResult(content="scroll no page", keep=True), page
+        keys = action.params.get("keys")
+        if not keys:
+            return ActionResult(success=False, content="no keys", keep=True), page
+        try:
+            await page.keyboard.press(keys)
+        except Exception as e:
+            logger.warning(f"{keys} press fail. \n{traceback.format_exc()}")
+            raise e
+        return ActionResult(content=f"Sent keys: {keys}", keep=True), page
+@ActionFactory.register(name=BrowserAction.WRITE_TO_FILE.value.name,
+                        desc=BrowserAction.WRITE_TO_FILE.value.desc,
+                        tool_name="browser")
+class WriteToFile(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        # 设置默认文件路径
+        file_path = "tmp_result.md"
+        # 检查参数中是否有file_path
+        if "file_path" in action.params:
+            file_path = action.params.get("file_path", "tmp_result.md")
+        # 检查参数中是否有file_name
+        elif "file_name" in action.params:
+            file_path = action.params.get("file_name", "tmp_result.md")
+        elif "filename" in action.params:
+            file_path = action.params.get("filename", "tmp_result.md")
+        content = action.params.get("content", "")
+        mode = action.params.get("mode", "a")  # Default to append mode
+        # 获取文件的绝对路径
+        abs_file_path = os.path.abspath(file_path)
+        try:
+            with open(file_path, mode, encoding='utf-8') as f:
+                f.write(content + '\n')
+            msg = f'Successfully wrote content to {abs_file_path}'
+            logger.info(msg)
+            return ActionResult(content=msg, keep=True), get_page(**kwargs)
+        except Exception as e:
+            error_msg = f'Failed to write to file {abs_file_path}: {str(e)}'
+            logger.error(error_msg)
+            return ActionResult(content=error_msg, keep=True, error=error_msg), get_page(**kwargs)
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        # For file operations, we don't need to make this asynchronous
+        return self.act(action, **kwargs)
+@ActionFactory.register(name=BrowserAction.DONE.value.name,
+                        desc=BrowserAction.DONE.value.desc,
+                        tool_name="browser")
+class Done(ExecutableAction):
+    def act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.DONE.value.name} action")
+        return ActionResult(is_done=True, success=True, content="done", keep=True), get_page(**kwargs)
+    async def async_act(self, action: ActionModel, **kwargs) -> Tuple[ActionResult, Any]:
+        logger.info(f"exec {BrowserAction.DONE.value.name} action")
+        return ActionResult(is_done=True, success=True, content="done", keep=True), get_page(**kwargs)

examples/tools/browsers/action/executor.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# coding: utf-8
+# Copyright (c) 2025 inclusionAI.
+from typing import Tuple, List, Any
+from aworld.core.tool.action_factory import ActionFactory
+from aworld.core.common import ActionModel, ActionResult, Observation
+from aworld.logs.util import logger
+from aworld.core.tool.base import Tool, ToolActionExecutor
+class BrowserToolActionExecutor(ToolActionExecutor):
+    def __init__(self, tool: Tool = None):
+        super(BrowserToolActionExecutor, self).__init__(tool)
+    def execute_action(self, actions: List[ActionModel], **kwargs) -> Tuple[
+        List[ActionResult], Any]:
+        """Execute the specified browser action sequence by agent policy.
+        Args:
+            actions: Tool action sequence.
+        Returns:
+            Browser page and action result list.
+        """
+        action_results = []
+        page = self.tool.page
+        for action in actions:
+            action_result, page = self._exec(action, **kwargs)
+            action_results.append(action_result)
+        return action_results, page
+    async def async_execute_action(self, actions: List[ActionModel], **kwargs) -> Tuple[
+        List[ActionResult], Any]:
+        """Execute the specified browser action sequence by agent policy.
+        Args:
+            actions: Tool action sequence.
+        Returns:
+            Browser page and action result list.
+        """
+        action_results = []
+        page = self.tool.page
+        for action in actions:
+            action_result, page = await self._async_exec(action, **kwargs)
+            action_results.append(action_result)
+        return action_results, page
+    def _exec(self, action_model: ActionModel, **kwargs):
+        action_name = action_model.action_name
+        if action_name not in ActionFactory:
+            raise ValueError(f'Action {action_name} not found')
+        action = ActionFactory(action_name)
+        action_result, page = action.act(action_model, page=self.tool.page, browser=self.tool.context, **kwargs)
+        logger.info(f"{action_name} execute finished")
+        return action_result, page
+    async def _async_exec(self, action_model: ActionModel, **kwargs):
+        action_name = action_model.action_name
+        if action_name not in ActionFactory:
+            action_name = action_model.tool_name + action_model.action_name
+            if action_name not in ActionFactory:
+                raise ValueError(f'Action {action_name} not found')
+        action = ActionFactory(action_name)
+        action_result, page = await action.async_act(action_model, page=self.tool.page, browser=self.tool.context, **kwargs)
+        logger.info(f"{action_name} execute finished")
+        return action_result, page

examples/tools/browsers/action/utils.py ADDED Viewed

	@@ -0,0 +1,507 @@

+# coding: utf-8
+# Copyright (c) 2025 inclusionAI.
+import re
+import time
+import traceback
+from typing import Optional
+from examples.tools.browsers.util.dom import DOMElementNode
+from aworld.logs.util import logger
+from aworld.utils import import_package
+class DomUtil:
+    def __init__(self):
+        import_package("playwright")
+    @staticmethod
+    async def async_click_element(page, element_node: DOMElementNode, **kwargs) -> Optional[str]:
+        from playwright.async_api import ElementHandle as AElementHandle, BrowserContext as ABrowserContext
+        try:
+            element_handle: AElementHandle = await DomUtil.async_get_locate_element(page, element_node)
+            if element_handle is None:
+                raise Exception(f'Element: {repr(element_node)} not found')
+            bound = await element_handle.bounding_box()
+            try:
+                # todo: iframe.
+                center_x = bound['x'] + bound['width'] / 2
+                center_y = bound['y'] + bound['height'] / 2
+                try:
+                    browser: ABrowserContext = kwargs.get('browser')
+                    async with browser.expect_page() as new_page_info:
+                        await page.mouse.click(center_x, center_y)
+                    await page.mouse.click(center_x, center_y)
+                    await page.wait_for_load_state()
+                except:
+                    logger.warning(traceback.format_exc())
+            except:
+                logger.info(f"click {element_handle}!!")
+                if await element_handle.text_content():
+                    browser: ABrowserContext = kwargs.get('browser')
+                    if browser:
+                        try:
+                            async with browser.expect_page() as new_page_info:
+                                await page.click(f"text={element_handle.text_content()}")
+                            page = await new_page_info.value
+                            await page.wait_for_load_state()
+                        except:
+                            logger.warning(traceback.format_exc())
+                    else:
+                        await element_handle.click()
+                        await page.wait_for_load_state()
+                else:
+                    await element_handle.click()
+                    await page.wait_for_load_state()
+        except Exception as e:
+            logger.error(traceback.format_exc())
+            raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}')
+    @staticmethod
+    def click_element(page, element_node: DOMElementNode, **kwargs) -> Optional[str]:
+        from playwright.sync_api import ElementHandle, BrowserContext
+        try:
+            element_handle: ElementHandle = DomUtil.get_locate_element(page, element_node)
+            if element_handle is None:
+                raise Exception(f'Element: {repr(element_node)} not found')
+            bound = element_handle.bounding_box()
+            try:
+                # todo: iframe.
+                center_x = bound['x'] + bound['width'] / 2
+                center_y = bound['y'] + bound['height'] / 2
+                try:
+                    browser: BrowserContext = kwargs.get('browser')
+                    with browser.expect_page() as new_page_info:
+                        page.mouse.click(center_x, center_y)
+                    page = new_page_info.value
+                    page.wait_for_load_state()
+                except:
+                    logger.warning(traceback.format_exc())
+            except:
+                logger.info(f"click {element_handle}!!")
+                if element_handle.text_content():
+                    browser: BrowserContext = kwargs.get('browser')
+                    if browser:
+                        try:
+                            with browser.expect_page() as new_page_info:
+                                page.click(f"text={element_handle.text_content()}")
+                            page = new_page_info.value
+                            page.wait_for_load_state()
+                        except:
+                            logger.warning(traceback.format_exc())
+                    else:
+                        element_handle.click()
+                        page.wait_for_load_state()
+                else:
+                    element_handle.click()
+                    page.wait_for_load_state()
+        except Exception as e:
+            logger.error(traceback.format_exc())
+            raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}')
+    @staticmethod
+    async def async_get_locate_element(current_frame, element: DOMElementNode):
+        # Start with the target element and collect all parents, return Optional[AElementHandle]
+        from playwright.async_api import FrameLocator as AFrameLocator
+        parents: list[DOMElementNode] = []
+        current = element
+        while current.parent is not None:
+            parent = current.parent
+            parents.append(parent)
+            current = parent
+        # Reverse the parents list to process from top to bottom
+        parents.reverse()
+        # Process all iframe parents in sequence
+        iframes = [item for item in parents if item.tag_name == 'iframe']
+        for parent in iframes:
+            css_selector = DomUtil._enhanced_css_selector_for_element(
+                parent,
+                include_dynamic_attributes=True,
+            )
+            current_frame = current_frame.frame_locator(css_selector)
+        css_selector = DomUtil._enhanced_css_selector_for_element(
+            element, include_dynamic_attributes=True
+        )
+        try:
+            if isinstance(current_frame, AFrameLocator):
+                element_handle = await current_frame.locator(css_selector).element_handle()
+                return element_handle
+            else:
+                # Try to scroll into view if hidden
+                element_handle = await current_frame.query_selector(css_selector)
+                if element_handle:
+                    await element_handle.scroll_into_view_if_needed()
+                    return element_handle
+                return None
+        except Exception as e:
+            logger.error(f'Failed to locate element: {str(e)}')
+            return None
+    @staticmethod
+    def get_locate_element(current_frame, element: DOMElementNode):
+        # Start with the target element and collect all parents
+        from playwright.sync_api import FrameLocator
+        parents: list[DOMElementNode] = []
+        current = element
+        while current.parent is not None:
+            parent = current.parent
+            parents.append(parent)
+            current = parent
+        # Reverse the parents list to process from top to bottom
+        parents.reverse()
+        # Process all iframe parents in sequence
+        iframes = [item for item in parents if item.tag_name == 'iframe']
+        for parent in iframes:
+            css_selector = DomUtil._enhanced_css_selector_for_element(
+                parent,
+                include_dynamic_attributes=True,
+            )
+            current_frame = current_frame.frame_locator(css_selector)
+        css_selector = DomUtil._enhanced_css_selector_for_element(
+            element, include_dynamic_attributes=True
+        )
+        try:
+            if isinstance(current_frame, FrameLocator):
+                element_handle = current_frame.locator(css_selector).element_handle()
+                return element_handle
+            else:
+                # Try to scroll into view if hidden
+                element_handle = current_frame.query_selector(css_selector)
+                if element_handle:
+                    element_handle.scroll_into_view_if_needed()
+                    return element_handle
+                return None
+        except Exception as e:
+            logger.error(f'Failed to locate element: {str(e)}')
+            return None
+    @staticmethod
+    def wait_for_stable_network(page, **kwargs):
+        pending_requests = set()
+        last_activity = time.time()
+        # Define relevant resource types and content types
+        RELEVANT_RESOURCE_TYPES = {
+            'document',
+            'stylesheet',
+            'image',
+            'font',
+            'script',
+            'iframe',
+        }
+        RELEVANT_CONTENT_TYPES = {
+            'text/html',
+            'text/css',
+            'application/javascript',
+            'image/',
+            'font/',
+            'application/json',
+        }
+        # Additional patterns to filter out
+        IGNORED_URL_PATTERNS = {
+            # Analytics and tracking
+            'analytics',
+            'tracking',
+            'telemetry',
+            'beacon',
+            'metrics',
+            # Ad-related
+            'doubleclick',
+            'adsystem',
+            'adserver',
+            'advertising',
+            # Social media widgets
+            'facebook.com/plugins',
+            'platform.twitter',
+            'linkedin.com/embed',
+            # Live chat and support
+            'livechat',
+            'zendesk',
+            'intercom',
+            'crisp.chat',
+            'hotjar',
+            # Push notifications
+            'push-notifications',
+            'onesignal',
+            'pushwoosh',
+            # Background sync/heartbeat
+            'heartbeat',
+            'ping',
+            'alive',
+            # WebRTC and streaming
+            'webrtc',
+            'rtmp://',
+            'wss://',
+            # Common CDNs for dynamic content
+            'cloudfront.net',
+            'fastly.net',
+        }
+        def on_request(request):
+            # Filter by resource type
+            if request.resource_type not in RELEVANT_RESOURCE_TYPES:
+                return
+            # Filter out streaming, websocket, and other real-time requests
+            if request.resource_type in {
+                'websocket',
+                'media',
+                'eventsource',
+                'manifest',
+                'other',
+            }:
+                return
+            # Filter out by URL patterns
+            url = request.url.lower()
+            if any(pattern in url for pattern in IGNORED_URL_PATTERNS):
+                return
+            # Filter out data URLs and blob URLs
+            if url.startswith(('data:', 'blob:')):
+                return
+            # Filter out requests with certain headers
+            headers = request.headers
+            if headers.get('purpose') == 'prefetch' or headers.get('sec-fetch-dest') in [
+                'video',
+                'audio',
+            ]:
+                return
+            nonlocal last_activity
+            pending_requests.add(request)
+            last_activity = time.time()
+        def on_response(response):
+            request = response.request
+            if request not in pending_requests:
+                return
+            # Filter by content type if available
+            content_type = response.headers.get('content-type', '').lower()
+            # Skip if content type indicates streaming or real-time data
+            if any(t in content_type
+                   for t in [
+                       'streaming',
+                       'video',
+                       'audio',
+                       'webm',
+                       'mp4',
+                       'event-stream',
+                       'websocket',
+                       'protobuf']):
+                pending_requests.remove(request)
+                return
+            # Only process relevant content types
+            if not any(ct in content_type for ct in RELEVANT_CONTENT_TYPES):
+                pending_requests.remove(request)
+                return
+            # Skip if response is too large (likely not essential for page load)
+            content_length = response.headers.get('content-length')
+            if content_length and int(content_length) > 5 * 1024 * 1024:  # 5MB
+                pending_requests.remove(request)
+                return
+            nonlocal last_activity
+            pending_requests.remove(request)
+            last_activity = time.time()
+        # Attach event listeners
+        page.on('request', on_request)
+        page.on('response', on_response)
+        try:
+            start_time = time.time()
+            while True:
+                time.sleep(0.1)
+                now = time.time()
+                if len(pending_requests) == 0 and (now - last_activity) >= kwargs.get('idle_wait_time', 0.5):
+                    break
+                if now - start_time > kwargs.get('max_wait_time', 5):
+                    logger.debug(
+                        f'Network timeout after {kwargs.get("max_wait_time", 5)}s with {len(pending_requests)} '
+                        f'pending requests: {[r.url for r in pending_requests]}'
+                    )
+                    break
+        finally:
+            # Clean up event listeners
+            page.remove_listener('request', on_request)
+            page.remove_listener('response', on_response)
+        logger.debug(f'Network stabilized for {kwargs.get("idle_wait_time", 0.5)} seconds')
+    @staticmethod
+    def _enhanced_css_selector_for_element(element: DOMElementNode, include_dynamic_attributes: bool = True) -> str:
+        """Creates a CSS selector for a DOM element, handling various edge cases and special characters.
+        Args:
+                element: The DOM element to create a selector for
+        Returns:
+                A valid CSS selector string
+        """
+        try:
+            # Get base selector from XPath
+            css_selector = DomUtil._convert_simple_xpath_to_css_selector(element.xpath)
+            # Handle class attributes
+            if 'class' in element.attributes and element.attributes['class'] and include_dynamic_attributes:
+                # Define a regex pattern for valid class names in CSS
+                valid_class_name_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_-]*$')
+                # Iterate through the class attribute values
+                classes = element.attributes['class'].split()
+                for class_name in classes:
+                    # Skip empty class names
+                    if not class_name.strip():
+                        continue
+                    # Check if the class name is valid
+                    if valid_class_name_pattern.match(class_name):
+                        # Append the valid class name to the CSS selector
+                        css_selector += f'.{class_name}'
+                    else:
+                        # Skip invalid class names
+                        continue
+            # Expanded set of safe attributes that are stable and useful for selection
+            SAFE_ATTRIBUTES = {
+                # Data attributes (if they're stable in your application)
+                'id',
+                # Standard HTML attributes
+                'name',
+                'type',
+                'placeholder',
+                # Accessibility attributes
+                'aria-label',
+                'aria-labelledby',
+                'aria-describedby',
+                'role',
+                # Common form attributes
+                'for',
+                'autocomplete',
+                'required',
+                'readonly',
+                # Media attributes
+                'alt',
+                'title',
+                'src',
+                # Custom stable attributes (add any application-specific ones)
+                'href',
+                'target',
+            }
+            if include_dynamic_attributes:
+                dynamic_attributes = {
+                    'data-id',
+                    'data-qa',
+                    'data-cy',
+                    'data-testid',
+                }
+                SAFE_ATTRIBUTES.update(dynamic_attributes)
+            # Handle other attributes
+            for attribute, value in element.attributes.items():
+                if attribute == 'class':
+                    continue
+                # Skip invalid attribute names
+                if not attribute.strip():
+                    continue
+                if attribute not in SAFE_ATTRIBUTES:
+                    continue
+                # Escape special characters in attribute names
+                safe_attribute = attribute.replace(':', r'\:')
+                # Handle different value cases
+                if value == '':
+                    css_selector += f'[{safe_attribute}]'
+                elif any(char in value for char in '"\'<>`\n\r\t'):
+                    # Use contains for values with special characters
+                    # Regex-substitute *any* whitespace with a single space, then strip.
+                    collapsed_value = re.sub(r'\s+', ' ', value).strip()
+                    # Escape embedded double-quotes.
+                    safe_value = collapsed_value.replace('"', '\\"')
+                    css_selector += f'[{safe_attribute}*="{safe_value}"]'
+                else:
+                    css_selector += f'[{safe_attribute}="{value}"]'
+            return css_selector
+        except Exception:
+            # Fallback to a more basic selector if something goes wrong
+            tag_name = element.tag_name or '*'
+            return f"{tag_name}[highlight_index='{element.highlight_index}']"
+    @staticmethod
+    def _convert_simple_xpath_to_css_selector(xpath: str) -> str:
+        """Converts simple XPath expressions to CSS selectors."""
+        if not xpath:
+            return ''
+        # Remove leading slash if present
+        xpath = xpath.lstrip('/')
+        # Split into parts
+        parts = xpath.split('/')
+        css_parts = []
+        for part in parts:
+            if not part:
+                continue
+            # Handle index notation [n]
+            if '[' in part:
+                base_part = part[: part.find('[')]
+                index_part = part[part.find('['):]
+                # Handle multiple indices
+                indices = [i.strip('[]') for i in index_part.split(']')[:-1]]
+                for idx in indices:
+                    try:
+                        # Handle numeric indices
+                        if idx.isdigit():
+                            index = int(idx) - 1
+                            base_part += f':nth-of-type({index + 1})'
+                        # Handle last() function
+                        elif idx == 'last()':
+                            base_part += ':last-of-type'
+                        # Handle position() functions
+                        elif 'position()' in idx:
+                            if '>1' in idx:
+                                base_part += ':nth-of-type(n+2)'
+                    except ValueError:
+                        continue
+                css_parts.append(base_part)
+            else:
+                css_parts.append(part)
+        base_selector = ' > '.join(css_parts)
+        return base_selector