Upload 6 files
- examples/android/README.md +3 -0
- examples/android/agent.py +252 -0
- examples/android/prompts.py +58 -0
- examples/android/requirements.txt +6 -0
- examples/android/run.py +34 -0
- examples/android/utils.py +277 -0
examples/android/README.md
ADDED
@@ -0,0 +1,3 @@
+# Android Agents
+
+Agents specialized in Android device automation.
examples/android/agent.py
ADDED
@@ -0,0 +1,252 @@
+# coding: utf-8
+# Copyright (c) 2025 inclusionAI.
+
+import json
+import time
+import traceback
+from typing import Dict, Any, Optional, List, Union
+
+from langchain_core.messages import HumanMessage, BaseMessage, SystemMessage
+
+from examples.android.prompts import SYSTEM_PROMPT, LAST_STEP_PROMPT
+from examples.android.utils import (
+    AgentState,
+    AgentHistory,
+    AgentHistoryList,
+    ActionResult,
+    PolicyMetadata,
+    AgentBrain,
+    Trajectory
+)
+from examples.browsers.common import AgentStepInfo
+from aworld.config.conf import AgentConfig, ConfigDict
+from aworld.core.agent.base import AgentResult
+from aworld.agents.llm_agent import Agent
+from aworld.core.common import Observation, ActionModel, ToolActionInfo
+from aworld.logs.util import logger
+from examples.tools.tool_action import AndroidAction
+
+
+class AndroidAgent(Agent):
+    def __init__(self, conf: Union[Dict[str, Any], ConfigDict, AgentConfig], **kwargs):
+        super(AndroidAgent, self).__init__(conf, **kwargs)
+        provider = self.conf.llm_config.llm_provider if self.conf.llm_config.llm_provider else self.conf.llm_provider
+        if self.conf.llm_config.llm_provider:
+            self.conf.llm_config.llm_provider = "chat" + provider
+        else:
+            self.conf.llm_provider = "chat" + provider
+        self.available_actions_desc = self._build_action_prompt()
+        # Settings
+        self.settings = self.conf
+
+    def reset(self, options: Dict[str, Any]):
+        super(AndroidAgent, self).__init__(options)
+        # State
+        self.state = AgentState()
+        # History
+        self.history = AgentHistoryList(history=[])
+        self.trajectory = Trajectory(history=[])
+
+    def _build_action_prompt(self) -> str:
+        def _prompt(info: ToolActionInfo) -> str:
+            s = f'{info.desc}:\n'
+            s += '{' + str(info.name) + ': '
+            if info.input_params:
+                s += str({k: {"title": k, "type": v} for k, v in info.input_params.items()})
+            s += '}'
+            return s
+
+        # Iterate over all android actions
+        val = "\n".join([_prompt(v.value) for k, v in AndroidAction.__members__.items()])
+        return val
+
+    def policy(self,
+               observation: Observation,
+               info: Dict[str, Any] = None,
+               **kwargs) -> Union[List[ActionModel], None]:
+        self._finished = False
+        step_info = AgentStepInfo(number=self.state.n_steps, max_steps=self.conf.max_steps)
+        last_step_msg = None
+        if step_info and step_info.is_last_step():
+            # Add last step warning if needed
+            last_step_msg = HumanMessage(
+                content=LAST_STEP_PROMPT)
+            logger.info('Last step finishing up')
+
+        logger.info(f'[agent] 📍 Step {self.state.n_steps}')
+        step_start_time = time.time()
+
+        try:
+
+            xml_content, base64_img = observation.dom_tree, observation.image
+
+            if xml_content is None:
+                logger.error("[agent] ⚠ Failed to get UI state, stopping task")
+                self.stop()
+                return None
+
+            self.state.last_result = (xml_content, base64_img if base64_img else "")
+
+            logger.info("[agent] 🤖 Analyzing current state with LLM...")
+            a_step_msg = HumanMessage(content=[
+                {
+                    "type": "text",
+                    "text": f"""
+Task: {self.task}
+Current Step: {self.state.n_steps}
+
+Please analyze the current interface and decide the next action. Please directly return the response in JSON format without any other text or code block markers.
+"""
+                },
+                {
+                    "type": "image_url",
+                    # base64 screenshot extracted from the observation above
+                    "image_url": f"data:image/jpeg;base64,{base64_img}"
+                }
+            ])
+
+            messages = [SystemMessage(content=SYSTEM_PROMPT)]
+            if last_step_msg:
+                messages.append(last_step_msg)
+            messages.append(a_step_msg)
+
+            logger.info(f"[agent] VLM Input last message: {messages[-1]}")
+            llm_result = None
+            try:
+                llm_result = self._do_policy(messages)
+
+                if self.state.stopped or self.state.paused:
+                    logger.info('Android agent paused after getting state')
+                    return [ActionModel(tool_name='android', action_name="stop")]
+
+                tool_action = llm_result.actions
+
+                step_metadata = PolicyMetadata(
+                    start_time=step_start_time,
+                    end_time=time.time(),
+                    number=self.state.n_steps,
+                    input_tokens=1
+                )
+
+                history_item = AgentHistory(
+                    result=[ActionResult(success=True)],
+                    metadata=step_metadata,
+                    content=xml_content,
+                    base64_img=base64_img
+                )
+                self.history.history.append(history_item)
+
+                if self.settings.save_history and self.settings.history_path:
+                    self.history.save_to_file(self.settings.history_path)
+
+                logger.info(f'📍 Step {self.state.n_steps} starts to execute')
+
+                self.state.n_steps += 1
+                self.state.consecutive_failures = 0
+                return tool_action
+
+            except Exception as e:
+                logger.warning(traceback.format_exc())
+                raise RuntimeError("Android agent encountered exception while making the policy.", e)
+            finally:
+                if llm_result:
+                    self.trajectory.add_step(observation, info, llm_result)
+                    metadata = PolicyMetadata(
+                        number=self.state.n_steps,
+                        start_time=step_start_time,
+                        end_time=time.time(),
+                        input_tokens=1
+                    )
+                    self._make_history_item(llm_result, observation, metadata)
+                else:
+                    logger.warning("no result to record!")
+
+        except json.JSONDecodeError as e:
+            logger.error("[agent] ❌ JSON parsing error")
+            raise
+        except Exception as e:
+            logger.error(f"[agent] ❌ Action execution error: {str(e)}")
+            raise
+
+    def _do_policy(self, input_messages: list[BaseMessage]) -> AgentResult:
+        response = self.llm.invoke(input_messages)
+        content = response.content
+
+        if content.startswith("```json"):
+            content = content[7:]
+        if content.startswith("```"):
+            content = content[3:]
+        if content.endswith("```"):
+            content = content[:-3]
+        content = content.strip()
+
+        action_data = json.loads(content)
+        brain_state = AgentBrain(**action_data["current_state"])
+
+        logger.info(f"[agent] ⚠ Eval: {brain_state.evaluation_previous_goal}")
+        logger.info(f"[agent] 🧠 Memory: {brain_state.memory}")
+        logger.info(f"[agent] 🎯 Next goal: {brain_state.next_goal}")
+
+        actions = action_data.get('action')
+        result = []
+        if not actions:
+            actions = action_data.get("actions")
+
+        # print actions
+        logger.info(f"[agent] VLM Output actions: {actions}")
+        for action in actions:
+            action_type = action.get('type')
+            if not action_type:
+                logger.warning(f"Action missing type: {action}")
+                continue
+
+            params = {}
+            if 'type' == action_type:
+                action_type = 'input_text'
+            if 'params' in action:
+                params = action['params']
+            if 'index' in action:
+                params['index'] = action['index']
+            if 'type' in action:
+                params['type'] = action['type']
+            if 'text' in action:
+                params['text'] = action['text']
+
+            action_model = ActionModel(
+                tool_name='android',
+                action_name=action_type,
+                params=params
+            )
+            result.append(action_model)
+
+        return AgentResult(current_state=brain_state, actions=result)
+
+    def _make_history_item(self,
+                           model_output: AgentResult | None,
+                           state: Observation,
+                           metadata: Optional[PolicyMetadata] = None) -> None:
+        if isinstance(state, dict):
+            state = Observation(**state)
+
+        history_item = AgentHistory(
+            model_output=model_output,
+            result=state.action_result,
+            metadata=metadata,
+            content=state.dom_tree,
+            base64_img=state.image
+        )
+        self.state.history.history.append(history_item)
+
+    def pause(self) -> None:
+        """Pause the agent"""
+        logger.info('🔄 Pausing Agent')
+        self.state.paused = True
+
+    def resume(self) -> None:
+        """Resume the agent"""
+        logger.info('▶️ Agent resuming')
+        self.state.paused = False
+
+    def stop(self) -> None:
+        """Stop the agent"""
+        logger.info('⏹️ Agent stopping')
+        self.state.stopped = True
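For reference, a standalone sketch (plain Python, no aworld imports) of the response handling that `_do_policy` performs: stripping optional code fences, parsing the JSON, and normalizing each action entry into a tool-call name plus params. The sample reply below is illustrative only, not output from a real model:

import json

# Example of the kind of reply the VLM is asked to produce (illustrative only).
raw_reply = """```json
{
  "current_state": {
    "evaluation_previous_goal": "Home screen is visible",
    "memory": "Looking for the rednote app icon",
    "next_goal": "Tap the rednote icon"
  },
  "action": [
    {"type": "tap", "index": 3}
  ]
}
```"""

# Strip optional markdown code fences, mirroring _do_policy.
content = raw_reply
if content.startswith("```json"):
    content = content[7:]
if content.startswith("```"):
    content = content[3:]
if content.endswith("```"):
    content = content[:-3]
content = content.strip()

data = json.loads(content)
# "action" is preferred; "actions" is accepted as a fallback key.
actions = data.get("action") or data.get("actions") or []

for action in actions:
    action_type = action.get("type")
    params = dict(action.get("params", {}))
    for key in ("index", "type", "text"):
        if key in action:
            params[key] = action[key]
    print(action_type, params)  # e.g. tap {'index': 3, 'type': 'tap'}

In the agent itself, each (action_type, params) pair becomes an ActionModel with tool_name='android'.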
examples/android/prompts.py
ADDED
@@ -0,0 +1,58 @@
+SYSTEM_PROMPT = """
+You are an Android device automation assistant. Your task is to help users perform various operations on Android devices.
+You can perform the following actions:
+1. Tap Element (tap) - Requires parameter: index (element number)
+2. Input Text (input_text) - Requires parameter: text (text content to input)
+3. Long Press Element (long_press) - Requires parameter: index (element number)
+4. Swipe Element (swipe) - Requires parameters: index (element number), params.direction (direction: "up", "down", "left", "right"), params.dist (distance: "short", "medium", "long", optional, default is "medium")
+5. Task Completion (done) - Requires parameter: success (whether the task was successfully completed, values are true/false)
+
+Each interactive element has a number. You need to perform operations based on the element numbers displayed on the interface. Element numbers start from 1; 0 is not a valid element number. The current interface's XML and screenshot will be your input. Please carefully analyze the interface elements and choose the correct operation.
+
+Important Note: Please directly return the response in JSON format without any other text, explanations, or code block markers. The response must be a valid JSON object, formatted as follows:
+
+{
+    "current_state": {
+        "evaluation_previous_goal": "Analyze the result of the previous step",
+        "memory": "Remember important context information",
+        "next_goal": "The specific goal to execute next"
+    },
+    "action": [
+        {
+            "type": "tap",
+            "index": "Element number"
+        },
+        {
+            "type": "input_text",
+            "text": "Text content to input"
+        },
+        {
+            "type": "long_press",
+            "index": "Element number"
+        },
+        {
+            "type": "swipe",
+            "index": "Element number",
+            "params": {
+                "direction": "Swipe direction (up/down/left/right)",
+                "dist": "Swipe distance (short/medium/long, optional)"
+            }
+        },
+        {
+            "type": "done",
+            "success": "Whether the task was successfully completed (true/false)"
+        }
+    ]
+}
+
+Note:
+The index must be a valid integer starting from 1
+Do not add any other text or markers before or after the JSON
+Ensure the JSON format is entirely correct
+Each action type must include all necessary required parameters
+"""
+
+LAST_STEP_PROMPT = """Now comes your last step. Use only the "done" action now. No other actions - so here your action sequence must have length 1.
+If the task is not yet fully finished as requested by the user, set success in "done" to false! E.g. if not all steps are fully completed.
+If the task is fully finished, set success in "done" to true.
+Include everything you found out for the ultimate task in the done text."""
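To make the response contract above concrete, here is a small, hypothetical checker (not part of this upload) that restates the required parameters from SYSTEM_PROMPT and flags malformed responses:

# Required parameters per action type, restated from the prompt text above.
REQUIRED_PARAMS = {
    "tap": {"index"},
    "input_text": {"text"},
    "long_press": {"index"},
    "swipe": {"index"},
    "done": {"success"},
}

def check_response(data: dict) -> list[str]:
    """Return a list of problems found in a parsed model response (empty if it looks valid)."""
    problems = []
    state = data.get("current_state", {})
    for key in ("evaluation_previous_goal", "memory", "next_goal"):
        if key not in state:
            problems.append(f"current_state missing '{key}'")
    for i, action in enumerate(data.get("action", [])):
        action_type = action.get("type")
        if action_type not in REQUIRED_PARAMS:
            problems.append(f"action {i}: unknown type {action_type!r}")
            continue
        missing = REQUIRED_PARAMS[action_type] - set(action)
        if missing:
            problems.append(f"action {i} ({action_type}): missing {sorted(missing)}")
    return problems

# Example: a well-formed single-action response passes the check.
sample = {
    "current_state": {
        "evaluation_previous_goal": "App list visible",
        "memory": "Searching for the target app",
        "next_goal": "Tap element 2",
    },
    "action": [{"type": "tap", "index": 2}],
}
assert check_response(sample) == []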
examples/android/requirements.txt
ADDED
@@ -0,0 +1,6 @@
+langchain~=0.3.20
+langchain-openai~=0.3.8
+langchain-ollama~=0.2.3
+langchain-anthropic~=0.3.9
+langchain-mistralai~=0.2.7
+langchain-google-genai~=2.1.0
examples/android/run.py
ADDED
@@ -0,0 +1,34 @@
+# coding: utf-8
+# Copyright (c) 2025 inclusionAI.
+
+from aworld.config import AgentConfig
+from examples.android.agent import AndroidAgent
+from examples.tools.common import Agents, Tools
+from aworld.core.task import Task
+from aworld.runner import Runners
+from examples.tools.conf import AndroidToolConfig
+
+
+def main():
+    android_tool_config = AndroidToolConfig(avd_name='8ABX0PHWU',
+                                            headless=False,
+                                            max_retry=2)
+
+    agent_config: AgentConfig = AgentConfig(
+        name=Agents.ANDROID.value,
+        llm_provider="openai",
+        llm_model_name="gpt-4o",
+        llm_temperature=1,
+    )
+    agent = AndroidAgent(name=Agents.ANDROID.value, conf=agent_config)
+
+    task = Task(
+        input="""open rednote""",
+        agent=agent,
+        tools_conf={Tools.ANDROID.value: android_tool_config}
+    )
+    Runners.sync_run_task(task)
+
+
+if __name__ == '__main__':
+    main()
examples/android/utils.py
ADDED
@@ -0,0 +1,277 @@
+# coding: utf-8
+
+import json
+import traceback
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Optional, Dict, List
+
+from langchain_core.load import dumpd, load
+from langchain_core.messages import BaseMessage, AIMessage, ToolMessage, SystemMessage, HumanMessage
+from openai import RateLimitError
+from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator
+
+from aworld.core.agent.base import AgentResult
+from aworld.core.common import ActionResult, Observation
+
+
+class MessageMetadata(BaseModel):
+    """Metadata for a message"""
+
+    tokens: int = 0
+
+
+class ManagedMessage(BaseModel):
+    """A message with its metadata"""
+
+    message: BaseMessage
+    metadata: MessageMetadata = Field(default_factory=MessageMetadata)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    # https://github.com/pydantic/pydantic/discussions/7558
+    @model_serializer(mode='wrap')
+    def to_json(self, original_dump):
+        """
+        Returns the JSON representation of the model.
+
+        It uses langchain's `dumps` function to serialize the `message`
+        property before encoding the overall dict with json.dumps.
+        """
+        data = original_dump(self)
+
+        # NOTE: We override the message field to use langchain JSON serialization.
+        data['message'] = dumpd(self.message)
+
+        return data
+
+    @model_validator(mode='before')
+    @classmethod
+    def validate(
+        cls,
+        value: Any,
+        *,
+        strict: bool | None = None,
+        from_attributes: bool | None = None,
+        context: Any | None = None,
+    ) -> Any:
+        """
+        Custom validator that uses langchain's `loads` function
+        to parse the message if it is provided as a JSON string.
+        """
+        if isinstance(value, dict) and 'message' in value:
+            # NOTE: We use langchain's load to convert the JSON string back into a BaseMessage object.
+            value['message'] = load(value['message'])
+        return value
+
+
+class MessageHistory(BaseModel):
+    """History of messages with metadata"""
+
+    messages: list[ManagedMessage] = Field(default_factory=list)
+    current_tokens: int = 0
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
+        """Add message with metadata to history"""
+        if position is None:
+            self.messages.append(ManagedMessage(message=message, metadata=metadata))
+        else:
+            self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
+        self.current_tokens += metadata.tokens
+
+    def add_model_output(self, output) -> None:
+        """Add model output as AI message"""
+        tool_calls = [
+            {
+                'name': 'AgentOutput',
+                'args': output.model_dump(mode='json', exclude_unset=True),
+                'id': '1',
+                'type': 'tool_call',
+            }
+        ]
+
+        msg = AIMessage(
+            content='',
+            tool_calls=tool_calls,
+        )
+        self.add_message(msg, MessageMetadata(tokens=100))  # Estimate tokens for tool calls
+
+        # Empty tool response
+        tool_message = ToolMessage(content='', tool_call_id='1')
+        self.add_message(tool_message, MessageMetadata(tokens=10))  # Estimate tokens for empty response
+
+    def get_messages(self) -> list[BaseMessage]:
+        """Get all messages"""
+        return [m.message for m in self.messages]
+
+    def get_total_tokens(self) -> int:
+        """Get total tokens in history"""
+        return self.current_tokens
+
+    def remove_oldest_message(self) -> None:
+        """Remove oldest non-system message"""
+        for i, msg in enumerate(self.messages):
+            if not isinstance(msg.message, SystemMessage):
+                self.current_tokens -= msg.metadata.tokens
+                self.messages.pop(i)
+                break
+
+    def remove_last_state_message(self) -> None:
+        """Remove last state message from history"""
+        if len(self.messages) > 2 and isinstance(self.messages[-1].message, HumanMessage):
+            self.current_tokens -= self.messages[-1].metadata.tokens
+            self.messages.pop()
+
+
+class MessageManagerState(BaseModel):
+    """Holds the state for MessageManager"""
+
+    history: MessageHistory = Field(default_factory=MessageHistory)
+    tool_id: int = 1
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class AgentSettings(BaseModel):
+    """Options for the agent"""
+    max_failures: int = 3
+    retry_delay: int = 10
+    save_history: bool = True
+    history_path: Optional[str] = None
+    max_actions_per_step: int = 10
+    validate_output: bool = False
+    message_context: Optional[str] = None
+
+
+class PolicyMetadata(BaseModel):
+    """Metadata for a single step including timing information"""
+    start_time: float
+    end_time: float
+    number: int
+    input_tokens: int
+
+    @property
+    def duration_seconds(self) -> float:
+        """Calculate step duration in seconds"""
+        return self.end_time - self.start_time
+
+
+class AgentBrain(BaseModel):
+    """Current state of the agent"""
+    evaluation_previous_goal: str
+    memory: str
+    next_goal: str
+
+
+class AgentHistory(BaseModel):
+    """History item for agent actions"""
+    model_output: Optional[BaseModel] = None
+    result: List[ActionResult]
+    metadata: Optional[PolicyMetadata] = None
+    content: Optional[str] = None
+    base64_img: Optional[str] = None
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def model_dump(self, **kwargs) -> Dict[str, Any]:
+        """Custom serialization handling"""
+        return {
+            'model_output': self.model_output.model_dump() if self.model_output else None,
+            'result': [r.model_dump(exclude_none=True) for r in self.result],
+            'metadata': self.metadata.model_dump() if self.metadata else None,
+            'content': self.content,
+            'base64_img': self.base64_img
+        }
+
+
+class AgentHistoryList(BaseModel):
+    """List of agent history items"""
+    history: List[AgentHistory]
+
+    def total_duration_seconds(self) -> float:
+        """Get total duration of all steps in seconds"""
+        total = 0.0
+        for h in self.history:
+            if h.metadata:
+                total += h.metadata.duration_seconds
+        return total
+
+    def save_to_file(self, filepath: str | Path) -> None:
+        """Save history to JSON file with proper serialization"""
+        try:
+            Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+            data = self.model_dump()
+            with open(filepath, 'w', encoding='utf-8') as f:
+                json.dump(data, f, indent=2)
+        except Exception as e:
+            raise e
+
+    def model_dump(self, **kwargs) -> Dict[str, Any]:
+        """Custom serialization that properly uses AgentHistory's model_dump"""
+        return {
+            'history': [h.model_dump(**kwargs) for h in self.history],
+        }
+
+    @classmethod
+    def load_from_file(cls, filepath: str | Path) -> 'AgentHistoryList':
+        """Load history from JSON file"""
+        with open(filepath, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return cls.model_validate(data)
+
+
+class AgentError:
+    """Container for agent error handling"""
+    VALIDATION_ERROR = 'Invalid model output format. Please follow the correct schema.'
+    RATE_LIMIT_ERROR = 'Rate limit reached. Waiting before retry.'
+    NO_VALID_ACTION = 'No valid action found'
+
+    @staticmethod
+    def format_error(error: Exception, include_trace: bool = False) -> str:
+        """Format error message based on error type and optionally include trace"""
+        if isinstance(error, RateLimitError):
+            return AgentError.RATE_LIMIT_ERROR
+        if include_trace:
+            return f'{str(error)}\nStacktrace:\n{traceback.format_exc()}'
+        return f'{str(error)}'
+
+
+class AgentState(BaseModel):
+    """Holds all state information for an Agent"""
+
+    agent_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    n_steps: int = 1
+    consecutive_failures: int = 0
+    last_result: Optional[List['ActionResult']] = None
+    history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[]))
+    last_plan: Optional[str] = None
+    paused: bool = False
+    stopped: bool = False
+    message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)
+
+
+@dataclass
+class AgentStepInfo:
+    number: int
+    max_steps: int
+
+    def is_last_step(self) -> bool:
+        """Check if this is the last step"""
+        return self.number >= self.max_steps - 1
+
+
+@dataclass
+class Trajectory:
+    """Stores the agent's history, including all observations, info, and AgentResults."""
+    history: List[tuple[Observation, Dict[str, Any], AgentResult]] = field(default_factory=list)
+
+    def add_step(self, observation: Observation, info: Dict[str, Any], agent_result: AgentResult):
+        """Add a step to the history"""
+        self.history.append((observation, info, agent_result))
+
+    def get_history(self) -> List[tuple[Observation, Dict[str, Any], AgentResult]]:
+        """Retrieve the complete history"""
+        return self.history
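A minimal usage sketch for the history helpers above, assuming the `aworld` package and this `examples` tree are importable; the output path is just an example:

import time
from examples.android.utils import (
    ActionResult, AgentHistory, AgentHistoryList, PolicyMetadata,
)

start = time.time()
step = AgentHistory(
    result=[ActionResult(success=True)],
    metadata=PolicyMetadata(start_time=start, end_time=time.time(), number=1, input_tokens=1),
    content="<hierarchy/>",  # placeholder UI XML
    base64_img=None,
)

history = AgentHistoryList(history=[step])
print(f"total duration: {history.total_duration_seconds():.3f}s")

# Round-trip through the custom JSON serialization.
history.save_to_file("/tmp/android_agent_history.json")
restored = AgentHistoryList.load_from_file("/tmp/android_agent_history.json")
assert len(restored.history) == 1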