Duibonduil committed on
Commit
a27d8ba
·
verified ·
1 Parent(s): 4735d1f

Upload 8 files

Browse files
examples/browsers/README.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Browser Agents
2
+
3
+ Agents specialized in web browser automation.
4
 + The browser agent implementation is derived from [browser use](https://github.com/browser-use/browser-use), with substantial modifications to integrate it into our own framework.
examples/browsers/agent.py ADDED
@@ -0,0 +1,561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+ # Copyright (c) 2025 inclusionAI.
3
+
4
+ import re
5
+ import time
6
+ import traceback
7
+ import json
8
+ from typing import Dict, Any, Optional, List, Union, Tuple
9
+ from dataclasses import dataclass, field
10
+
11
+ from langchain_core.messages import HumanMessage, BaseMessage, AIMessage, ToolMessage
12
+ from pydantic import ValidationError
13
+
14
+ from aworld.core.agent.base import AgentFactory, AgentResult
15
+ from aworld.agents.llm_agent import Agent
16
+ from examples.browsers.prompts import SystemPrompt
17
+ from examples.browsers.utils import convert_input_messages, extract_json_from_model_output, estimate_messages_tokens
18
+ from examples.browsers.common import AgentState, AgentStepInfo, AgentHistory, PolicyMetadata, AgentBrain
19
+ from aworld.config.conf import AgentConfig, ConfigDict
20
+ from aworld.core.common import Observation, ActionModel, ToolActionInfo, ActionResult
21
+ from aworld.logs.util import logger
22
+ from examples.browsers.prompts import AgentMessagePrompt
23
+ from examples.tools.tool_action import BrowserAction
24
+
25
+
26
@dataclass
class Trajectory:
    """Stores the agent's step-by-step run history.

    Each step is a tuple of (input messages, observation, info dict,
    raw LLM output message, parsed AgentResult).
    """
    history: List[tuple[List[BaseMessage], Observation, Dict[str, Any], AIMessage, AgentResult]] = field(
        default_factory=list)

    def add_step(self, input_messages: List[BaseMessage], observation: Observation, info: Dict[str, Any],
                 output_message: AIMessage, agent_result: AgentResult):
        """Append one step tuple to the history."""
        step = (input_messages, observation, info, output_message, agent_result)
        self.history.append(step)

    def get_history(self) -> List[tuple[List[BaseMessage], Observation, Dict[str, Any], AIMessage, AgentResult]]:
        """Return the full recorded history."""
        return self.history

    def save_history(self, file_path: str):
        """Dump each step's LLM input/output to *file_path* as a JSON array."""
        records = []
        for input_messages, _observation, _info, output_message, _agent_result in self.get_history():
            llm_input = [
                {"type": message.type, "content": message.content}
                for message in input_messages
            ]
            records.append({"llm_input": llm_input, "llm_output": output_message.content})
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=4)
50
+
51
+
52
+ @AgentFactory.register(name='browser_agent', desc="browser agent")
53
+ class BrowserAgent(Agent):
54
+ def __init__(self, conf: Union[Dict[str, Any], ConfigDict, AgentConfig], **kwargs):
55
+ super(BrowserAgent, self).__init__(conf, **kwargs)
56
+ self.state = AgentState()
57
+ self.settings = self.conf
58
+ provider = self.conf.llm_config.llm_provider if self.conf.llm_config.llm_provider else self.conf.llm_provider
59
+ if self.conf.llm_config.llm_provider:
60
+ self.conf.llm_config.llm_provider = "chat" + provider
61
+ else:
62
+ self.conf.llm_provider = "chat" + provider
63
+
64
+ self.save_file_path = self.conf.save_file_path
65
+ self.available_actions = self._build_action_prompt()
66
+ # Note: Removed _message_manager initialization as it's no longer used
67
+ # Initialize trajectory
68
+ self.trajectory = Trajectory()
69
+ self._init = False
70
+
71
+ def reset(self, options: Dict[str, Any]):
72
+ super(BrowserAgent, self).reset(options)
73
+
74
+ # Reset trajectory
75
+ self.trajectory = Trajectory()
76
+
77
+ # Note: Removed _message_manager initialization as it's no longer used
78
+ # _estimate_tokens_for_messages method now directly uses functions from utils.py
79
+
80
+ self._init = True
81
+
82
+ def _build_action_prompt(self) -> str:
83
+ def _prompt(info: ToolActionInfo) -> str:
84
+ s = f'{info.desc}: \n'
85
+ s += '{' + str(info.name) + ': '
86
+ if info.input_params:
87
+ s += str({k: {"title": k, "type": v.type} for k, v in info.input_params.items()})
88
+ s += '}'
89
+ return s
90
+
91
+ val = "\n".join([_prompt(v.value) for k, v in BrowserAction.__members__.items()])
92
+ return val
93
+
94
+ def _log_message_sequence(self, input_messages: List[BaseMessage]) -> None:
95
+ """Log the sequence of messages for debugging purposes"""
96
+ logger.info(f"[agent] 🔍 Invoking LLM with {len(input_messages)} messages")
97
+ logger.info("[agent] 📝 Messages sequence:")
98
+ for i, msg in enumerate(input_messages):
99
+ prefix = msg.type
100
+ logger.info(f"[agent] Message {i + 1}: {prefix} ===================================")
101
+ if isinstance(msg.content, list):
102
+ for item in msg.content:
103
+ if item.get('type') == 'text':
104
+ logger.info(f"[agent] Text content: {item.get('text')}")
105
+ elif item.get('type') == 'image_url':
106
+ # Only print the first 30 characters of image URL to avoid printing entire base64
107
+ image_url = item.get('image_url', {}).get('url', '')
108
+ if image_url.startswith('data:image'):
109
+ logger.info(f"[agent] Image: [Base64 image data]")
110
+ else:
111
+ logger.info(f"[agent] Image URL: {image_url[:30]}...")
112
+ else:
113
+ content = str(msg.content)
114
+ chunk_size = 500
115
+ for j in range(0, len(content), chunk_size):
116
+ chunk = content[j:j + chunk_size]
117
+ if j == 0:
118
+ logger.info(f"[agent] Content: {chunk}")
119
+ else:
120
+ logger.info(f"[agent] Content (continued): {chunk}")
121
+
122
+ if isinstance(msg, AIMessage) and hasattr(msg, 'tool_calls') and msg.tool_calls:
123
+ for tool_call in msg.tool_calls:
124
+ logger.info(f"[agent] Tool call: {tool_call.get('name')} - ID: {tool_call.get('id')}")
125
+ args = str(tool_call.get('args', {}))[:1000]
126
+ logger.info(f"[agent] Tool args: {args}...")
127
+
128
+ def save_process(self, file_path: str):
129
+ self.trajectory.save_history(file_path)
130
+
131
+ def policy(self,
132
+ observation: Observation,
133
+ info: Dict[str, Any] = None, **kwargs) -> Union[List[ActionModel], None]:
134
+ start_time = time.time()
135
+
136
+ if self._init is False:
137
+ self.reset({"task": observation.content})
138
+
139
+ self._finished = False
140
+ # Save current observation to state for message construction
141
+ self.state.last_result = observation.action_result
142
+
143
+ if self.conf.max_steps <= self.state.n_steps:
144
+ logger.info('Last step finishing up')
145
+
146
+ logger.info(f'[agent] step {self.state.n_steps}')
147
+
148
+ # Use the new method to build messages, passing the current observation
149
+ input_messages = self.build_messages_from_trajectory_and_observation(observation=observation)
150
+
151
+ # Note: Special message addition has been moved to build_messages_from_trajectory_and_observation
152
+
153
+ # Estimate token count
154
+ tokens = self._estimate_tokens_for_messages(input_messages)
155
+
156
+ llm_result = None
157
+ output_message = None
158
+ try:
159
+ # Log the message sequence
160
+ self._log_message_sequence(input_messages)
161
+
162
+ output_message, llm_result = self._do_policy(input_messages)
163
+
164
+ if not llm_result:
165
+ logger.error("[agent] ❌ Failed to parse LLM response")
166
+ return [ActionModel(tool_name=Tools.BROWSER.value, action_name="stop")]
167
+
168
+ self.state.n_steps += 1
169
+
170
+ # No longer need to remove the last state message
171
+ # self._message_manager._remove_last_state_message()
172
+
173
+ if self.state.stopped or self.state.paused:
174
+ logger.info('Browser gent paused after getting state')
175
+ return [ActionModel(tool_name=Tools.BROWSER.value, action_name="stop")]
176
+
177
+ tool_action = llm_result.actions
178
+
179
+ # Add the current step to the trajectory
180
+ self.trajectory.add_step(input_messages, observation, info, output_message, llm_result)
181
+
182
+ except Exception as e:
183
+ logger.warning(traceback.format_exc())
184
+ # No longer need to remove the last state message
185
+ # self._message_manager._remove_last_state_message()
186
+ logger.error(f"[agent] ❌ Error parsing LLM response: {str(e)}")
187
+
188
+ # Create an AgentResult object with an empty actions list
189
+ error_result = AgentResult(
190
+ current_state=AgentBrain(
191
+ evaluation_previous_goal="Failed due to error",
192
+ memory=f"Error occurred: {str(e)}",
193
+ thought="Recover from error",
194
+ next_goal="Recover from error"
195
+ ),
196
+ actions=[] # Empty actions list
197
+ )
198
+
199
+ # Add the error state to the trajectory
200
+ self.trajectory.add_step(input_messages, observation, info, output_message, error_result)
201
+
202
+ raise RuntimeError("Browser agent encountered exception while making the policy.", e)
203
+ finally:
204
+ if llm_result:
205
+ # Only keep the history_item creation part
206
+ metadata = PolicyMetadata(
207
+ number=self.state.n_steps,
208
+ start_time=start_time,
209
+ end_time=time.time(),
210
+ input_tokens=tokens,
211
+ )
212
+ self._make_history_item(llm_result, observation, observation.action_result, metadata)
213
+ else:
214
+ logger.warning("no result to record!")
215
+
216
+ return tool_action
217
+
218
+ def _do_policy(self, input_messages: list[BaseMessage]) -> Tuple[AIMessage, AgentResult]:
219
+ THINK_TAGS = re.compile(r'<think>.*?</think>', re.DOTALL)
220
+
221
+ def _remove_think_tags(text: str) -> str:
222
+ """Remove think tags from text"""
223
+ return re.sub(THINK_TAGS, '', text)
224
+
225
+ input_messages = self._convert_input_messages(input_messages)
226
+ output_message = None
227
+ try:
228
+
229
+ output_message = self.llm.invoke(input_messages)
230
+
231
+ if not output_message or not output_message.content:
232
+ logger.warning("[agent] LLM returned empty response")
233
+ return output_message, AgentResult(
234
+ current_state=AgentBrain(evaluation_previous_goal="", memory="", thought="", next_goal=""),
235
+ actions=[ActionModel(agent_name=self.id(), tool_name='browser', action_name="stop")])
236
+ except:
237
+ logger.error(f"[agent] Response content: {output_message}")
238
+ raise RuntimeError('call llm fail, please check llm conf and network.')
239
+
240
+ if self.model_name == 'deepseek-reasoner':
241
+ output_message.content = _remove_think_tags(output_message.content)
242
+ try:
243
+ # Get max retries from config
244
+ max_retries = self.settings.get('max_llm_json_retries', 3)
245
+ retry_count = 0
246
+ json_parse_error = None
247
+
248
+ while retry_count < max_retries:
249
+ try:
250
+ parsed_json = extract_json_from_model_output(output_message.content)
251
+ # If parsing succeeds, break out of the retry loop
252
+ json_parse_error = None
253
+ break
254
+ except ValueError as e:
255
+ # Store the error and retry
256
+ json_parse_error = e
257
+ retry_count += 1
258
+ logger.warning(f"[agent] Failed to parse JSON (attempt {retry_count}/{max_retries}): {str(e)}")
259
+
260
+ if retry_count < max_retries:
261
+ # Add a reminder message about JSON format with specific structure guidance
262
+ format_reminder = HumanMessage(
263
+ content="Your responses must be always JSON with the specified format. Make sure your response includes a 'current_state' object with 'evaluation_previous_goal', 'memory', and 'next_goal' fields, and an 'action' array with the actions to perform. Do not include any explanatory text, only return the raw JSON.")
264
+ retry_messages = input_messages.copy()
265
+ retry_messages.append(format_reminder)
266
+
267
+ # Retry with the updated messages
268
+ logger.info(
269
+ f"[agent] Retrying LLM invocation ({retry_count}/{max_retries}) with format reminder")
270
+ output_message = self.llm.invoke(retry_messages)
271
+
272
+ # Check for empty response during retry
273
+ if not output_message or not output_message.content:
274
+ logger.warning(
275
+ f"[agent] LLM returned empty response on retry attempt {retry_count}/{max_retries}")
276
+ # Continue to next retry instead of immediately returning
277
+ continue
278
+
279
+ if self.model_name == 'deepseek-reasoner':
280
+ output_message.content = _remove_think_tags(output_message.content)
281
+
282
+ # If all retries failed, raise the last error
283
+ if json_parse_error:
284
+ logger.error(f"[agent] ❌ All {max_retries} attempts to parse JSON failed")
285
+ raise json_parse_error
286
+
287
+ logger.info((f"llm response: {parsed_json}"))
288
+ try:
289
+ agent_brain = AgentBrain(**parsed_json['current_state'])
290
+ except:
291
+ agent_brain = None
292
+ actions = parsed_json.get('action')
293
+ result = []
294
+ if not actions:
295
+ actions = parsed_json.get("actions")
296
+ if not actions:
297
+ logger.warning("agent not policy an action.")
298
+ self._finished = True
299
+ return output_message, AgentResult(current_state=agent_brain,
300
+ actions=[ActionModel(tool_name='browser',
301
+ agent_name=self.id(),
302
+ action_name="done")])
303
+
304
+ for action in actions:
305
+ if "action_name" in action:
306
+ action_name = action['action_name']
307
+ browser_action = BrowserAction.get_value_by_name(action_name)
308
+ if not browser_action:
309
+ logger.warning(f"Unsupported action: {action_name}")
310
+ if action_name == "done":
311
+ self._finished = True
312
+ action_model = ActionModel(agent_name=self.id(),
313
+ tool_name='browser',
314
+ action_name=action_name,
315
+ params=action.get('params', {}))
316
+ result.append(action_model)
317
+ else:
318
+ for k, v in action.items():
319
+ browser_action = BrowserAction.get_value_by_name(k)
320
+ if not browser_action:
321
+ logger.warning(f"Unsupported action: {k}")
322
+
323
+ action_model = ActionModel(agent_name=self.id(), tool_name='browser', action_name=k, params=v)
324
+ result.append(action_model)
325
+ if k == "done":
326
+ self._finished = True
327
+ return output_message, AgentResult(current_state=agent_brain, actions=result)
328
+ except (ValueError, ValidationError) as e:
329
+ logger.warning(f'Failed to parse model output: {output_message} {str(e)}')
330
+ raise ValueError('Could not parse response.')
331
+
332
+ def _convert_input_messages(self, input_messages: list[BaseMessage]) -> list[BaseMessage]:
333
+ """Convert input messages to the correct format"""
334
+ if self.model_name == 'deepseek-reasoner' or self.model_name.startswith('deepseek-r1'):
335
+ return convert_input_messages(input_messages, self.model_name)
336
+ else:
337
+ return input_messages
338
+
339
+ def _make_history_item(self,
340
+ model_output: AgentResult | None,
341
+ state: Observation,
342
+ result: list[ActionResult],
343
+ metadata: Optional[PolicyMetadata] = None) -> None:
344
+ content = ""
345
+ if hasattr(state, 'dom_tree') and state.dom_tree is not None:
346
+ if hasattr(state.dom_tree, 'element_tree'):
347
+ content = state.dom_tree.element_tree.__repr__()
348
+ else:
349
+ content = str(state.dom_tree)
350
+
351
+ history_item = AgentHistory(model_output=model_output,
352
+ result=state.action_result,
353
+ metadata=metadata,
354
+ content=content,
355
+ base64_img=state.image if hasattr(state, 'image') else None)
356
+
357
+ self.state.history.history.append(history_item)
358
+
359
+ def _process_action_result(self, action_result, messages, tool_call=None):
360
+ """Helper method to process an action result and add appropriate messages"""
361
+ if action_result.content is not None:
362
+ messages.append(HumanMessage(content='Action result: ' + action_result.content))
363
+ elif action_result.error is not None:
364
+ # Assemble error message when error information exists
365
+ messages.append(HumanMessage(content='Action result: ' + action_result.error))
366
+ if tool_call is not None:
367
+ logger.warning(f"Action {tool_call} failed: {action_result.error}")
368
+ else:
369
+ logger.warning(f"Action failed: {action_result.error}")
370
+ # If there is an error but success is true, log the error and terminate the program as the result is invalid
371
+ if action_result.success is True:
372
+ error_msg = f"Invalid result: success=True but error message exists: {action_result.error}"
373
+ logger.error(error_msg)
374
+ raise ValueError(error_msg)
375
+ return action_result.error is not None
376
+
377
    def build_messages_from_trajectory_and_observation(self, observation: Optional[Observation] = None) -> List[
        BaseMessage]:
        """
        Build complete message history from trajectory and current observation.

        Message layout: system prompt, optional action context, task statement,
        a hard-coded example tool call, the replayed trajectory (action results
        followed by the agent's tool-call responses), and finally the current
        observation (plus a "last step" reminder once max_steps is reached).

        Args:
            observation: Current observation object, if None current observation won't be added

        Returns:
            The assembled list of messages to send to the LLM.
        """
        messages = []
        # Add system message
        system_message = SystemPrompt(
            max_actions_per_step=self.settings.get('max_actions_per_step')
        ).get_system_message()
        if isinstance(system_message, tuple):
            system_message = system_message[0]
        messages.append(system_message)

        tool_calling_method = self.settings.get("tool_calling_method")
        llm_provider = self.conf.llm_provider if self.conf.llm_provider else self.conf.llm_config.llm_provider

        # 'raw' tool calling (or 'auto' with deepseek reasoning models) has no
        # native function-calling support, so the available actions are inlined
        # into the prompt text instead.
        if tool_calling_method == 'raw' or (tool_calling_method == 'auto' and (
                llm_provider == 'deepseek-reasoner' or llm_provider.startswith('deepseek-r1'))):
            message_context = f'\n\nAvailable actions: {self.available_actions}'
        else:
            message_context = None

        # Add task context (if any)
        if message_context:
            context_message = HumanMessage(content='Context for the task' + message_context)
            messages.append(context_message)

        # Add task message
        task_message = HumanMessage(
            content=f'Your ultimate task is: """{self.task}""". If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task. If not, continue as usual.'
        )
        messages.append(task_message)

        # Add example output
        placeholder_message = HumanMessage(content='Example output:')
        messages.append(placeholder_message)

        # Add example tool call showing the expected AgentOutput structure.
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': {
                    'current_state': {
                        'evaluation_previous_goal': 'Success - I opend the first page',
                        'memory': 'Starting with the new task. I have completed 1/10 steps',
                        'thought': 'From the current page I can get information about all the companies.',
                        'next_goal': 'Click on company a',
                    },
                    'action': [{'click_element': {'index': 0}}],
                },
                'id': '1',
                'type': 'tool_call',
            }
        ]
        example_tool_call = AIMessage(
            content='',
            tool_calls=tool_calls,
        )
        messages.append(example_tool_call)

        # Add first tool message with "Browser started" content
        messages.append(ToolMessage(content='Browser started', tool_call_id='1'))

        # Add task history marker
        messages.append(HumanMessage(content='[Your task history memory starts here]'))

        # Add available file paths (if any)
        if self.settings.get('available_file_paths'):
            filepaths_msg = HumanMessage(
                content=f'Here are file paths you can use: {self.settings.get("available_file_paths")}')
            messages.append(filepaths_msg)
        previous_action_entries = []
        # Add messages from the history trajectory
        for input_msgs, obs, info, output_msg, llm_result in self.trajectory.get_history():
            # Check the previous step's actionResult
            has_error = False
            if obs.action_result is not None:
                # The previous action entries should match with action results
                if len(previous_action_entries) == 0:
                    # if previous_action_entries is empty, process action_result directly
                    logger.info(
                        f"History item with action_result count ({len(obs.action_result)}) with empty previous actions - skipping count check")
                elif len(previous_action_entries) == len(obs.action_result):
                    for i, one_action_result in enumerate(obs.action_result):
                        has_error = self._process_action_result(one_action_result, messages,
                                                                previous_action_entries[i]) or has_error
                else:
                    # If sizes don't match, this is a critical error
                    error_msg = f"Action results count ({len(obs.action_result)}) doesn't match action entries count ({len(previous_action_entries)})"
                    logger.error(error_msg)
                    has_error = True
                    # raise ValueError(error_msg)

            # Add agent response
            if llm_result:
                # Create AI message; serialize the parsed result back into the
                # 'action' array shape the model originally produced.
                output_data = llm_result.model_dump(mode='json', exclude_unset=True)
                action_entries = [{action.action_name: action.params} for action in llm_result.actions]
                output_data["action"] = action_entries
                if "actions" in output_data:
                    del output_data["actions"]

                # Calculate tool_id based on trajectory history. If no actions yet, start with ID 1
                tool_id = 1 if len(self.trajectory.get_history()) == 0 else len(self.trajectory.get_history()) + 1
                tool_calls = [
                    {
                        'name': 'AgentOutput',
                        'args': output_data,
                        'id': str(tool_id),
                        'type': 'tool_call',
                    }
                ]
                previous_action_entries = action_entries
                ai_message = AIMessage(
                    content='',
                    tool_calls=tool_calls,
                )
                messages.append(ai_message)

                # Add empty tool message after each AIMessage
                messages.append(ToolMessage(content='', tool_call_id=str(tool_id)))

        # Add current observation - using the passed observation parameter instead of self.state.current_observation
        if observation:
            # Check if the current observation has an action_result with error
            has_error = False
            if hasattr(observation, 'action_result') and observation.action_result is not None:
                # Match action results with previous actions
                if len(previous_action_entries) == 0:
                    # if previous_action_entries is empty, process action_result directly
                    logger.info(
                        f"Current observation with action_result count ({len(observation.action_result)}) with empty previous actions - skipping count check")
                elif len(previous_action_entries) == len(observation.action_result):
                    for i, one_action_result in enumerate(observation.action_result):
                        has_error = self._process_action_result(one_action_result, messages,
                                                                previous_action_entries[i]) or has_error
                else:
                    # If sizes don't match, this is a critical error
                    error_msg = f"Action results count ({len(observation.action_result)}) doesn't match action entries count ({len(previous_action_entries)})"
                    logger.error(error_msg)
                    has_error = True

            # If there's an error, append observation content outside the loop
            if has_error and observation.content:
                messages.append(HumanMessage(content=observation.content))
            # If no error, process the observation normally
            elif not has_error:
                step_info = AgentStepInfo(number=self.state.n_steps, max_steps=self.conf.max_steps)
                if hasattr(observation, 'dom_tree') and observation.dom_tree:
                    state_message = AgentMessagePrompt(
                        observation,
                        self.state.last_result,
                        include_attributes=self.settings.get('include_attributes'),
                        step_info=step_info,
                    ).get_user_message(self.settings.get('use_vision'))
                    messages.append(state_message)
                elif observation.content:
                    messages.append(HumanMessage(content=observation.content))

        # Add special message for the last step
        # Note: Moved here from policy method to centralize all message building logic
        if self.conf.max_steps <= self.state.n_steps:
            last_step_message = f"""
               Now comes your last step. Use only the "done" action now. No other actions - so here your action sequence must have length 1.
               \nIf the task is not yet fully finished as requested by the user, set success in "done" to false! E.g. if not all steps are fully completed.
               \nIf the task is fully finished, set success in "done" to true.
               \nInclude everything you found out for the ultimate task in the done text.
           """
            messages.append(HumanMessage(content=[{'type': 'text', 'text': last_step_message}]))

        return messages
552
+
553
+ def _estimate_tokens_for_messages(self, messages: List[BaseMessage]) -> int:
554
+ """Roughly estimate token count for message list"""
555
+ # Note: Using estimate_messages_tokens function from utils.py instead of calling _message_manager
556
+ # This decouples the dependency on MessageManager
557
+ return estimate_messages_tokens(
558
+ messages,
559
+ image_tokens=self.settings.get('image_tokens', 800),
560
+ estimated_characters_per_token=self.settings.get('estimated_characters_per_token', 3)
561
+ )
examples/browsers/common.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+
3
+ import json
4
+ import traceback
5
+ import uuid
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Optional, Dict, List
9
+
10
+ from openai import RateLimitError
11
+ from pydantic import BaseModel, ConfigDict, Field
12
+
13
+ from aworld.core.common import ActionResult
14
+
15
+
16
class PolicyMetadata(BaseModel):
    """Metadata for a single step including timing information"""
    start_time: float  # wall-clock timestamp (seconds) when the step began
    end_time: float  # wall-clock timestamp (seconds) when the step finished
    number: int  # step index within the run
    input_tokens: int  # estimated tokens sent to the LLM for this step

    @property
    def duration_seconds(self) -> float:
        """Calculate step duration in seconds"""
        return self.end_time - self.start_time
27
+
28
+
29
class AgentBrain(BaseModel):
    """Current reasoning state of the agent as reported by the LLM."""
    # IMPROVED: fields were annotated as plain `str` while defaulting to None;
    # annotate them Optional so the type matches the default and None values
    # validate when assigned.
    evaluation_previous_goal: Optional[str] = None  # self-assessment of the previous goal
    memory: Optional[str] = None  # running memory / progress notes
    thought: Optional[str] = None  # free-form reasoning for this step
    next_goal: Optional[str] = None  # goal for the next step
35
+
36
+
37
class AgentHistory(BaseModel):
    """History item for agent actions"""
    model_output: Optional[BaseModel] = None  # parsed AgentResult for the step, if any
    result: List[ActionResult]  # execution results of the step's actions
    metadata: Optional[PolicyMetadata] = None  # timing/token metadata
    content: Optional[str] = None  # textual snapshot of the page state
    base64_img: Optional[str] = None  # screenshot, base64-encoded

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        """Custom serialization handling"""
        return {
            'model_output': self.model_output.model_dump() if self.model_output else None,
            'result': [r.model_dump(exclude_none=True) for r in self.result],
            'metadata': self.metadata.model_dump() if self.metadata else None,
            # BUGFIX: was `self.xml_content`, an attribute that does not exist on
            # this model (AttributeError on every dump); the field is `content`.
            'content': self.content,
            'base64_img': self.base64_img
        }
56
+
57
+
58
class AgentHistoryList(BaseModel):
    """List of agent history items"""
    history: List[AgentHistory]

    def total_duration_seconds(self) -> float:
        """Get total duration of all steps in seconds"""
        total = 0.0
        for h in self.history:
            if h.metadata:
                total += h.metadata.duration_seconds
        return total

    def save_to_file(self, filepath: str | Path) -> None:
        """Save history to JSON file with proper serialization.

        Creates parent directories as needed; any I/O or serialization error
        propagates to the caller.
        """
        # IMPROVED: removed a `try: ... except Exception as e: raise e` wrapper
        # that handled nothing and only re-raised with a longer traceback.
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        data = self.model_dump()
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        """Custom serialization that properly uses AgentHistory's model_dump"""
        return {
            'history': [h.model_dump(**kwargs) for h in self.history],
        }

    @classmethod
    def load_from_file(cls, filepath: str | Path) -> 'AgentHistoryList':
        """Load history from JSON file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls.model_validate(data)
92
+
93
+
94
class AgentError:
    """Container for agent error handling"""
    VALIDATION_ERROR = 'Invalid model output format. Please follow the correct schema.'
    RATE_LIMIT_ERROR = 'Rate limit reached. Waiting before retry.'
    NO_VALID_ACTION = 'No valid action found'

    @staticmethod
    def format_error(error: Exception, include_trace: bool = False) -> str:
        """Format error message based on error type and optionally include trace"""
        # Rate-limit errors map to a fixed message regardless of their details.
        if isinstance(error, RateLimitError):
            return AgentError.RATE_LIMIT_ERROR
        message = str(error)
        if include_trace:
            return f'{message}\nStacktrace:\n{traceback.format_exc()}'
        return f'{message}'
108
+
109
+
110
class AgentState(BaseModel):
    """Holds all state information for an Agent"""

    agent_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # unique id per agent instance
    n_steps: int = 1  # step counter, starts at 1
    consecutive_failures: int = 0  # failures in a row
    last_result: Optional[List['ActionResult']] = None  # action results of the most recent step
    history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[]))  # full step history
    last_plan: Optional[str] = None  # most recent plan text, if any
    paused: bool = False  # externally toggled pause flag
    stopped: bool = False  # externally toggled stop flag
121
+
122
+
123
@dataclass
class AgentStepInfo:
    """Position of the current step within the run."""
    number: int  # index of the current step
    max_steps: int  # total number of allowed steps

    def is_last_step(self) -> bool:
        """Check if this is the last step"""
        # Equivalent to `number >= max_steps - 1`.
        return self.max_steps - self.number <= 1
examples/browsers/config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+
3
+ from typing import Optional
4
+ from aworld.config.conf import AgentConfig
5
+ from typing import Literal
6
+
7
# Strategy for exposing tool/function calls to the LLM.
ToolCallingMethod = Literal['function_calling', 'json_mode', 'raw', 'auto']


class BrowserAgentConfig(AgentConfig):
    """Configuration for the browser agent, extending the base AgentConfig."""
    use_vision: bool = True  # feed page screenshots to the LLM
    use_vision_for_planner: bool = False  # also give screenshots to the planner
    save_conversation_path: Optional[str] = None  # where to dump conversations (None = off)
    save_conversation_path_encoding: Optional[str] = 'utf-8'
    max_failures: int = 3  # consecutive failures tolerated before giving up
    retry_delay: int = 10  # seconds to wait between retries
    validate_output: bool = False
    message_context: Optional[str] = None  # extra context prepended to the task
    generate_gif: bool | str = False  # True/False or an output path
    available_file_paths: Optional[list[str]] = None  # files the agent may reference
    override_system_message: Optional[str] = None  # replace the default system prompt
    extend_system_message: Optional[str] = None  # append to the default system prompt
    tool_calling_method: Optional[ToolCallingMethod] = 'auto'
    max_llm_json_retries: int = 3  # retries when the LLM reply is not valid JSON
    save_file_path: str = "browser_agent_history.json"  # default trajectory dump path
examples/browsers/prompts.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+
3
+ from datetime import datetime
4
+ from typing import List, Optional
5
+
6
+ from langchain_core.messages import HumanMessage, SystemMessage
7
+
8
+ from examples.browsers.common import AgentStepInfo
9
+ from aworld.core.common import Observation, ActionResult
10
+
11
# System-prompt template for the browser agent. `{max_actions}` is filled in by
# SystemPrompt; all literal braces are doubled because the template goes through
# str.format. NOTE: this text is sent to the LLM verbatim — do not reflow it.
PROMPT_TEMPLATE = """
You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.

# Input Format
Task
Previous steps
Current URL
Open Tabs
Interactive Elements
[index]<type>text</type>
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Example:
[33]<button>Submit Form</button>

- Only elements with numeric indexes in [] are interactive
- elements without [] provide only context

# Response Rules
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
"memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
"thought": "Your thought or reasoning based on the ultimate task and current observations",
"next_goal": "What needs to be done with the next immediate action"}},
"action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}

2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {max_actions} actions per sequence.
Common action sequences:
- Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
- Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
- Actions are executed in the given order
- If the page changes after an action, the sequence is interrupted and you get the new state.
- Only provide the action sequence until an action which changes the page state significantly.
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
- only use multiple actions if it makes sense.

3. ELEMENT INTERACTION:
- Only use indexes of the interactive elements
- Elements marked with "[]Non-interactive text" are non-interactive

4. NAVIGATION & ERROR HANDLING:
- If no suitable elements exist, use other functions to complete the task
- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
- Handle popups/cookies by accepting or closing them
- Use scroll to find elements you are looking for
- If you want to research something, open a new tab instead of using the current tab
- If captcha pops up, try to solve it - else try a different approach
- If the page is not fully loaded, use wait action

5. TASK COMPLETION:
- Use the done action as the last action as soon as the ultimate task is complete
- Dont use "done" before you are done with everything the user asked you, except you reach the last step of max_steps.
- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completly finished set success to true. If not everything the user asked for is completed set success in done to false!
- If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
- Don't hallucinate actions
- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.

6. VISUAL CONTEXT:
- When an image is provided, use it to understand the page layout
- Bounding boxes with labels on their top right corner correspond to element indexes

7. Form filling:
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.

8. Long tasks:
- Keep track of the status and subresults in the memory.

9. Extraction:
- If your task is to find information - call extract_content on the specific pages to get and store the information.
Your responses must be always JSON with the specified format.
"""
83
+
84
+
85
class SystemPrompt:
    """Builds the SystemMessage that steers the browser agent."""

    def __init__(self,
                 max_actions_per_step: int = 10,
                 override_system_message: Optional[str] = None,
                 extend_system_message: Optional[str] = None):
        """
        Args:
            max_actions_per_step: Cap substituted into the default template.
            override_system_message: If truthy, replaces the template entirely.
            extend_system_message: If truthy, appended on a new line at the end.
        """
        self.max_actions_per_step = max_actions_per_step
        # A caller-supplied override wins; otherwise render the default template.
        text = override_system_message or PROMPT_TEMPLATE.format(max_actions=self.max_actions_per_step)
        if extend_system_message:
            text = f'{text}\n{extend_system_message}'
        self.system_message = SystemMessage(content=text)

    def get_system_message(self) -> SystemMessage:
        """Return the prepared system prompt message."""
        return self.system_message
109
+
110
+
111
class AgentMessagePrompt:
    """Formats a browser Observation (plus the previous step's action results)
    into the HumanMessage sent to the LLM on each step."""

    def __init__(
            self,
            state: Observation,
            result: Optional[List[ActionResult]] = None,
            include_attributes: Optional[list[str]] = None,
            step_info: Optional[AgentStepInfo] = None,
    ):
        """
        Args:
            state: Current browser observation (DOM tree, info dict, screenshot).
            result: Results of the previous action batch, appended as text.
            include_attributes: DOM attributes to include when serializing
                clickable elements; None means no extra attributes.
            step_info: Progress counters shown to the model as "step x/y".
        """
        self.state = state
        self.result = result
        # Fix: the original used a mutable default argument (`= []`), which is
        # shared across calls; build a fresh list per instance instead.
        self.include_attributes = include_attributes if include_attributes is not None else []
        self.step_info = step_info

    def get_user_message(self, use_vision: bool = True) -> HumanMessage:
        """Render the observation as a HumanMessage.

        Attaches the screenshot as an image part when `use_vision` is True and
        the observation carries one; otherwise returns plain text.
        """
        elements_text = self.state.dom_tree.element_tree.clickable_elements_to_string(
            include_attributes=self.include_attributes)

        pixels_above = self.state.info.get('pixels_above', 0)
        pixels_below = self.state.info.get('pixels_below', 0)

        # Annotate the element list with scroll context so the model knows
        # whether content exists outside the viewport.
        if elements_text != '':
            if pixels_above > 0:
                elements_text = (
                    f'... {pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
                )
            else:
                elements_text = f'[Start of page]\n{elements_text}'
            if pixels_below > 0:
                elements_text = (
                    f'{elements_text}\n... {pixels_below} pixels below - scroll or extract content to see more ...'
                )
            else:
                elements_text = f'{elements_text}\n[End of page]'
        else:
            elements_text = 'empty page'

        if self.step_info:
            step_info_description = f'Current step: {self.step_info.number}/{self.step_info.max_steps}'
        else:
            step_info_description = ''
        time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
        step_info_description += f'Current date and time: {time_str}'

        state_description = f"""
[Task history memory ends]
[Current state starts here]
The following is one-time information - if you need to remember it write it to memory:
Current url: {self.state.info.get("url")}
Interactive elements from top layer of the current page inside the viewport:
{elements_text}
{step_info_description}
"""

        if self.result:
            for i, result in enumerate(self.result):
                if result.content:
                    state_description += f'\nAction result {i + 1}/{len(self.result)}: {result.content}'
                if result.error:
                    # only use last line of error
                    error = result.error.split('\n')[-1]
                    state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}'

        # Idiom fix: truthiness check instead of `use_vision == True`.
        if self.state.image and use_vision:
            # Format message for vision model
            return HumanMessage(
                content=[
                    {'type': 'text', 'text': state_description},
                    {
                        'type': 'image_url',
                        'image_url': {'url': f'data:image/png;base64,{self.state.image}'},  # , 'detail': 'low'
                    },
                ]
            )

        return HumanMessage(content=state_description)
186
+
187
+
188
class PlannerPrompt(SystemPrompt):
    # Planner variant: bypasses the template/override machinery inherited from
    # SystemPrompt and always returns this fixed planning-oriented prompt.
    def get_system_message(self) -> SystemMessage:
        """Return the fixed system prompt for the planning agent."""
        return SystemMessage(
            content="""You are a planning agent that helps break down tasks into smaller steps and reason about the current state.
Your role is to:
1. Analyze the current state and history
2. Evaluate progress towards the ultimate goal
3. Identify potential challenges or roadblocks
4. Suggest the next high-level steps to take

Inside your messages, there will be AI messages from different agents with different formats.

Your output format should be always a JSON object with the following fields:
{
"state_analysis": "Brief analysis of the current state and what has been done so far",
"progress_evaluation": "Evaluation of progress towards the ultimate goal (as percentage and description)",
"challenges": "List any potential challenges or roadblocks",
"next_steps": "List 2-3 concrete next steps to take",
"reasoning": "Explain your reasoning for the suggested next steps"
}

Ignore the other AI messages output structures.
don't forget the index param for input_text action.
Keep your responses concise and focused on actionable insights."""
        )
examples/browsers/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ langchain~=0.3.20
2
+ langchain-openai~=0.3.8
3
+ langchain-ollama~=0.2.3
4
+ langchain-anthropic~=0.3.9
5
+ langchain-mistralai~=0.2.7
6
+ langchain-google-genai~=2.1.0
examples/browsers/run.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# coding: utf-8
# Copyright (c) 2025 inclusionAI.

# Example entry point: runs a BrowserAgent on a three-step shopping/report task.
from aworld.config.conf import ModelConfig
from aworld.core.task import Task
from aworld.runner import Runners
from examples.browsers.agent import BrowserAgent
from examples.browsers.config import BrowserAgentConfig
from examples.tools.common import Agents, Tools
from examples.tools.conf import BrowserToolConfig

if __name__ == '__main__':
    # Shared LLM settings, used by both the agent and the browser tool.
    llm_config = ModelConfig(
        llm_provider="openai",
        llm_model_name="gpt-4o",
        llm_temperature=0.3,

    )
    # Visible (non-headless) 1280x720 browser, kept open across steps, async driver.
    browser_tool_config = BrowserToolConfig(width=1280,
                                            height=720,
                                            headless=False,
                                            keep_browser_open=True,
                                            use_async=True,
                                            llm_config=llm_config)
    agent_config = BrowserAgentConfig(
        name=Agents.BROWSER.value,
        tool_calling_method="raw",
        llm_config=llm_config,
        max_actions_per_step=10,
        max_input_tokens=128000,
        working_dir=".",
        # If the LLM model does not support vision, set `use_vision=False`:
        # use_vision=False
    )

    # Task-level limits passed through to the runner.
    task_config = {
        'max_steps': 100,
        'max_actions_per_step': 100
    }

    task = Task(
        input="""step1: first go to https://www.dangdang.com/ and search for 'the little prince' and rank by sales from high to low, get the first 5 results and put the products info in memory.
step 2: write each product's title, price, discount, and publisher information to a fully structured HTML document with write_to_file, ensuring that the data is presented in a table with visible grid lines.
step3: open the html file in browser by go_to_url""",
        agent=BrowserAgent(conf=agent_config, tool_names=[Tools.BROWSER.name]),
        tools_conf={Tools.BROWSER.value: browser_tool_config},
        conf=task_config
    )
    Runners.sync_run_task(task)
examples/browsers/utils.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+ import requests
3
+ import json
4
+ from io import BytesIO
5
+ import os
6
+ from typing import Any, Optional, Type
7
+ import base64
8
+
9
+ from langchain_core.messages import (
10
+ AIMessage,
11
+ BaseMessage,
12
+ HumanMessage,
13
+ SystemMessage,
14
+ ToolMessage,
15
+ )
16
+
17
+ from aworld.logs.util import logger
18
+
19
+
20
def extract_json_from_model_output(content: str) -> dict:
    """Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON.

    Args:
        content: Raw model text; either bare JSON or JSON inside a markdown
            fence, optionally tagged with a language id (e.g. ```json).

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If the (unwrapped) content is not valid JSON; the original
            JSONDecodeError is chained as the cause.
    """
    try:
        # If content is wrapped in code blocks, extract just the JSON part
        if '```' in content:
            # Find the JSON content between code blocks
            content = content.split('```')[1]
            # Remove language identifier if present (e.g., 'json\n')
            if '\n' in content:
                content = content.split('\n', 1)[1]
        # Parse the cleaned content
        return json.loads(content)
    except json.JSONDecodeError as e:
        logger.warning(f'Failed to parse model output: {content} {str(e)}')
        # Fix: chain the decode error explicitly so upstream handlers can see
        # the real cause instead of only the generic ValueError.
        raise ValueError('Could not parse response.') from e
35
+
36
+
37
def convert_input_messages(input_messages: list[BaseMessage], model_name: Optional[str]) -> list[BaseMessage]:
    """Convert input messages to a format compatible with the target model.

    deepseek reasoner models lack function calling and reject back-to-back
    messages of the same role, so their histories are rewritten and merged;
    every other model gets the list back unchanged.
    """
    if model_name is None:
        return input_messages
    is_deepseek_reasoning = model_name == 'deepseek-reasoner' or model_name.startswith('deepseek-r1')
    if not is_deepseek_reasoning:
        return input_messages
    rewritten = _convert_messages_for_non_function_calling_models(input_messages)
    rewritten = _merge_successive_messages(rewritten, HumanMessage)
    return _merge_successive_messages(rewritten, AIMessage)
47
+
48
+
49
def _convert_messages_for_non_function_calling_models(input_messages: list[BaseMessage]) -> list[BaseMessage]:
    """Rewrite messages so models without native tool calling can consume them.

    Tool messages become human messages, AI messages carrying tool calls are
    flattened to a JSON string of those calls, and everything else passes through.
    """
    converted: list[BaseMessage] = []
    for message in input_messages:
        if isinstance(message, (HumanMessage, SystemMessage)):
            converted.append(message)
        elif isinstance(message, ToolMessage):
            # The model sees tool output as ordinary user input.
            converted.append(HumanMessage(content=message.content))
        elif isinstance(message, AIMessage):
            # Serialize any pending tool calls into the message text.
            if message.tool_calls:
                converted.append(AIMessage(content=json.dumps(message.tool_calls)))
            else:
                converted.append(message)
        else:
            raise ValueError(f'Unknown message type: {type(message)}')
    return converted
69
+
70
+
71
def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]:
    """Collapse consecutive messages of one class into a single message.

    Models like deepseek-reasoner reject multiple human messages in a row, so
    the content of each follow-up message is appended to the first of the run.
    """
    merged: list[BaseMessage] = []
    run_length = 0
    for message in messages:
        if not isinstance(message, class_to_merge):
            merged.append(message)
            run_length = 0
            continue
        run_length += 1
        if run_length == 1:
            # First of a run: keep the message object itself.
            merged.append(message)
        elif isinstance(message.content, list):
            # Multimodal content: fold in only the first text part.
            merged[-1].content += message.content[0]['text']  # type:ignore
        else:
            merged[-1].content += message.content
    return merged
89
+
90
+
91
def save_conversation(input_messages: list[BaseMessage], response: Any, target: str,
                      encoding: Optional[str] = None) -> None:
    """Save conversation history to file.

    Args:
        input_messages: Messages sent to the model, written first.
        response: Model response; must expose `model_dump_json`.
        target: Destination file path; parent directories are created as needed.
        encoding: Text encoding for the file (None uses the platform default).
    """
    # create folders if not exists. Fix: os.path.dirname returns '' for a bare
    # filename and os.makedirs('') raises, so only create when a parent exists.
    parent_dir = os.path.dirname(target)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(
            target,
            'w',
            encoding=encoding,
    ) as f:
        _write_messages_to_file(f, input_messages)
        _write_response_to_file(f, response)
105
+
106
+
107
def _write_messages_to_file(f: Any, messages: list[BaseMessage]) -> None:
    """Write each message to the conversation file as a titled text section."""
    for message in messages:
        # Section header: the message class name (e.g. HumanMessage).
        f.write(f' {message.__class__.__name__} \n')

        body = message.content
        if isinstance(body, list):
            # Multimodal content: keep only the text parts.
            for part in body:
                if isinstance(part, dict) and part.get('type') == 'text':
                    f.write(part['text'].strip() + '\n')
        elif isinstance(body, str):
            # Pretty-print JSON payloads; fall back to the raw text.
            try:
                f.write(json.dumps(json.loads(body), indent=2) + '\n')
            except json.JSONDecodeError:
                f.write(body.strip() + '\n')

        f.write('\n')
124
+
125
+
126
def _write_response_to_file(f: Any, response: Any) -> None:
    """Write the model response to the conversation file as pretty-printed JSON."""
    f.write(' RESPONSE\n')
    # Round-trip through json so the output is indented rather than compact.
    payload = json.loads(response.model_dump_json(exclude_unset=True))
    f.write(json.dumps(payload, indent=2))
130
+
131
+
132
+ # Add token counting related functions
133
+ # Note: These functions have been moved from memory.py and agent.py to utils.py, removing the dependency on MessageManager class
134
+
135
def estimate_text_tokens(text: str, estimated_characters_per_token: int = 3) -> int:
    """Roughly estimate token count in text.

    Args:
        text: The text to estimate tokens for.
        estimated_characters_per_token: Estimated characters per token, default is 3.

    Returns:
        Estimated token count (0 for empty or falsy input).
    """
    # Integer division of the character count approximates the token count.
    return len(text) // estimated_characters_per_token if text else 0
149
+
150
+
151
def estimate_message_tokens(message: BaseMessage, image_tokens: int = 800,
                            estimated_characters_per_token: int = 3) -> int:
    """Roughly estimate token count for a single message.

    Args:
        message: The message to estimate tokens for.
        image_tokens: Estimated tokens per image, default is 800.
        estimated_characters_per_token: Estimated characters per token, default is 3.

    Returns:
        Estimated token count.
    """
    # Handle tuple case: stringify the whole tuple and estimate from that.
    if isinstance(message, tuple):
        return estimate_text_tokens(str(message), estimated_characters_per_token)

    content = message.content
    if isinstance(content, list):
        # Multimodal content: a flat charge per image, text estimated per part.
        total = 0
        for part in content:
            if 'image_url' in part:
                total += image_tokens
            elif isinstance(part, dict) and 'text' in part:
                total += estimate_text_tokens(part['text'], estimated_characters_per_token)
        return total

    # Plain content: fold serialized tool calls into the text when present.
    text = content
    if hasattr(message, 'tool_calls'):
        text += str(message.tool_calls)  # type: ignore
    return estimate_text_tokens(text, estimated_characters_per_token)
182
+
183
+
184
def estimate_messages_tokens(messages: list[BaseMessage], image_tokens: int = 800,
                             estimated_characters_per_token: int = 3) -> int:
    """Roughly estimate total token count for a list of messages.

    Args:
        messages: The list of messages to estimate tokens for.
        image_tokens: Estimated tokens per image, default is 800.
        estimated_characters_per_token: Estimated characters per token, default is 3.

    Returns:
        Estimated total token count.
    """
    # Sum the per-message estimates over the whole conversation.
    return sum(
        estimate_message_tokens(message, image_tokens, estimated_characters_per_token)
        for message in messages
    )