Added readme, visualwebarena
Browse files- app.py +1 -1
- results/GenericAgent-Claude-3.5-Sonnet/README.md +3 -1
- results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +16 -0
- results/GenericAgent-GPT-4o-mini/visualwebarena.json +16 -0
- results/GenericAgent-GPT-4o/README.md +46 -1
- results/GenericAgent-GPT-4o/visualwebarena.json +16 -0
- results/GenericAgent-GPT-o1-mini/README.md +46 -1
- results/GenericAgent-Llama-3.1-405b/README.md +46 -1
- results/GenericAgent-Llama-3.1-70b/README.md +46 -1
app.py
CHANGED
|
@@ -17,7 +17,7 @@ import re
|
|
| 17 |
import html
|
| 18 |
from typing import Dict, Any
|
| 19 |
|
| 20 |
-
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "AssistantBench"]
|
| 21 |
|
| 22 |
def sanitize_agent_name(agent_name):
|
| 23 |
# Only allow alphanumeric chars, hyphen, underscore
|
|
|
|
| 17 |
import html
|
| 18 |
from typing import Dict, Any
|
| 19 |
|
| 20 |
+
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]
|
| 21 |
|
| 22 |
def sanitize_agent_name(agent_name):
|
| 23 |
# Only allow alphanumeric chars, hyphen, underscore
|
results/GenericAgent-Claude-3.5-Sonnet/README.md
CHANGED
|
@@ -41,4 +41,6 @@ BASE_FLAGS = GenericPromptFlags(
|
|
| 41 |
be_cautious=True,
|
| 42 |
extra_instructions=None,
|
| 43 |
)
|
| 44 |
-
```
|
|
|
|
|
|
|
|
|
| 41 |
be_cautious=True,
|
| 42 |
extra_instructions=None,
|
| 43 |
)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "VisualWebArena",
|
| 6 |
+
"score": 21.0,
|
| 7 |
+
"std_err": 1.3,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/GenericAgent-GPT-4o-mini/visualwebarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "VisualWebArena",
|
| 7 |
+
"score": 16.9,
|
| 8 |
+
"std_err": 1.2,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/GenericAgent-GPT-4o/README.md
CHANGED
|
@@ -1 +1,46 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-4o
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True, # gpt-4o config except for this line
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-GPT-4o/visualwebarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "VisualWebArena",
|
| 7 |
+
"score": 26.7,
|
| 8 |
+
"std_err": 1.5,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/GenericAgent-GPT-o1-mini/README.md
CHANGED
|
@@ -1 +1,46 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-o1-mini
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses o1-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True, # gpt-4o config except for this line
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-Llama-3.1-405b/README.md
CHANGED
|
@@ -1 +1,46 @@
|
|
| 1 |
-
### Llama-3.1-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-Llama-3.1-405b
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses Llama-3.1-405b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True, # gpt-4o config except for this line
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
results/GenericAgent-Llama-3.1-70b/README.md
CHANGED
|
@@ -1 +1,46 @@
|
|
| 1 |
-
### Llama-3.1-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-Llama-3.1-70b
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses Llama-3.1-70b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True, # gpt-4o config except for this line
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|