Add A3-Qwen3.5-9B WorkArena-L2 results (9.7%)
Browse files
Adding WorkArena++ L2 (test split, 185 tasks) evaluation results for A3-Qwen3.5-9B.
Score: 9.7% (standard error: ±2.2 percentage points)
Model not trained on ServiceNow data.
Follows standard GenericAgent + BrowserGym evaluation protocol.
results/A3-Qwen3.5-9B/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "A3-Qwen3.5-9B",
|
| 4 |
+
"study_id": "2026-03-14_12-38-08_genericagent-checkpoints-qwen-qwen3-5-9b-web-pro-low-8903051-checkpoint-latest-on-workarena-l2-test-test",
|
| 5 |
+
"date_time": "2026-03-14 12:38:08",
|
| 6 |
+
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 9.7,
|
| 8 |
+
"std_err": 2.2,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "185 tasks (test split). Model not trained on ServiceNow data.",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|