Denis Davydov
committed on
Commit
·
2e79a34
1
Parent(s):
a5c9e62
fix submitted_answer
Browse files- app.py +2 -2
- test_agent_format.py +0 -99
app.py
CHANGED
@@ -74,7 +74,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
74 |
submitted_answer, reasoning_trace = agent(question_text, task_id)
|
75 |
answers_payload.append({
|
76 |
"task_id": task_id,
|
77 |
-
"
|
78 |
"reasoning_trace": reasoning_trace
|
79 |
})
|
80 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
@@ -83,7 +83,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
83 |
error_answer = f"AGENT ERROR: {e}"
|
84 |
answers_payload.append({
|
85 |
"task_id": task_id,
|
86 |
-
"
|
87 |
"reasoning_trace": f"Error occurred: {str(e)}"
|
88 |
})
|
89 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})
|
|
|
74 |
submitted_answer, reasoning_trace = agent(question_text, task_id)
|
75 |
answers_payload.append({
|
76 |
"task_id": task_id,
|
77 |
+
"submitted_answer": submitted_answer,
|
78 |
"reasoning_trace": reasoning_trace
|
79 |
})
|
80 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
|
|
83 |
error_answer = f"AGENT ERROR: {e}"
|
84 |
answers_payload.append({
|
85 |
"task_id": task_id,
|
86 |
+
"submitted_answer": error_answer,
|
87 |
"reasoning_trace": f"Error occurred: {str(e)}"
|
88 |
})
|
89 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})
|
test_agent_format.py
DELETED
@@ -1,99 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
"""
|
3 |
-
Test script to verify the agent's answer formatting works correctly.
|
4 |
-
"""
|
5 |
-
|
6 |
-
import os
|
7 |
-
from agent import smart_agent
|
8 |
-
from utils import format_gaia_answer
|
9 |
-
|
10 |
-
def test_answer_formatting():
|
11 |
-
"""Test the answer formatting function with various inputs."""
|
12 |
-
|
13 |
-
test_cases = [
|
14 |
-
# Test case: (raw_answer, expected_format)
|
15 |
-
("I think the answer is 42. FINAL ANSWER: 42", "42"),
|
16 |
-
("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
|
17 |
-
("After research, FINAL ANSWER: New York", "New York"),
|
18 |
-
("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
|
19 |
-
("FINAL ANSWER: 1,234", "1234"), # Should remove commas from numbers
|
20 |
-
("FINAL ANSWER: \"Hello World\"", "Hello World"), # Should remove quotes
|
21 |
-
("FINAL ANSWER: approximately 100", "100"), # Should remove qualifiers
|
22 |
-
("No clear final answer format here", "No clear final answer format here"), # Fallback
|
23 |
-
]
|
24 |
-
|
25 |
-
print("🧪 Testing answer formatting...")
|
26 |
-
for i, (raw, expected) in enumerate(test_cases, 1):
|
27 |
-
result = format_gaia_answer(raw)
|
28 |
-
status = "✅" if result == expected else "❌"
|
29 |
-
print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
|
30 |
-
if result != expected:
|
31 |
-
print(f" ⚠️ Mismatch detected!")
|
32 |
-
|
33 |
-
print("\n" + "="*50)
|
34 |
-
|
35 |
-
def test_simple_question():
|
36 |
-
"""Test the agent with a simple question."""
|
37 |
-
print("🤖 Testing agent with a simple question...")
|
38 |
-
|
39 |
-
question = "What is 2 + 2?"
|
40 |
-
try:
|
41 |
-
answer, reasoning = smart_agent(question)
|
42 |
-
print(f"Question: {question}")
|
43 |
-
print(f"Answer: {answer}")
|
44 |
-
print(f"Reasoning length: {len(reasoning)} characters")
|
45 |
-
print(f"Raw reasoning preview: {reasoning[:200]}...")
|
46 |
-
|
47 |
-
# Check if answer follows expected format
|
48 |
-
if answer and answer.strip():
|
49 |
-
print("✅ Agent returned a non-empty answer")
|
50 |
-
else:
|
51 |
-
print("β Agent returned empty answer")
|
52 |
-
|
53 |
-
except Exception as e:
|
54 |
-
print(f"β Error testing agent: {e}")
|
55 |
-
|
56 |
-
print("\n" + "="*50)
|
57 |
-
|
58 |
-
def test_api_format():
|
59 |
-
"""Test that our submission format matches API expectations."""
|
60 |
-
print("📡 Testing API submission format...")
|
61 |
-
|
62 |
-
# Simulate what would be sent to the API
|
63 |
-
sample_submission = {
|
64 |
-
"task_id": "test_task_1",
|
65 |
-
"model_answer": "42",
|
66 |
-
"reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else..."
|
67 |
-
}
|
68 |
-
|
69 |
-
required_fields = ["task_id", "model_answer"]
|
70 |
-
optional_fields = ["reasoning_trace"]
|
71 |
-
|
72 |
-
print("Required fields check:")
|
73 |
-
for field in required_fields:
|
74 |
-
if field in sample_submission:
|
75 |
-
print(f"β
{field}: {sample_submission[field]}")
|
76 |
-
else:
|
77 |
-
print(f"β Missing required field: {field}")
|
78 |
-
|
79 |
-
print("Optional fields check:")
|
80 |
-
for field in optional_fields:
|
81 |
-
if field in sample_submission:
|
82 |
-
print(f"β
{field}: Present ({len(str(sample_submission[field]))} chars)")
|
83 |
-
else:
|
84 |
-
print(f"βΉοΈ Optional field not present: {field}")
|
85 |
-
|
86 |
-
if __name__ == "__main__":
|
87 |
-
print("π§ GAIA Agent Format Testing")
|
88 |
-
print("="*50)
|
89 |
-
|
90 |
-
# Test 1: Answer formatting
|
91 |
-
test_answer_formatting()
|
92 |
-
|
93 |
-
# Test 2: Simple agent question
|
94 |
-
test_simple_question()
|
95 |
-
|
96 |
-
# Test 3: API format
|
97 |
-
test_api_format()
|
98 |
-
|
99 |
-
print("π Testing complete!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|