Denis Davydov committed on
Commit
2e79a34
·
1 Parent(s): a5c9e62

fix submitted_answer

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. test_agent_format.py +0 -99
app.py CHANGED
@@ -74,7 +74,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
74
  submitted_answer, reasoning_trace = agent(question_text, task_id)
75
  answers_payload.append({
76
  "task_id": task_id,
77
- "model_answer": submitted_answer,
78
  "reasoning_trace": reasoning_trace
79
  })
80
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
@@ -83,7 +83,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
83
  error_answer = f"AGENT ERROR: {e}"
84
  answers_payload.append({
85
  "task_id": task_id,
86
- "model_answer": error_answer,
87
  "reasoning_trace": f"Error occurred: {str(e)}"
88
  })
89
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})
 
74
  submitted_answer, reasoning_trace = agent(question_text, task_id)
75
  answers_payload.append({
76
  "task_id": task_id,
77
+ "submitted_answer": submitted_answer,
78
  "reasoning_trace": reasoning_trace
79
  })
80
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
83
  error_answer = f"AGENT ERROR: {e}"
84
  answers_payload.append({
85
  "task_id": task_id,
86
+ "submitted_answer": error_answer,
87
  "reasoning_trace": f"Error occurred: {str(e)}"
88
  })
89
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})
test_agent_format.py DELETED
@@ -1,99 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Test script to verify the agent's answer formatting works correctly.
4
- """
5
-
6
- import os
7
- from agent import smart_agent
8
- from utils import format_gaia_answer
9
-
10
- def test_answer_formatting():
11
- """Test the answer formatting function with various inputs."""
12
-
13
- test_cases = [
14
- # Test case: (raw_answer, expected_format)
15
- ("I think the answer is 42. FINAL ANSWER: 42", "42"),
16
- ("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
17
- ("After research, FINAL ANSWER: New York", "New York"),
18
- ("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
19
- ("FINAL ANSWER: 1,234", "1234"), # Should remove commas from numbers
20
- ("FINAL ANSWER: \"Hello World\"", "Hello World"), # Should remove quotes
21
- ("FINAL ANSWER: approximately 100", "100"), # Should remove qualifiers
22
- ("No clear final answer format here", "No clear final answer format here"), # Fallback
23
- ]
24
-
25
- print("🧪 Testing answer formatting...")
26
- for i, (raw, expected) in enumerate(test_cases, 1):
27
- result = format_gaia_answer(raw)
28
- status = "✅" if result == expected else "❌"
29
- print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
30
- if result != expected:
31
- print(f" ⚠️ Mismatch detected!")
32
-
33
- print("\n" + "="*50)
34
-
35
- def test_simple_question():
36
- """Test the agent with a simple question."""
37
- print("🤖 Testing agent with a simple question...")
38
-
39
- question = "What is 2 + 2?"
40
- try:
41
- answer, reasoning = smart_agent(question)
42
- print(f"Question: {question}")
43
- print(f"Answer: {answer}")
44
- print(f"Reasoning length: {len(reasoning)} characters")
45
- print(f"Raw reasoning preview: {reasoning[:200]}...")
46
-
47
- # Check if answer follows expected format
48
- if answer and answer.strip():
49
- print("✅ Agent returned a non-empty answer")
50
- else:
51
- print("❌ Agent returned empty answer")
52
-
53
- except Exception as e:
54
- print(f"❌ Error testing agent: {e}")
55
-
56
- print("\n" + "="*50)
57
-
58
- def test_api_format():
59
- """Test that our submission format matches API expectations."""
60
- print("📡 Testing API submission format...")
61
-
62
- # Simulate what would be sent to the API
63
- sample_submission = {
64
- "task_id": "test_task_1",
65
- "model_answer": "42",
66
- "reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else..."
67
- }
68
-
69
- required_fields = ["task_id", "model_answer"]
70
- optional_fields = ["reasoning_trace"]
71
-
72
- print("Required fields check:")
73
- for field in required_fields:
74
- if field in sample_submission:
75
- print(f"✅ {field}: {sample_submission[field]}")
76
- else:
77
- print(f"❌ Missing required field: {field}")
78
-
79
- print("Optional fields check:")
80
- for field in optional_fields:
81
- if field in sample_submission:
82
- print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
83
- else:
84
- print(f"ℹ️ Optional field not present: {field}")
85
-
86
- if __name__ == "__main__":
87
- print("🔧 GAIA Agent Format Testing")
88
- print("="*50)
89
-
90
- # Test 1: Answer formatting
91
- test_answer_formatting()
92
-
93
- # Test 2: Simple agent question
94
- test_simple_question()
95
-
96
- # Test 3: API format
97
- test_api_format()
98
-
99
- print("🏁 Testing complete!")