Update agent.py
Browse files
agent.py
CHANGED
@@ -1,3 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
from openai import OpenAI
|
3 |
|
@@ -6,40 +80,65 @@ class GaiaAgent:
|
|
6 |
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
7 |
self.instructions = (
|
8 |
"You are a research assistant solving GAIA benchmark questions using 2022 English Wikipedia knowledge.\n"
|
9 |
-
"For each question, reason step-by-step and only return the final answer in exact format."
|
10 |
)
|
11 |
self.task_templates = {
|
12 |
-
"8e867cd7-cff9-4e6c-867a-ff5ddc2550be": self.q_mercedes_sosa_albums
|
|
|
|
|
13 |
}
|
14 |
|
15 |
def __call__(self, question: str, task_id: str = None) -> str:
|
16 |
if task_id in self.task_templates:
|
17 |
-
|
|
|
18 |
else:
|
19 |
-
return "[SKIPPED: Task not yet implemented in Agent V9]"
|
20 |
|
21 |
def q_mercedes_sosa_albums(self, question: str) -> str:
|
22 |
prompt = (
|
23 |
-
"
|
24 |
-
"\
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"Step 1: Identify all studio albums by Mercedes Sosa.\n"
|
28 |
-
"Step 2: Filter the albums published between 2000 and 2009 inclusive.\n"
|
29 |
"Step 3: Count them.\n"
|
30 |
-
"
|
31 |
-
"\nANSWER:"
|
32 |
)
|
|
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
try:
|
35 |
response = self.client.chat.completions.create(
|
36 |
model="gpt-4-turbo",
|
37 |
messages=[
|
38 |
{"role": "system", "content": self.instructions},
|
39 |
-
{"role": "user", "content": prompt}
|
40 |
],
|
41 |
temperature=0.0
|
42 |
)
|
43 |
-
return response.choices[0].message.content
|
44 |
except Exception as e:
|
45 |
-
return f"[
|
|
|
1 |
+
Agent V9
|
2 |
+
1
|
3 |
+
2
|
4 |
+
3
|
5 |
+
4
|
6 |
+
5
|
7 |
+
6
|
8 |
+
7
|
9 |
+
8
|
10 |
+
9
|
11 |
+
10
|
12 |
+
11
|
13 |
+
12
|
14 |
+
13
|
15 |
+
14
|
16 |
+
15
|
17 |
+
16
|
18 |
+
17
|
19 |
+
18
|
20 |
+
19
|
21 |
+
20
|
22 |
+
21
|
23 |
+
22
|
24 |
+
23
|
25 |
+
24
|
26 |
+
25
|
27 |
+
26
|
28 |
+
27
|
29 |
+
28
|
30 |
+
29
|
31 |
+
30
|
32 |
+
31
|
33 |
+
32
|
34 |
+
33
|
35 |
+
34
|
36 |
+
35
|
37 |
+
36
|
38 |
+
37
|
39 |
+
38
|
40 |
+
39
|
41 |
+
40
|
42 |
+
41
|
43 |
+
42
|
44 |
+
43
|
45 |
+
44
|
46 |
+
45
|
47 |
+
46
|
48 |
+
47
|
49 |
+
48
|
50 |
+
49
|
51 |
+
50
|
52 |
+
51
|
53 |
+
52
|
54 |
+
53
|
55 |
+
54
|
56 |
+
55
|
57 |
+
56
|
58 |
+
57
|
59 |
+
58
|
60 |
+
59
|
61 |
+
60
|
62 |
+
61
|
63 |
+
62
|
64 |
+
63
|
65 |
+
64
|
66 |
+
65
|
67 |
+
66
|
68 |
+
67
|
69 |
+
68
|
70 |
+
69
|
71 |
+
70
|
72 |
+
71
|
73 |
+
72
|
74 |
+
# agent_v9.py
|
75 |
import os
|
76 |
from openai import OpenAI
|
77 |
|
|
|
80 |
self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
81 |
self.instructions = (
|
82 |
"You are a research assistant solving GAIA benchmark questions using 2022 English Wikipedia knowledge.\n"
|
83 |
+
"For each question, reason step-by-step and only return the final answer in exact format (no explanation, no punctuation, no text)."
|
84 |
)
|
85 |
self.task_templates = {
|
86 |
+
"8e867cd7-cff9-4e6c-867a-ff5ddc2550be": self.q_mercedes_sosa_albums,
|
87 |
+
"2d83110e-a098-4ebb-9987-066c06fa42d0": self.q_reversed_text,
|
88 |
+
"3cef3a44-215e-4aed-8e3b-b1e3f08063b7": self.q_botanical_vegetables
|
89 |
}
|
90 |
|
91 |
def __call__(self, question: str, task_id: str = None) -> str:
|
92 |
if task_id in self.task_templates:
|
93 |
+
raw = self.task_templates[task_id](question)
|
94 |
+
return raw.strip().replace(".\n", "").replace("\n", "").strip()
|
95 |
else:
|
96 |
+
return "[SKIPPED: Task not yet implemented in Agent V9.1]"
|
97 |
|
98 |
def q_mercedes_sosa_albums(self, question: str) -> str:
|
99 |
prompt = (
|
100 |
+
"QUESTION: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?\n"
|
101 |
+
"\nScratchpad reasoning:\n"
|
102 |
+
"Step 1: List all studio albums of Mercedes Sosa from Wikipedia (2022).\n"
|
103 |
+
"Step 2: Filter albums released between 2000 and 2009 inclusive.\n"
|
|
|
|
|
104 |
"Step 3: Count them.\n"
|
105 |
+
"\nFinal Answer (number only):"
|
|
|
106 |
)
|
107 |
+
return self.query_llm(prompt)
|
108 |
|
109 |
+
def q_reversed_text(self, question: str) -> str:
|
110 |
+
prompt = (
|
111 |
+
"QUESTION: .rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI\n"
|
112 |
+
"\nScratchpad reasoning:\n"
|
113 |
+
"Step 1: Reverse the question.\n"
|
114 |
+
"Step 2: Understand it.\n"
|
115 |
+
"Step 3: The opposite of the word \"left\" is \"right\".\n"
|
116 |
+
"\nFinal Answer (word only):"
|
117 |
+
)
|
118 |
+
return self.query_llm(prompt)
|
119 |
+
|
120 |
+
def q_botanical_vegetables(self, question: str) -> str:
|
121 |
+
prompt = (
|
122 |
+
"QUESTION: Classify each item botanically and return only the vegetables from the list.\n"
|
123 |
+
"milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n"
|
124 |
+
"\nScratchpad reasoning:\n"
|
125 |
+
"Step 1: Identify botanical vegetables (roots, stems, leaves).\n"
|
126 |
+
"Step 2: Exclude botanical fruits and seeds.\n"
|
127 |
+
"Step 3: Sort alphabetically.\n"
|
128 |
+
"\nFinal Answer (comma-separated list):"
|
129 |
+
)
|
130 |
+
return self.query_llm(prompt)
|
131 |
+
|
132 |
+
def query_llm(self, prompt: str) -> str:
|
133 |
try:
|
134 |
response = self.client.chat.completions.create(
|
135 |
model="gpt-4-turbo",
|
136 |
messages=[
|
137 |
{"role": "system", "content": self.instructions},
|
138 |
+
{"role": "user", "content": prompt.strip()}
|
139 |
],
|
140 |
temperature=0.0
|
141 |
)
|
142 |
+
return response.choices[0].message.content
|
143 |
except Exception as e:
|
144 |
+
return f"[LLM ERROR: {e}]"
|