|
from llama_index.core.agent.workflow import ( |
|
ReActAgent, |
|
FunctionAgent, |
|
CodeActAgent |
|
) |
|
from llama_index.core.llms import LLM |
|
import os |
|
from typing import Optional, List, Any, Dict |
|
from llama_index.llms.openai import OpenAI |
|
from llama_index.llms.anthropic import Anthropic |
|
|
|
from tools.multimedia_tools import ( |
|
transcribe_audio_tool, |
|
) |
|
|
|
from tools.web_tools import ( |
|
tavily_tool, |
|
wikipedia_tool |
|
) |
|
|
|
from tools.coding_tools import ( |
|
execute_python_file_tool, |
|
csv_excel_reader_tool |
|
) |
|
|
|
class GaiaAgent(ReActAgent):
    """
    A flexible ReActAgent for GAIA benchmark tasks that supports multiple LLM providers.

    This agent coordinates specialized sub-agents to solve diverse benchmark tasks,
    with precise output formatting as specified in the GAIA benchmark. It never
    answers the user directly: its system prompt instructs it to hand the final
    answer off to a "writer_agent" for formatting.
    """

    def __init__(
        self,
        model_provider: str = "openai",
        model_name: str = "gpt-4o",
        api_key: Optional[str] = None,
        system_prompt: Optional[str] = None,
        tools: Optional[List[Any]] = None,
        name: str = "jefe",
        description: str = "Master coordinator agent for GAIA benchmark tasks",
        llm: Optional[LLM] = None,
        **kwargs
    ):
        """
        Initialize a GaiaAgent with flexible model configuration.

        Args:
            model_provider: The LLM provider to use ("openai" or "anthropic").
            model_name: The specific model name to use.
            api_key: API key for the provider (defaults to the provider's
                environment variable: OPENAI_API_KEY / ANTHROPIC_API_KEY).
            system_prompt: Custom system prompt (defaults to the GAIA benchmark prompt).
            tools: List of tools to make available to the agent.
            name: Name of the agent.
            description: Description of the agent.
            llm: Pre-configured LLM instance (if provided, model_provider,
                model_name and api_key are ignored).
            **kwargs: Additional parameters forwarded to ReActAgent.

        Raises:
            ValueError: If llm is None and model_provider is not a supported
                provider (raised by _initialize_llm).
        """
        # Imported locally rather than at module level — presumably to avoid a
        # circular import with tools.text_tools; TODO confirm before hoisting.
        from tools.text_tools import reverse_text_tool

        if llm is None:
            llm = self._initialize_llm(model_provider, model_name, api_key)

        if tools is None:
            # Default toolset: text manipulation, web research (Wikipedia +
            # Tavily), audio transcription, code execution and tabular data.
            tools = [
                reverse_text_tool,
                wikipedia_tool.load_data,
                wikipedia_tool.search_data,
                tavily_tool.search,
                transcribe_audio_tool,
                execute_python_file_tool,
                csv_excel_reader_tool,
            ]

        if system_prompt is None:
            system_prompt = self._get_default_system_prompt()

        # The coordinator always delegates final-answer formatting to the
        # writer agent (see the system prompt's "DELEGATION" section).
        can_handoff_to = ["writer_agent"]

        super().__init__(
            name=name,
            description=description,
            llm=llm,
            system_prompt=system_prompt,
            tools=tools,
            can_handoff_to=can_handoff_to,
            **kwargs,
        )

    @staticmethod
    def _initialize_llm(model_provider: str, model_name: str, api_key: Optional[str]) -> LLM:
        """
        Build the LLM instance for the requested provider.

        Static because it reads no instance state; it is called from __init__
        before the base class has been initialized.

        Args:
            model_provider: Provider identifier, matched case-insensitively
                ("openai" or "anthropic").
            model_name: Provider-specific model identifier.
            api_key: Explicit API key; falls back to the provider's
                environment variable when None.

        Returns:
            A configured LLM instance.

        Raises:
            ValueError: For any provider other than "openai" or "anthropic".
        """
        model_provider = model_provider.lower()

        if model_provider == "openai":
            return OpenAI(
                model=model_name,
                api_key=api_key or os.getenv("OPENAI_API_KEY"),
                # Only reasoning models (names containing "o4", e.g. "o4-mini")
                # accept the reasoning_effort request parameter.
                additional_kwargs={"reasoning_effort": "high"} if "o4" in model_name else {},
            )

        if model_provider == "anthropic":
            return Anthropic(
                model=model_name,
                api_key=api_key or os.getenv("ANTHROPIC_API_KEY"),
                # "3-7" matches Claude 3.7 model names; extended thinking is
                # enabled for those, and Anthropic requires temperature=1.0
                # when thinking is on.
                temperature=1.0 if "3-7" in model_name else 0.5,
                thinking_dict={"type": "enabled", "budget_tokens": 2048} if "3-7" in model_name else None,
                max_tokens=2048 * 4,
            )

        raise ValueError(
            f"Unsupported model provider: {model_provider}. "
            f"Supported providers are: openai, anthropic"
        )

    @staticmethod
    def _get_default_system_prompt() -> str:
        """Return the default system prompt for GAIA benchmark tasks."""
        return """
You are the lead coordinator for a team of specialized AI agents tackling the GAIA benchmark. Your job is to analyze questions and generate detailed analysis, which you'll pass to a specialized formatting agent for final answer preparation.

## QUESTION ANALYSIS PROCESS
1. First, carefully read and parse the entire question
2. Identify the EXACT output format required (single word, name, number, comma-separated list, etc.)
3. Note any special formatting requirements (alphabetical order, specific notation, etc.)
4. Identify what type of task this is (research, audio analysis, video analysis, code execution, data analysis, etc.)
5. Break the question into sequential steps

## SOLVING METHODOLOGY
1. For each question, thoroughly work through the reasoning step-by-step
2. Use available tools when needed:
   - reverse_text_tool: For reversing text
   - search tools (wikipedia_tool, tavily_tool): For finding information
   - transcribe_audio: For transcribing audio files (provide the path to the audio file)
   - get_audio_metadata: For getting metadata about audio files
   - execute_python_file: For executing Python code files and returning their output
3. Document your full analysis, including all key facts, calculations, and relevant information
4. Clearly identify what you believe the correct answer is
5. Be extremely explicit about the required formatting for the final answer

## HANDLING CODE EXECUTION TASKS
When dealing with Python code files:
1. Check if a Python file path is available in the context's "file_name" field
2. Always use the execute_python_file tool with the exact file path provided
3. Extract the specific numeric output requested from the execution result
4. For code tasks, ensure you've captured the final numeric output exactly as printed by the code

## HANDLING AUDIO TASKS
When dealing with audio files:
1. Check if an audio file path is available in the context's "audio_file_path" field
2. Always use the transcribe_audio tool with the exact file path provided in the context
3. Extract the specific information requested from the transcript (e.g., ingredients, page numbers, names)
4. For audio tasks, ensure you've captured all relevant spoken content, including names, facts, or quotes as needed

## HANDLING CSV OR EXCEL DATA TASKS
When dealing with CSV files or data analysis tasks:
1. Check if a CSV file path is mentioned in the question or available in the context
2. Use the csv_reader tool with the specific CSV file path
3. Once the data is loaded, analyze it according to the question requirements
4. For data analysis tasks, ensure you've properly processed the CSV data and extracted the requested information
5. When calculations or statistics are needed, perform them accurately and document your methodology

## DELEGATION TO WRITER AGENT
After completing your analysis, ALWAYS delegate the final answer preparation to the writer_agent with:
- query: The original question
- research_notes: Your complete analysis, all relevant facts, and what you believe is the correct answer
- answer_format: EXPLICIT instructions on exactly how the answer should be formatted (single word, comma-separated list, etc.)

Example handoff to writer_agent:
```
I'll delegate to writer_agent to format the final answer.

query: What is the first name of the scientist who discovered penicillin?
research_notes: After researching, I found that Sir Alexander Fleming discovered penicillin in 1928. The full answer is "Alexander Fleming" but the question only asks for the first name, which is "Alexander".
answer_format: Return ONLY the first name, with no additional text, punctuation, or explanation.
```

IMPORTANT: NEVER provide the final answer directly to the user. ALWAYS hand off to the writer_agent for proper formatting.
"""
|
|
|
def create_writer_agent(model_config: Dict[str, Any]) -> ReActAgent:
    """
    Create a writer agent that formats final answers based on research notes.

    Args:
        model_config: Dictionary containing model_provider, model_name, and
            api_key. model_provider defaults to "openai", model_name to
            "gpt-4o"; api_key falls back to the provider's environment
            variable when absent.

    Returns:
        A configured ReActAgent for formatting final answers.

    Raises:
        ValueError: If model_provider is neither "openai" nor "anthropic".
    """
    model_provider = model_config.get("model_provider", "openai")
    model_name = model_config.get("model_name", "gpt-4o")
    api_key = model_config.get("api_key")

    if model_provider.lower() == "openai":
        # Low temperature and a small completion budget: the writer only has
        # to emit a short, exactly-formatted final answer.
        # Bug fix: max_tokens/temperature were previously ALSO passed via
        # additional_kwargs ({"max_tokens": 128, "temperature": 0.5}), which
        # is merged into the API request and duplicated the explicit
        # parameters with a conflicting temperature. Pass them once, as
        # first-class constructor arguments.
        llm = OpenAI(
            model=model_name,
            api_key=api_key or os.getenv("OPENAI_API_KEY"),
            max_tokens=128,
            temperature=0.1,
        )
    elif model_provider.lower() == "anthropic":
        llm = Anthropic(
            model=model_name,
            api_key=api_key or os.getenv("ANTHROPIC_API_KEY"),
            # "3-7" matches Claude 3.7 model names; Anthropic requires
            # temperature=1.0 when extended thinking is enabled.
            temperature=1.0 if "3-7" in model_name else 0.5,
            # NOTE(review): budget_tokens=5112 looks like a typo for 5120 —
            # left unchanged; confirm intent.
            thinking_dict={"type": "enabled", "budget_tokens": 5112} if "3-7" in model_name else None,
            max_tokens=2048 * 4,
        )
    else:
        raise ValueError(f"Unsupported model provider for writer agent: {model_provider}")

    return ReActAgent(
        name="writer_agent",
        description="Formats the final answer exactly as specified for GAIA benchmark questions",
        system_prompt="""
You are a specialized formatting agent for the GAIA benchmark. Your ONLY job is to take the research from the main agent and format the answer EXACTLY as required by the benchmark question.

## YOUR ROLE
You will receive:
- query: The original question
- research_notes: The main agent's complete analysis and reasoning
- answer_format: Specific formatting instructions for the final answer

## CRITICAL RULES
1. Your response MUST CONTAIN ONLY THE ANSWER - no explanations, no "the answer is" prefix
2. Follow the answer_format instructions precisely
3. Remove ALL unnecessary characters, spaces, punctuation, or wording
4. If asked for a name, provide ONLY the name
5. If asked for a number, provide ONLY the number
6. If asked for a list, format it EXACTLY as specified (comma-separated, alphabetical, etc.)
7. NEVER include your own thoughts or analysis
8. NEVER add preamble or conclusion text

## EXAMPLES OF CORRECT RESPONSES:
When asked for "first name only": Alexander
When asked for "comma-separated list in alphabetical order": apple, banana, cherry
When asked for "single number": 42
When asked for "opposite of word 'right'": left
When asked for "How many ...": eleven
When asked for "What says Yoda": "May the force be with you"

## CONCRETE EXAMPLE:
When asked "The answer to the question of Universe, life and everything"
- WRONG ANSWER: The answer to the question is 42.
- RIGHT ANSWER: 42

- For question `How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.`:
  - WRONG ANSWER : `She released three studio albums in that period – Misa Criolla (2000), Corazón Libre (2005) and Cantora (2009).`
  - RIGHT ANSWER: `Three`

- For question `"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Teal'c say in response to the question "Isn't that hot?"`:
  - WRONG ANSWER: `"He replies, “Extremely.”"`
  - RIGHT ANSWER: `Extremely`

REMEMBER: Your ENTIRE response should be just the answer - nothing more, nothing less.

DO NOT EXPLAIN THE ANSWER. SIMPLY WRITE BACK THE ANSWER.
""",
        llm=llm,
    )
|
|