Denis Davydov
committed on
Commit
·
a5c9e62
1
Parent(s):
f9a7c9b
enhanced web search
Browse files
- agent.py +47 -42
- app.py +13 -3
- requirements.txt +5 -2
- test_agent_format.py +99 -0
- test_local.py +200 -99
- tools.py +242 -6
- utils.py +39 -55
agent.py
CHANGED
@@ -1,52 +1,45 @@
|
|
1 |
from typing import TypedDict, Annotated
|
2 |
import os
|
|
|
3 |
from langgraph.graph.message import add_messages
|
|
|
|
|
|
|
4 |
from langchain_core.messages import AnyMessage, HumanMessage, AIMessage, SystemMessage
|
5 |
from langgraph.prebuilt import ToolNode
|
6 |
from langgraph.graph import START, StateGraph
|
7 |
from langgraph.checkpoint.memory import MemorySaver
|
8 |
from langgraph.prebuilt import tools_condition
|
9 |
-
from
|
10 |
from tools import agent_tools
|
11 |
-
from utils import format_gaia_answer,
|
12 |
|
13 |
-
# Initialize LLM (
|
14 |
-
|
15 |
-
|
16 |
-
huggingfacehub_api_token=os.environ.get("HUGGINGFACE_API_TOKEN"),
|
17 |
temperature=0.1,
|
18 |
-
|
|
|
19 |
)
|
20 |
|
21 |
-
chat = ChatHuggingFace(llm=llm, verbose=True)
|
22 |
chat_with_tools = chat.bind_tools(agent_tools)
|
23 |
|
24 |
-
# System prompt for
|
25 |
-
SYSTEM_PROMPT = """You are a
|
26 |
-
|
27 |
-
Your approach should include:
|
28 |
-
- Multi-step reasoning and planning for complex questions
|
29 |
-
- Intelligent tool usage when needed for web search, file processing, calculations, and analysis
|
30 |
-
- Precise, factual answers based on reliable information
|
31 |
-
- Breaking down complex questions into manageable steps
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
6. Provide concise, direct answers without unnecessary prefixes
|
40 |
-
7. Focus on accuracy and helpfulness
|
41 |
-
8. Be factual and avoid speculation
|
42 |
|
43 |
-
|
44 |
|
45 |
# Generate the AgentState
|
46 |
class AgentState(TypedDict):
|
47 |
messages: Annotated[list[AnyMessage], add_messages]
|
48 |
task_id: str
|
49 |
-
question_analysis: dict
|
50 |
|
51 |
def assistant(state: AgentState):
|
52 |
"""Main assistant function that processes messages and calls tools."""
|
@@ -91,18 +84,14 @@ class SmartAgent:
|
|
91 |
|
92 |
def __init__(self):
|
93 |
self.agent = create_smart_agent()
|
94 |
-
print("🤖 Smart Agent initialized with
|
95 |
|
96 |
-
def __call__(self, question: str, task_id: str = None) ->
|
97 |
-
"""Process a question and return the formatted answer."""
|
98 |
try:
|
99 |
print(f"\n🎯 Processing question: {question[:100]}...")
|
100 |
|
101 |
-
#
|
102 |
-
analysis = analyze_question_type(question)
|
103 |
-
print(f"📊 Question analysis: {analysis}")
|
104 |
-
|
105 |
-
# Create execution plan
|
106 |
plan = create_execution_plan(question, task_id)
|
107 |
print(f"📋 Execution plan: {plan}")
|
108 |
|
@@ -111,24 +100,35 @@ class SmartAgent:
|
|
111 |
if task_id:
|
112 |
enhanced_question = f"Task ID: {task_id}\n\nQuestion: {question}\n\nNote: If this question involves files, use the file_download tool with task_id '{task_id}' to access associated files."
|
113 |
|
114 |
-
# Invoke the agent
|
115 |
thread_id = f"task-{task_id}" if task_id else "general"
|
116 |
-
config = {
|
|
|
|
|
|
|
117 |
|
118 |
initial_state = {
|
119 |
"messages": [HumanMessage(content=enhanced_question)],
|
120 |
-
"task_id": task_id or ""
|
121 |
-
"question_analysis": analysis
|
122 |
}
|
123 |
|
124 |
result = self.agent.invoke(initial_state, config=config)
|
125 |
|
126 |
-
# Extract the final answer
|
127 |
if result and 'messages' in result and result['messages']:
|
128 |
final_message = result['messages'][-1]
|
129 |
raw_answer = final_message.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
else:
|
131 |
raw_answer = "No response generated"
|
|
|
132 |
|
133 |
# Format the answer for submission
|
134 |
formatted_answer = format_gaia_answer(raw_answer)
|
@@ -136,11 +136,16 @@ class SmartAgent:
|
|
136 |
print(f"✅ Raw answer: {raw_answer}")
|
137 |
print(f"🎯 Formatted answer: {formatted_answer}")
|
138 |
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
except Exception as e:
|
142 |
error_msg = f"Error processing question: {str(e)}"
|
143 |
print(f"❌ {error_msg}")
|
144 |
-
return error_msg
|
145 |
|
146 |
smart_agent = SmartAgent()
|
|
|
1 |
from typing import TypedDict, Annotated
|
2 |
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
from langgraph.graph.message import add_messages
|
5 |
+
|
6 |
+
# Load environment variables from .env file
|
7 |
+
load_dotenv()
|
8 |
from langchain_core.messages import AnyMessage, HumanMessage, AIMessage, SystemMessage
|
9 |
from langgraph.prebuilt import ToolNode
|
10 |
from langgraph.graph import START, StateGraph
|
11 |
from langgraph.checkpoint.memory import MemorySaver
|
12 |
from langgraph.prebuilt import tools_condition
|
13 |
+
from langchain_openai import ChatOpenAI
|
14 |
from tools import agent_tools
|
15 |
+
from utils import format_gaia_answer, create_execution_plan, log_agent_step
|
16 |
|
17 |
+
# Initialize OpenAI LLM with GPT-4o (most capable model)
|
18 |
+
chat = ChatOpenAI(
|
19 |
+
model="gpt-4o",
|
|
|
20 |
temperature=0.1,
|
21 |
+
max_tokens=1024,
|
22 |
+
api_key=os.environ.get("OPENAI_API_KEY")
|
23 |
)
|
24 |
|
|
|
25 |
chat_with_tools = chat.bind_tools(agent_tools)
|
26 |
|
27 |
+
# System prompt for GAIA evaluation (exact format required by HF)
|
28 |
+
SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
+
You have access to tools that can help you:
|
31 |
+
- Search the web for current information
|
32 |
+
- Download and process files associated with task IDs
|
33 |
+
- Analyze images
|
34 |
+
- Perform calculations
|
35 |
+
- Process text
|
|
|
|
|
|
|
36 |
|
37 |
+
IMPORTANT: You must provide a specific answer in the FINAL ANSWER format. Do not say you cannot find information or provide general approaches. Use web search to find the information you need, but limit yourself to 2-3 search attempts maximum. If you cannot find perfect information, make your best determination based on what you found and provide a concrete FINAL ANSWER. Always end with a specific FINAL ANSWER, never with explanations about not finding information."""
|
38 |
|
39 |
# Generate the AgentState
|
40 |
class AgentState(TypedDict):
|
41 |
messages: Annotated[list[AnyMessage], add_messages]
|
42 |
task_id: str
|
|
|
43 |
|
44 |
def assistant(state: AgentState):
|
45 |
"""Main assistant function that processes messages and calls tools."""
|
|
|
84 |
|
85 |
def __init__(self):
|
86 |
self.agent = create_smart_agent()
|
87 |
+
print("🤖 Smart Agent initialized with OpenAI GPT-4o and tools")
|
88 |
|
89 |
+
def __call__(self, question: str, task_id: str = None) -> tuple:
|
90 |
+
"""Process a question and return the formatted answer and reasoning trace."""
|
91 |
try:
|
92 |
print(f"\n🎯 Processing question: {question[:100]}...")
|
93 |
|
94 |
+
# Create simple execution plan for logging
|
|
|
|
|
|
|
|
|
95 |
plan = create_execution_plan(question, task_id)
|
96 |
print(f"📋 Execution plan: {plan}")
|
97 |
|
|
|
100 |
if task_id:
|
101 |
enhanced_question = f"Task ID: {task_id}\n\nQuestion: {question}\n\nNote: If this question involves files, use the file_download tool with task_id '{task_id}' to access associated files."
|
102 |
|
103 |
+
# Invoke the agent - let GPT-4o decide what tools to use
|
104 |
thread_id = f"task-{task_id}" if task_id else "general"
|
105 |
+
config = {
|
106 |
+
"configurable": {"thread_id": thread_id},
|
107 |
+
"recursion_limit": 15 # Allow more tool usage for complex searches
|
108 |
+
}
|
109 |
|
110 |
initial_state = {
|
111 |
"messages": [HumanMessage(content=enhanced_question)],
|
112 |
+
"task_id": task_id or ""
|
|
|
113 |
}
|
114 |
|
115 |
result = self.agent.invoke(initial_state, config=config)
|
116 |
|
117 |
+
# Extract the final answer and reasoning trace
|
118 |
if result and 'messages' in result and result['messages']:
|
119 |
final_message = result['messages'][-1]
|
120 |
raw_answer = final_message.content
|
121 |
+
|
122 |
+
# Build reasoning trace from all messages
|
123 |
+
reasoning_trace = []
|
124 |
+
for msg in result['messages']:
|
125 |
+
if hasattr(msg, 'content') and msg.content:
|
126 |
+
reasoning_trace.append(msg.content)
|
127 |
+
|
128 |
+
reasoning_text = "\n---\n".join(reasoning_trace)
|
129 |
else:
|
130 |
raw_answer = "No response generated"
|
131 |
+
reasoning_text = "No reasoning trace available"
|
132 |
|
133 |
# Format the answer for submission
|
134 |
formatted_answer = format_gaia_answer(raw_answer)
|
|
|
136 |
print(f"✅ Raw answer: {raw_answer}")
|
137 |
print(f"🎯 Formatted answer: {formatted_answer}")
|
138 |
|
139 |
+
# Validate the formatted answer
|
140 |
+
if not formatted_answer or formatted_answer.strip() == "":
|
141 |
+
print("⚠️ WARNING: Empty formatted answer!")
|
142 |
+
formatted_answer = "ERROR: No valid answer extracted"
|
143 |
+
|
144 |
+
return formatted_answer, reasoning_text
|
145 |
|
146 |
except Exception as e:
|
147 |
error_msg = f"Error processing question: {str(e)}"
|
148 |
print(f"❌ {error_msg}")
|
149 |
+
return error_msg, f"Error occurred: {str(e)}"
|
150 |
|
151 |
smart_agent = SmartAgent()
|
app.py
CHANGED
@@ -71,12 +71,22 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
71 |
print(f"Skipping item with missing task_id or question: {item}")
|
72 |
continue
|
73 |
try:
|
74 |
-
submitted_answer = agent(question_text, task_id)
|
75 |
-
answers_payload.append({
|
|
|
|
|
|
|
|
|
76 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
77 |
except Exception as e:
|
78 |
print(f"Error running agent on task {task_id}: {e}")
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
if not answers_payload:
|
82 |
print("Agent did not produce any answers to submit.")
|
|
|
71 |
print(f"Skipping item with missing task_id or question: {item}")
|
72 |
continue
|
73 |
try:
|
74 |
+
submitted_answer, reasoning_trace = agent(question_text, task_id)
|
75 |
+
answers_payload.append({
|
76 |
+
"task_id": task_id,
|
77 |
+
"model_answer": submitted_answer,
|
78 |
+
"reasoning_trace": reasoning_trace
|
79 |
+
})
|
80 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
81 |
except Exception as e:
|
82 |
print(f"Error running agent on task {task_id}: {e}")
|
83 |
+
error_answer = f"AGENT ERROR: {e}"
|
84 |
+
answers_payload.append({
|
85 |
+
"task_id": task_id,
|
86 |
+
"model_answer": error_answer,
|
87 |
+
"reasoning_trace": f"Error occurred: {str(e)}"
|
88 |
+
})
|
89 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": error_answer})
|
90 |
|
91 |
if not answers_payload:
|
92 |
print("Agent did not produce any answers to submit.")
|
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
# Core dependencies from unit3
|
2 |
langchain
|
3 |
langchain-community
|
4 |
-
langchain-
|
5 |
langgraph
|
6 |
huggingface_hub
|
7 |
|
@@ -10,8 +10,11 @@ gradio
|
|
10 |
requests
|
11 |
pillow
|
12 |
PyPDF2
|
13 |
-
|
14 |
python-dotenv
|
|
|
|
|
|
|
15 |
|
16 |
# For image processing and multimodal capabilities
|
17 |
transformers
|
|
|
1 |
# Core dependencies from unit3
|
2 |
langchain
|
3 |
langchain-community
|
4 |
+
langchain-openai
|
5 |
langgraph
|
6 |
huggingface_hub
|
7 |
|
|
|
10 |
requests
|
11 |
pillow
|
12 |
PyPDF2
|
13 |
+
ddgs
|
14 |
python-dotenv
|
15 |
+
beautifulsoup4
|
16 |
+
faiss-cpu
|
17 |
+
langchain-text-splitters
|
18 |
|
19 |
# For image processing and multimodal capabilities
|
20 |
transformers
|
test_agent_format.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script to verify the agent's answer formatting works correctly.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
from agent import smart_agent
|
8 |
+
from utils import format_gaia_answer
|
9 |
+
|
10 |
+
def test_answer_formatting():
|
11 |
+
"""Test the answer formatting function with various inputs."""
|
12 |
+
|
13 |
+
test_cases = [
|
14 |
+
# Test case: (raw_answer, expected_format)
|
15 |
+
("I think the answer is 42. FINAL ANSWER: 42", "42"),
|
16 |
+
("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
|
17 |
+
("After research, FINAL ANSWER: New York", "New York"),
|
18 |
+
("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
|
19 |
+
("FINAL ANSWER: 1,234", "1234"), # Should remove commas from numbers
|
20 |
+
("FINAL ANSWER: \"Hello World\"", "Hello World"), # Should remove quotes
|
21 |
+
("FINAL ANSWER: approximately 100", "100"), # Should remove qualifiers
|
22 |
+
("No clear final answer format here", "No clear final answer format here"), # Fallback
|
23 |
+
]
|
24 |
+
|
25 |
+
print("🧪 Testing answer formatting...")
|
26 |
+
for i, (raw, expected) in enumerate(test_cases, 1):
|
27 |
+
result = format_gaia_answer(raw)
|
28 |
+
status = "✅" if result == expected else "❌"
|
29 |
+
print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
|
30 |
+
if result != expected:
|
31 |
+
print(f" ⚠️ Mismatch detected!")
|
32 |
+
|
33 |
+
print("\n" + "="*50)
|
34 |
+
|
35 |
+
def test_simple_question():
|
36 |
+
"""Test the agent with a simple question."""
|
37 |
+
print("🤖 Testing agent with a simple question...")
|
38 |
+
|
39 |
+
question = "What is 2 + 2?"
|
40 |
+
try:
|
41 |
+
answer, reasoning = smart_agent(question)
|
42 |
+
print(f"Question: {question}")
|
43 |
+
print(f"Answer: {answer}")
|
44 |
+
print(f"Reasoning length: {len(reasoning)} characters")
|
45 |
+
print(f"Raw reasoning preview: {reasoning[:200]}...")
|
46 |
+
|
47 |
+
# Check if answer follows expected format
|
48 |
+
if answer and answer.strip():
|
49 |
+
print("✅ Agent returned a non-empty answer")
|
50 |
+
else:
|
51 |
+
print("❌ Agent returned empty answer")
|
52 |
+
|
53 |
+
except Exception as e:
|
54 |
+
print(f"❌ Error testing agent: {e}")
|
55 |
+
|
56 |
+
print("\n" + "="*50)
|
57 |
+
|
58 |
+
def test_api_format():
|
59 |
+
"""Test that our submission format matches API expectations."""
|
60 |
+
print("📡 Testing API submission format...")
|
61 |
+
|
62 |
+
# Simulate what would be sent to the API
|
63 |
+
sample_submission = {
|
64 |
+
"task_id": "test_task_1",
|
65 |
+
"model_answer": "42",
|
66 |
+
"reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else..."
|
67 |
+
}
|
68 |
+
|
69 |
+
required_fields = ["task_id", "model_answer"]
|
70 |
+
optional_fields = ["reasoning_trace"]
|
71 |
+
|
72 |
+
print("Required fields check:")
|
73 |
+
for field in required_fields:
|
74 |
+
if field in sample_submission:
|
75 |
+
print(f"✅ {field}: {sample_submission[field]}")
|
76 |
+
else:
|
77 |
+
print(f"❌ Missing required field: {field}")
|
78 |
+
|
79 |
+
print("Optional fields check:")
|
80 |
+
for field in optional_fields:
|
81 |
+
if field in sample_submission:
|
82 |
+
print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
|
83 |
+
else:
|
84 |
+
print(f"ℹ️ Optional field not present: {field}")
|
85 |
+
|
86 |
+
if __name__ == "__main__":
|
87 |
+
print("🔧 GAIA Agent Format Testing")
|
88 |
+
print("="*50)
|
89 |
+
|
90 |
+
# Test 1: Answer formatting
|
91 |
+
test_answer_formatting()
|
92 |
+
|
93 |
+
# Test 2: Simple agent question
|
94 |
+
test_simple_question()
|
95 |
+
|
96 |
+
# Test 3: API format
|
97 |
+
test_api_format()
|
98 |
+
|
99 |
+
print("🏁 Testing complete!")
|
test_local.py
CHANGED
@@ -1,137 +1,238 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
-
|
4 |
-
|
5 |
"""
|
6 |
|
7 |
-
import
|
8 |
-
import
|
9 |
-
from dotenv import load_dotenv
|
10 |
-
|
11 |
-
# Load environment variables
|
12 |
-
load_dotenv()
|
13 |
-
|
14 |
-
# Add current directory to path for imports
|
15 |
-
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
16 |
-
|
17 |
-
from utils import fetch_random_question, analyze_question_type
|
18 |
from agent import smart_agent
|
19 |
|
20 |
-
def
|
21 |
-
"""Test the question
|
22 |
-
print("🧪 Testing question analysis...")
|
23 |
-
|
24 |
-
test_questions = [
|
25 |
-
"What is the current population of Tokyo?",
|
26 |
-
"Calculate 15 * 23 + 45",
|
27 |
-
"Analyze the image shown in the document",
|
28 |
-
"Extract all dates from the provided text file"
|
29 |
-
]
|
30 |
-
|
31 |
-
for question in test_questions:
|
32 |
-
analysis = analyze_question_type(question)
|
33 |
-
print(f"Question: {question}")
|
34 |
-
print(f"Analysis: {analysis}")
|
35 |
-
print()
|
36 |
-
|
37 |
-
def test_tools():
|
38 |
-
"""Test individual tools."""
|
39 |
-
print("🔧 Testing individual tools...")
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
calc_result = calculator_tool.func("15 + 27")
|
44 |
-
print(f"Calculator test: {calc_result}")
|
45 |
|
46 |
-
#
|
47 |
-
|
48 |
-
|
49 |
-
search_result = web_search_tool.func("Python programming language")
|
50 |
-
print(f"Web search test: {search_result[:100]}...")
|
51 |
-
except Exception as e:
|
52 |
-
print(f"Web search test failed: {e}")
|
53 |
|
|
|
54 |
print()
|
55 |
-
|
56 |
-
def test_agent_simple():
|
57 |
-
"""Test the agent with a simple question."""
|
58 |
-
print("🤖 Testing Smart agent with simple question...")
|
59 |
|
60 |
-
|
|
|
61 |
try:
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
66 |
except Exception as e:
|
67 |
-
print(f"❌
|
|
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
-
def
|
72 |
-
"""Test the agent with a
|
73 |
-
|
|
|
|
|
74 |
|
|
|
|
|
75 |
try:
|
76 |
question_data = fetch_random_question()
|
77 |
if not question_data:
|
78 |
-
print("❌ Failed to fetch question
|
79 |
-
return
|
80 |
|
81 |
-
task_id = question_data.get("task_id")
|
82 |
-
|
83 |
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
except Exception as e:
|
93 |
-
print(f"❌
|
|
|
94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
print()
|
96 |
-
|
97 |
-
def check_environment():
|
98 |
-
"""Check if all required environment variables are set."""
|
99 |
-
print("🔍 Checking environment...")
|
100 |
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
|
|
103 |
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
else:
|
108 |
-
print(f"✅ {var} is set")
|
109 |
|
110 |
-
if
|
111 |
-
|
112 |
-
print("
|
|
|
|
|
113 |
return False
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
122 |
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
print()
|
129 |
|
130 |
-
#
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
if __name__ == "__main__":
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
+
Test script for validating agent performance on a random GAIA question.
|
4 |
+
Fetches one random question and tests the complete pipeline without submitting.
|
5 |
"""
|
6 |
|
7 |
+
import time
|
8 |
+
from utils import fetch_random_question, format_gaia_answer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
from agent import smart_agent
|
10 |
|
11 |
+
def test_predefined_gaia_question():
|
12 |
+
"""Test the agent with a predefined GAIA question to verify web search and answer format."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
+
print("🧪 Testing predefined GAIA question (1928 Olympics)")
|
15 |
+
print("="*60)
|
|
|
|
|
16 |
|
17 |
+
# Predefined question that requires web search
|
18 |
+
question = "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer."
|
19 |
+
task_id = "predefined_test"
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
print(f"❓ Question: {question}")
|
22 |
print()
|
|
|
|
|
|
|
|
|
23 |
|
24 |
+
# Run the agent
|
25 |
+
print("🤖 Running smart agent on the predefined question...")
|
26 |
try:
|
27 |
+
start_time = time.time()
|
28 |
+
answer, reasoning_trace = smart_agent(question, task_id)
|
29 |
+
end_time = time.time()
|
30 |
+
|
31 |
+
processing_time = end_time - start_time
|
32 |
+
print(f"✅ Agent completed in {processing_time:.2f} seconds")
|
33 |
+
print()
|
34 |
+
|
35 |
except Exception as e:
|
36 |
+
print(f"❌ Error running agent: {e}")
|
37 |
+
return False
|
38 |
|
39 |
+
# Display results
|
40 |
+
print("📊 AGENT RESULTS")
|
41 |
+
print("-" * 40)
|
42 |
+
print(f"🎯 Formatted Answer: '{answer}'")
|
43 |
+
print(f"📝 Reasoning Length: {len(reasoning_trace)} characters")
|
44 |
+
print(f"⏱️ Processing Time: {processing_time:.2f}s")
|
45 |
print()
|
46 |
+
|
47 |
+
# Show reasoning trace preview
|
48 |
+
print("🧠 REASONING TRACE PREVIEW")
|
49 |
+
print("-" * 40)
|
50 |
+
reasoning_preview = reasoning_trace[:400] + "..." if len(reasoning_trace) > 400 else reasoning_trace
|
51 |
+
print(reasoning_preview)
|
52 |
+
print()
|
53 |
+
|
54 |
+
# Validate answer format for GAIA
|
55 |
+
print("✅ GAIA FORMAT VALIDATION")
|
56 |
+
print("-" * 40)
|
57 |
+
|
58 |
+
# Check if answer is not empty
|
59 |
+
if answer and answer.strip():
|
60 |
+
print("✅ Answer is not empty")
|
61 |
+
else:
|
62 |
+
print("❌ Answer is empty or None")
|
63 |
+
return False
|
64 |
+
|
65 |
+
# Check if answer looks like IOC country code (2-3 uppercase letters)
|
66 |
+
import re
|
67 |
+
if re.match(r'^[A-Z]{2,3}$', answer.strip()):
|
68 |
+
print(f"✅ Answer '{answer}' matches IOC country code format")
|
69 |
+
else:
|
70 |
+
print(f"⚠️ Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")
|
71 |
+
|
72 |
+
# Check if web search was used (look for web_search in reasoning)
|
73 |
+
if "web_search" in reasoning_trace.lower() or "search" in reasoning_trace.lower():
|
74 |
+
print("✅ Agent appears to have used web search")
|
75 |
+
else:
|
76 |
+
print("⚠️ No clear evidence of web search usage")
|
77 |
+
|
78 |
+
# Check answer length (should be short for country code)
|
79 |
+
if len(answer.strip()) <= 5:
|
80 |
+
print("✅ Answer length is appropriate for country code")
|
81 |
+
else:
|
82 |
+
print("⚠️ Answer seems too long for a country code")
|
83 |
+
|
84 |
+
print()
|
85 |
+
|
86 |
+
# Final validation
|
87 |
+
print("🏁 FINAL VALIDATION")
|
88 |
+
print("-" * 40)
|
89 |
+
|
90 |
+
if answer and answer.strip() and len(answer.strip()) <= 5:
|
91 |
+
print("✅ PREDEFINED TEST PASSED - Answer format suitable for GAIA")
|
92 |
+
print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
|
93 |
+
return True
|
94 |
+
else:
|
95 |
+
print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
|
96 |
+
return False
|
97 |
|
98 |
+
def test_random_gaia_question():
|
99 |
+
"""Test the agent with a random GAIA question and validate the complete pipeline."""
|
100 |
+
|
101 |
+
print("🔧 GAIA Random Question Test")
|
102 |
+
print("="*60)
|
103 |
|
104 |
+
# Step 1: Fetch a random question
|
105 |
+
print("📡 Fetching random question from GAIA API...")
|
106 |
try:
|
107 |
question_data = fetch_random_question()
|
108 |
if not question_data:
|
109 |
+
print("❌ Failed to fetch random question")
|
110 |
+
return False
|
111 |
|
112 |
+
task_id = question_data.get("task_id", "unknown")
|
113 |
+
question_text = question_data.get("question", "")
|
114 |
|
115 |
+
if not question_text:
|
116 |
+
print("❌ No question text in response")
|
117 |
+
return False
|
118 |
+
|
119 |
+
print(f"✅ Successfully fetched question")
|
120 |
+
print(f"📋 Task ID: {task_id}")
|
121 |
+
print(f"❓ Question: {question_text}")
|
122 |
+
print()
|
123 |
|
124 |
+
except Exception as e:
|
125 |
+
print(f"❌ Error fetching question: {e}")
|
126 |
+
return False
|
127 |
+
|
128 |
+
# Step 2: Run the agent
|
129 |
+
print("🤖 Running smart agent on the question...")
|
130 |
+
try:
|
131 |
+
start_time = time.time()
|
132 |
+
answer, reasoning_trace = smart_agent(question_text, task_id)
|
133 |
+
end_time = time.time()
|
134 |
+
|
135 |
+
processing_time = end_time - start_time
|
136 |
+
print(f"✅ Agent completed in {processing_time:.2f} seconds")
|
137 |
+
print()
|
138 |
|
139 |
except Exception as e:
|
140 |
+
print(f"❌ Error running agent: {e}")
|
141 |
+
return False
|
142 |
|
143 |
+
# Step 3: Display results
|
144 |
+
print("📊 AGENT RESULTS")
|
145 |
+
print("-" * 40)
|
146 |
+
print(f"🎯 Formatted Answer: '{answer}'")
|
147 |
+
print(f"📝 Reasoning Length: {len(reasoning_trace)} characters")
|
148 |
+
print(f"⏱️ Processing Time: {processing_time:.2f}s")
|
149 |
print()
|
|
|
|
|
|
|
|
|
150 |
|
151 |
+
# Step 4: Show reasoning trace preview
|
152 |
+
print("🧠 REASONING TRACE PREVIEW")
|
153 |
+
print("-" * 40)
|
154 |
+
reasoning_preview = reasoning_trace[:300] + "..." if len(reasoning_trace) > 300 else reasoning_trace
|
155 |
+
print(reasoning_preview)
|
156 |
+
print()
|
157 |
|
158 |
+
# Step 5: Validate answer format
|
159 |
+
print("✅ ANSWER VALIDATION")
|
160 |
+
print("-" * 40)
|
|
|
|
|
161 |
|
162 |
+
# Check if answer is not empty
|
163 |
+
if answer and answer.strip():
|
164 |
+
print("✅ Answer is not empty")
|
165 |
+
else:
|
166 |
+
print("❌ Answer is empty or None")
|
167 |
return False
|
168 |
|
169 |
+
# Check if answer contains error messages
|
170 |
+
if "ERROR" in answer.upper() or "FAILED" in answer.upper():
|
171 |
+
print("⚠️ Answer contains error message")
|
172 |
+
else:
|
173 |
+
print("✅ Answer appears to be valid (no error messages)")
|
174 |
+
|
175 |
+
# Check answer length (reasonable bounds)
|
176 |
+
if len(answer) > 1000:
|
177 |
+
print("⚠️ Answer is very long (>1000 chars) - might need review")
|
178 |
+
else:
|
179 |
+
print("✅ Answer length is reasonable")
|
180 |
|
181 |
+
print()
|
182 |
+
|
183 |
+
# Step 6: Show submission format
|
184 |
+
print("📡 SUBMISSION FORMAT PREVIEW")
|
185 |
+
print("-" * 40)
|
186 |
+
|
187 |
+
submission_entry = {
|
188 |
+
"task_id": task_id,
|
189 |
+
"model_answer": answer,
|
190 |
+
"reasoning_trace": reasoning_trace
|
191 |
+
}
|
192 |
+
|
193 |
+
# Validate required fields
|
194 |
+
required_fields = ["task_id", "model_answer"]
|
195 |
+
all_valid = True
|
196 |
+
|
197 |
+
for field in required_fields:
|
198 |
+
if field in submission_entry and submission_entry[field]:
|
199 |
+
print(f"✅ {field}: '{submission_entry[field][:50]}{'...' if len(str(submission_entry[field])) > 50 else ''}'")
|
200 |
+
else:
|
201 |
+
print(f"❌ Missing or empty {field}")
|
202 |
+
all_valid = False
|
203 |
+
|
204 |
+
# Check optional fields
|
205 |
+
if "reasoning_trace" in submission_entry and submission_entry["reasoning_trace"]:
|
206 |
+
print(f"✅ reasoning_trace: Present ({len(submission_entry['reasoning_trace'])} chars)")
|
207 |
+
else:
|
208 |
+
print("ℹ️ reasoning_trace: Not present (optional)")
|
209 |
|
210 |
print()
|
211 |
|
212 |
+
# Step 7: Final validation
|
213 |
+
print("🏁 FINAL VALIDATION")
|
214 |
+
print("-" * 40)
|
215 |
+
|
216 |
+
if all_valid and answer and answer.strip():
|
217 |
+
print("✅ ALL CHECKS PASSED - Agent is ready for submission!")
|
218 |
+
print("🚀 You can now run the full evaluation with confidence.")
|
219 |
+
return True
|
220 |
+
else:
|
221 |
+
print("❌ SOME CHECKS FAILED - Please review the issues above.")
|
222 |
+
return False
|
223 |
|
224 |
if __name__ == "__main__":
|
225 |
+
print("🧪 Testing agent with predefined GAIA question...")
|
226 |
+
print("This test validates web search functionality and answer formatting.")
|
227 |
+
print()
|
228 |
+
|
229 |
+
# Test the predefined 1928 Olympics question
|
230 |
+
success = test_predefined_gaia_question()
|
231 |
+
|
232 |
+
print("\n" + "="*60)
|
233 |
+
if success:
|
234 |
+
print("🎉 Predefined test completed successfully! Agent produces well-defined answers.")
|
235 |
+
print("💡 You can also run test_random_gaia_question() for additional testing.")
|
236 |
+
else:
|
237 |
+
print("⚠️ Predefined test revealed issues that need to be addressed.")
|
238 |
+
print("="*60)
|
tools.py
CHANGED
@@ -4,23 +4,106 @@ import os
|
|
4 |
from PIL import Image
|
5 |
import io
|
6 |
import base64
|
7 |
-
from
|
8 |
from typing import Optional
|
9 |
import json
|
10 |
import PyPDF2
|
11 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
#
|
14 |
-
|
15 |
|
16 |
def web_search_tool_func(query: str) -> str:
|
17 |
-
"""
|
18 |
try:
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
except Exception as e:
|
22 |
return f"Web search failed: {str(e)}"
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
web_search_tool = Tool(
|
25 |
name="web_search",
|
26 |
func=web_search_tool_func,
|
@@ -170,9 +253,162 @@ text_processor_tool = Tool(
|
|
170 |
description="Processes text for various operations like summarization, number extraction, date extraction. Specify operation as second parameter."
|
171 |
)
|
172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
# List of all tools for easy import
|
174 |
agent_tools = [
|
175 |
web_search_tool,
|
|
|
176 |
file_download_tool,
|
177 |
image_analysis_tool,
|
178 |
calculator_tool,
|
|
|
4 |
from PIL import Image
|
5 |
import io
|
6 |
import base64
|
7 |
+
from ddgs import DDGS
|
8 |
from typing import Optional
|
9 |
import json
|
10 |
import PyPDF2
|
11 |
import tempfile
|
12 |
+
import requests
|
13 |
+
from bs4 import BeautifulSoup
|
14 |
+
from langchain_community.vectorstores import FAISS
|
15 |
+
from langchain_openai import OpenAIEmbeddings
|
16 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
17 |
+
from langchain.schema import Document
|
18 |
+
from dotenv import load_dotenv
|
19 |
|
20 |
+
# Load environment variables
|
21 |
+
load_dotenv()
|
22 |
|
23 |
def web_search_tool_func(query: str) -> str:
    """Search the web, preferring Wikipedia and falling back to a general search.

    Args:
        query: The search query text.

    Returns:
        One of:
        - "Wikipedia search results:\\n..." when the Wikipedia helper returned a
          substantial (more than 100 characters) non-error result,
        - "Web search results:\\n..." when the general helper returned results,
        - "No relevant search results found." when both helpers came back empty,
        - "Web search failed: ..." when a helper reported a failure or an
          unexpected exception was raised here.
    """
    try:
        print(f"🔍 Performing web search for: {query}")

        wiki_results = search_wikipedia(query)
        # The helper reports its own errors as a "Wikipedia search failed: ..."
        # string. Such a message must never be presented as a search result,
        # and a long one must not prevent the general-search fallback.
        wiki_failed = wiki_results.startswith("Wikipedia search failed:")
        if wiki_results and not wiki_failed and len(wiki_results.strip()) > 100:
            return f"Wikipedia search results:\n{wiki_results}"

        # Fall back to general web search
        general_results = search_general(query)
        if not general_results:
            return "No relevant search results found."
        if general_results.startswith("General search failed:"):
            # Surface the helper's error as an error, not as "results".
            return f"Web search failed: {general_results}"
        return f"Web search results:\n{general_results}"

    except Exception as e:
        return f"Web search failed: {str(e)}"
|
41 |
|
42 |
+
def search_wikipedia(query: str) -> str:
    """Look up *query* on Wikipedia through DDGS text search.

    Three query phrasings are tried in order. The first phrasing that yields
    at least one hit whose URL contains "wikipedia.org" and whose title or
    body contains any whitespace-separated term of the query wins.

    Returns:
        The matching hits formatted as Title/Content/Source sections joined by
        "---" separators, an empty string when no phrasing produced a match,
        or a "Wikipedia search failed: ..." message if the search session
        itself raised.
    """
    try:
        with DDGS() as ddgs:
            candidate_queries = (
                f"site:en.wikipedia.org {query}",  # English Wikipedia specifically
                f"{query} site:wikipedia.org",  # Alternative format
                f"{query} wikipedia",  # General Wikipedia search
            )

            for candidate in candidate_queries:
                try:
                    hits = list(ddgs.text(candidate, max_results=3))

                    # Keep only hits that are on Wikipedia and mention a query term.
                    sections = [
                        f"Title: {title}\nContent: {body}\nSource: {url}\n"
                        for title, body, url in (
                            (
                                hit.get('title', 'No title'),
                                hit.get('body', 'No content'),
                                hit.get('href', ''),
                            )
                            for hit in hits
                        )
                        if 'wikipedia.org' in url.lower()
                        and any(
                            term in title.lower() or term in body.lower()
                            for term in query.lower().split()
                        )
                    ]

                    if sections:
                        return "\n---\n".join(sections)

                except Exception:
                    # Best effort: a failing phrasing just moves on to the next one.
                    continue

            return ""  # No good results found

    except Exception as e:
        return f"Wikipedia search failed: {str(e)}"
|
79 |
+
|
80 |
+
def _is_reliable_source(url: str) -> bool:
    """Return True when the URL's host is Wikipedia, Britannica, or an edu/gov domain."""
    from urllib.parse import urlparse

    try:
        # Scheme-less URLs have no netloc unless they are prefixed with "//".
        parsed = urlparse(url if '://' in url else f"//{url}")
        host = (parsed.hostname or '').lower()
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): simply not prioritized.
        return False

    if any(host == domain or host.endswith('.' + domain)
           for domain in ('wikipedia.org', 'britannica.com')):
        return True

    # Match "edu"/"gov" as a trailing domain label (mit.edu, nasa.gov, gov.uk,
    # unsw.edu.au) rather than as a substring anywhere in the URL, which would
    # also match hosts like "education.com" or paths like "/governor".
    return any(label in ('edu', 'gov') for label in host.split('.')[-2:])


def search_general(query: str) -> str:
    """Run a general DDGS text search, used as the fallback web search.

    Up to five hits are fetched. Hits from reliable sources (Wikipedia,
    Britannica, edu/gov domains) are listed first; within each group the
    original search ranking is preserved.

    Args:
        query: The search query text.

    Returns:
        The hits formatted as Title/Content/Source sections joined by "---"
        separators, an empty string when the search returned nothing, or a
        "General search failed: ..." message if an exception was raised.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=5))

            if not results:
                return ""

            reliable = []
            others = []
            for result in results:
                title = result.get('title', 'No title')
                body = result.get('body', 'No content')
                url = result.get('href', '')

                entry = f"Title: {title}\nContent: {body}\nSource: {url}\n"
                # Stable partition: appending (instead of insert(0, ...))
                # keeps the engine's ranking among the reliable sources.
                (reliable if _is_reliable_source(url) else others).append(entry)

            return "\n---\n".join(reliable + others)

    except Exception as e:
        return f"General search failed: {str(e)}"
|
106 |
+
|
107 |
web_search_tool = Tool(
|
108 |
name="web_search",
|
109 |
func=web_search_tool_func,
|
|
|
253 |
description="Processes text for various operations like summarization, number extraction, date extraction. Specify operation as second parameter."
|
254 |
)
|
255 |
|
256 |
+
def enhanced_web_retrieval_tool_func(query: str) -> str:
    """Answer *query* by fetching search-result pages and ranking their content.

    Pipeline: collect result URLs, download the text of at most the first four,
    wrap each non-empty page in a Document, then delegate to the vector-store
    search to pick the passages most relevant to the query.

    Returns:
        The formatted vector-search output, a short explanatory message when
        there are no results or no page could be fetched, or an
        "Enhanced web retrieval failed: ..." message on any exception.
    """
    try:
        print(f"🔍 Enhanced web retrieval for: {query}")

        # Step 1: search results with their URLs
        hits = get_search_urls(query)
        if not hits:
            return "No search results found."

        # Step 2: download each page; pages that yield no text are dropped
        pages = []
        for hit in hits[:4]:
            url = hit.get('url', '')
            title = hit.get('title', 'No title')

            print(f"📄 Fetching content from: {title}")
            text = fetch_webpage_content(url)
            if not text:
                continue
            pages.append(
                Document(
                    page_content=text,
                    metadata={"source": url, "title": title},
                )
            )

        # Step 3: semantic search over whatever was fetched
        if pages:
            return search_documents_with_vector_store(pages, query)
        return "Could not fetch content from any search results."

    except Exception as e:
        return f"Enhanced web retrieval failed: {str(e)}"
|
289 |
+
|
290 |
+
def get_search_urls(query: str) -> list:
    """Collect up to four distinct English-Wikipedia hits for *query* via DDGS.

    Two site-restricted phrasings are searched (two hits each). Only hits
    whose URL contains "en.wikipedia.org" are kept, without duplicates.

    Returns:
        A list of dicts with the keys 'url', 'title' and 'snippet'. The list
        is empty when nothing matched or the search session raised (the error
        is printed in that case).
    """
    try:
        with DDGS() as ddgs:
            phrasings = (
                f"site:en.wikipedia.org {query}",
                f"{query} site:en.wikipedia.org",
            )

            collected = []
            visited = set()

            for phrasing in phrasings:
                try:
                    hits = list(ddgs.text(phrasing, max_results=2))

                    for hit in hits:
                        link = hit.get('href', '')

                        # Skip non-Wikipedia links and pages already recorded
                        if 'en.wikipedia.org' not in link or link in visited:
                            continue

                        collected.append({
                            'url': link,
                            'title': hit.get('title', 'No title'),
                            'snippet': hit.get('body', 'No content'),
                        })
                        visited.add(link)

                        # Four unique pages are enough
                        if len(collected) >= 4:
                            break

                    if len(collected) >= 4:
                        break

                except Exception:
                    # Best effort: a failing phrasing falls through to the next
                    continue

            return collected

    except Exception as e:
        print(f"Wikipedia search URL retrieval failed: {e}")
        return []
|
334 |
+
|
335 |
+
def fetch_webpage_content(url: str) -> str:
    """Download *url* and return its visible text with whitespace normalized.

    The page is requested with a browser-like User-Agent and a 10 second
    timeout. Script and style elements are removed before text extraction.

    Returns:
        At most the first 20000 characters of the page text, with words
        separated by single spaces. An empty string is returned (and the
        error printed) if the request or the parsing fails.
    """
    try:
        response = requests.get(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            },
            timeout=10,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Script and style bodies are not readable content
        for tag in soup(["script", "style"]):
            tag.decompose()

        raw_text = soup.get_text()

        # Break every line into space-separated pieces and drop the empty ones
        pieces = []
        for line in raw_text.splitlines():
            for piece in line.strip().split(" "):
                piece = piece.strip()
                if piece:
                    pieces.append(piece)

        cleaned = ' '.join(pieces)
        return cleaned[:20000]  # Cap the amount of text handed downstream

    except Exception as e:
        print(f"Failed to fetch content from {url}: {e}")
        return ""
|
365 |
+
|
366 |
+
def search_documents_with_vector_store(documents: list, query: str) -> str:
    """Rank chunks of *documents* against *query* using a FAISS vector store.

    The documents are split into 1000-character chunks with a 200-character
    overlap, embedded with OpenAIEmbeddings, indexed in FAISS, and the five
    chunks most similar to the query are returned.

    Returns:
        "Result N from <title>: ..." sections joined by "---" separators,
        "No content to process after splitting." when splitting produced
        nothing, or a "Vector search failed: ..." message on any exception.
    """
    try:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        chunks = splitter.split_documents(documents)

        if not chunks:
            return "No content to process after splitting."

        # Embed the chunks and index them
        store = FAISS.from_documents(chunks, OpenAIEmbeddings())

        # Retrieve the chunks closest to the original query
        matches = store.similarity_search(query, k=5)

        sections = []
        for rank, match in enumerate(matches, 1):
            source = match.metadata.get('source', 'Unknown source')
            title = match.metadata.get('title', 'No title')
            content = match.page_content[:5000]  # Cap each excerpt at 5000 characters

            sections.append(f"Result {rank} from {title}:\n{content}\nSource: {source}\n")

        return "\n---\n".join(sections)

    except Exception as e:
        return f"Vector search failed: {str(e)}"
|
401 |
+
|
402 |
+
# Tool wrapper that exposes enhanced_web_retrieval_tool_func to the agent
# under the name "enhanced_web_retrieval".
enhanced_web_retrieval_tool = Tool(
    func=enhanced_web_retrieval_tool_func,
    name="enhanced_web_retrieval",
    description="Enhanced Wikipedia-only search with vector retrieval. Fetches full content from English Wikipedia pages and uses semantic search to find relevant information. Use this for factual questions that need detailed Wikipedia content analysis.",
)
|
407 |
+
|
408 |
# List of all tools for easy import
|
409 |
agent_tools = [
|
410 |
web_search_tool,
|
411 |
+
enhanced_web_retrieval_tool,
|
412 |
file_download_tool,
|
413 |
image_analysis_tool,
|
414 |
calculator_tool,
|
utils.py
CHANGED
@@ -41,78 +41,62 @@ def submit_answers(username: str, agent_code: str, answers: List[Dict[str, str]]
|
|
41 |
|
42 |
def format_gaia_answer(raw_answer: str) -> str:
|
43 |
"""Format the agent's raw answer for GAIA submission (exact match)."""
|
44 |
-
|
45 |
-
prefixes_to_remove = [
|
46 |
-
"FINAL ANSWER:",
|
47 |
-
"Final Answer:",
|
48 |
-
"Answer:",
|
49 |
-
"The answer is:",
|
50 |
-
"The final answer is:",
|
51 |
-
]
|
52 |
|
53 |
-
|
|
|
|
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
# Remove trailing punctuation that might not be in ground truth
|
60 |
while answer and answer[-1] in '.!?':
|
61 |
answer = answer[:-1].strip()
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
]),
|
73 |
-
"needs_file_processing": "file" in question_lower or "document" in question_lower,
|
74 |
-
"needs_calculation": any(keyword in question_lower for keyword in [
|
75 |
-
"calculate", "compute", "sum", "total", "average", "percentage", "multiply", "divide"
|
76 |
-
]),
|
77 |
-
"needs_image_analysis": any(keyword in question_lower for keyword in [
|
78 |
-
"image", "picture", "photo", "visual", "shown", "displayed"
|
79 |
-
]),
|
80 |
-
"needs_text_processing": any(keyword in question_lower for keyword in [
|
81 |
-
"extract", "find in", "search for", "list", "count"
|
82 |
-
])
|
83 |
-
}
|
84 |
|
85 |
-
return
|
86 |
|
87 |
def create_execution_plan(question: str, task_id: str = None) -> List[str]:
|
88 |
-
"""Create a
|
89 |
-
analysis = analyze_question_type(question)
|
90 |
plan = []
|
91 |
|
92 |
# Always start with understanding the question
|
93 |
plan.append("Analyze the question to understand what information is needed")
|
94 |
|
95 |
-
# Add file processing if
|
96 |
-
if task_id
|
97 |
-
plan.append(f"
|
98 |
-
|
99 |
-
# Add web search if needed
|
100 |
-
if analysis["needs_web_search"]:
|
101 |
-
plan.append("Search the web for current/recent information")
|
102 |
-
|
103 |
-
# Add image analysis if needed
|
104 |
-
if analysis["needs_image_analysis"]:
|
105 |
-
plan.append("Analyze any images for visual information")
|
106 |
-
|
107 |
-
# Add calculation if needed
|
108 |
-
if analysis["needs_calculation"]:
|
109 |
-
plan.append("Perform necessary calculations")
|
110 |
-
|
111 |
-
# Add text processing if needed
|
112 |
-
if analysis["needs_text_processing"]:
|
113 |
-
plan.append("Process and extract specific information from text")
|
114 |
|
115 |
-
#
|
|
|
116 |
plan.append("Synthesize all information to provide the final answer")
|
117 |
|
118 |
return plan
|
|
|
41 |
|
42 |
def format_gaia_answer(raw_answer: str) -> str:
    """Format the agent's raw answer for GAIA submission (exact match).

    Extraction: the text following the first "FINAL ANSWER:" marker (case
    insensitive, up to the end of that line) is used. Without a marker,
    "the (final) answer is ..." and "Answer:/Result: ..." phrasings are tried,
    and failing those the whole stripped text is used.

    Normalization: trailing '.', '!' and '?' are removed, quotes wrapping the
    entire answer are removed, a leading qualifier (approximately / about /
    roughly / around) is dropped, and thousands separators are removed from
    a plain number.

    Args:
        raw_answer: The model's full response text. None is treated as empty.

    Returns:
        The cleaned answer string (possibly empty).
    """
    import re

    def strip_trailing_punctuation(text: str) -> str:
        # Sentence-ending punctuation is unlikely to be in the ground truth.
        while text and text[-1] in '.!?':
            text = text[:-1].strip()
        return text

    raw_answer = raw_answer or ""

    # Look for FINAL ANSWER: pattern (case insensitive)
    final_answer_pattern = r'FINAL ANSWER:\s*(.+?)(?:\n|$)'
    match = re.search(final_answer_pattern, raw_answer, re.IGNORECASE | re.DOTALL)

    if match:
        answer = match.group(1).strip()
    else:
        # Fallback: try to extract from common patterns
        fallback_patterns = [
            r'(?:The\s+)?(?:final\s+)?answer\s+is:?\s*(.+?)(?:\n|$)',
            r'(?:Answer|Result):\s*(.+?)(?:\n|$)',
        ]

        answer = raw_answer.strip()
        for pattern in fallback_patterns:
            match = re.search(pattern, answer, re.IGNORECASE)
            if match:
                answer = match.group(1).strip()
                break

    # Apply GAIA formatting rules
    answer = strip_trailing_punctuation(answer.strip())

    # Remove quotes if they wrap the entire answer
    if len(answer) >= 2 and answer[0] == answer[-1] and answer[0] in '"\'':
        # Punctuation that was inside the quotes (e.g. "Paris.") is only
        # exposed now, so strip it a second time.
        answer = strip_trailing_punctuation(answer[1:-1].strip())

    # Remove "approximately" or similar qualifiers
    answer = re.sub(r'^(?:approximately|about|roughly|around)\s+', '', answer, flags=re.IGNORECASE)

    # For numbers, ensure no commas (as per GAIA rules). Only well-formed
    # thousands grouping qualifies, so a comma-separated list such as "1,2,3"
    # is left intact instead of being fused into "123".
    if re.match(r'^[-+]?\d{1,3}(?:,\d{3})+(?:\.\d+)?$', answer):
        answer = answer.replace(',', '')

    return answer
|
86 |
|
87 |
def create_execution_plan(question: str, task_id: str = None) -> List[str]:
    """Build a short, generic execution plan and leave tool choice to the model.

    Args:
        question: The question being planned for (not inspected here).
        task_id: Optional task identifier. When truthy, a step for checking
            files attached to that task is included.

    Returns:
        The ordered list of plan steps.
    """
    # The file-handling step only exists when there is a task to look up.
    file_steps = (
        [f"Check for and process any files associated with task {task_id}"]
        if task_id
        else []
    )

    return [
        "Analyze the question to understand what information is needed",
        *file_steps,
        "Use appropriate tools (web search, calculations, etc.) as needed",
        "Synthesize all information to provide the final answer",
    ]
|