Update app.py
Browse files
app.py
CHANGED
@@ -7,9 +7,10 @@ from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel
|
|
7 |
|
8 |
# Constants
|
9 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
10 |
-
MAX_QUESTION_LENGTH = 4000 #
|
|
|
11 |
|
12 |
-
# --- Agent Definition
|
13 |
class SmartGAIAAgent:
|
14 |
def __init__(self):
|
15 |
self.api_key = os.getenv("OPENAI_API_KEY")
|
@@ -17,17 +18,19 @@ class SmartGAIAAgent:
|
|
17 |
raise ValueError("Missing OPENAI_API_KEY")
|
18 |
self.model = OpenAIServerModel(model_id="gpt-4", api_key=self.api_key)
|
19 |
|
20 |
-
# Agent with DuckDuckGo + built-in Python interpreter
|
21 |
self.agent = CodeAgent(
|
22 |
tools=[DuckDuckGoSearchTool()],
|
23 |
model=self.model,
|
24 |
add_base_tools=True
|
25 |
)
|
26 |
|
|
|
|
|
|
|
27 |
def __call__(self, question: str) -> str:
|
28 |
try:
|
29 |
-
|
30 |
-
result = self.agent.run(
|
31 |
return result.strip()
|
32 |
except Exception as e:
|
33 |
print(f"Agent error: {e}")
|
@@ -65,16 +68,20 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
65 |
|
66 |
for item in questions_data:
|
67 |
task_id = item.get("task_id")
|
68 |
-
question_text = item.get("question")
|
69 |
|
70 |
-
# Skip
|
|
|
|
|
|
|
|
|
71 |
if not task_id or not question_text:
|
72 |
continue
|
73 |
if len(question_text) > MAX_QUESTION_LENGTH:
|
74 |
print(f"Skipping long question: {task_id}")
|
75 |
continue
|
76 |
-
if any(keyword in question_text.lower() for keyword in
|
77 |
-
print(f"Skipping
|
78 |
continue
|
79 |
|
80 |
try:
|
@@ -106,29 +113,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
106 |
response.raise_for_status()
|
107 |
result_data = response.json()
|
108 |
final_status = (
|
109 |
-
f"Submission Successful
|
110 |
-
f"User: {result_data.get('username')}
|
111 |
f"Score: {result_data.get('score')}% "
|
112 |
-
f"({result_data.get('correct_count')}/{result_data.get('total_attempted')})
|
113 |
f"Message: {result_data.get('message')}"
|
114 |
)
|
115 |
return final_status, pd.DataFrame(results_log)
|
116 |
except Exception as e:
|
117 |
return f"Submission failed: {e}", pd.DataFrame(results_log)
|
118 |
|
119 |
-
# --- Gradio
|
120 |
with gr.Blocks() as demo:
|
121 |
-
gr.Markdown("# GAIA Agent Evaluation")
|
122 |
gr.Markdown("""
|
123 |
-
**Instructions:**
|
124 |
1. Log in to Hugging Face
|
125 |
-
2. Click 'Run Evaluation
|
126 |
-
3.
|
127 |
""")
|
128 |
gr.LoginButton()
|
129 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
130 |
status_output = gr.Textbox(label="Submission Status", lines=5)
|
131 |
-
results_table = gr.DataFrame(label="Results")
|
132 |
|
133 |
run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
|
134 |
|
|
|
7 |
|
8 |
# Constants
|
9 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
10 |
+
MAX_QUESTION_LENGTH = 4000 # Character-based limit for questions
|
11 |
+
MAX_WEBPAGE_CONTENT = 3000 # Character limit for visited pages (GPT-4 context safe)
|
12 |
|
13 |
+
# --- Agent Definition ---
|
14 |
class SmartGAIAAgent:
|
15 |
def __init__(self):
|
16 |
self.api_key = os.getenv("OPENAI_API_KEY")
|
|
|
18 |
raise ValueError("Missing OPENAI_API_KEY")
|
19 |
self.model = OpenAIServerModel(model_id="gpt-4", api_key=self.api_key)
|
20 |
|
|
|
21 |
self.agent = CodeAgent(
|
22 |
tools=[DuckDuckGoSearchTool()],
|
23 |
model=self.model,
|
24 |
add_base_tools=True
|
25 |
)
|
26 |
|
27 |
+
def truncate_if_needed(self, question: str) -> str:
    """Clip ``question`` to at most ``MAX_QUESTION_LENGTH`` characters.

    Only the leading ``MAX_QUESTION_LENGTH`` characters are kept; text that
    is already within the limit comes back with the same content.
    """
    char_limit = MAX_QUESTION_LENGTH
    clipped = question[:char_limit]
    return clipped
|
29 |
+
|
30 |
def __call__(self, question: str) -> str:
|
31 |
try:
|
32 |
+
clean_question = self.truncate_if_needed(question)
|
33 |
+
result = self.agent.run(clean_question)
|
34 |
return result.strip()
|
35 |
except Exception as e:
|
36 |
print(f"Agent error: {e}")
|
|
|
68 |
|
69 |
for item in questions_data:
|
70 |
task_id = item.get("task_id")
|
71 |
+
question_text = item.get("question", "")
|
72 |
|
73 |
+
# Skip problematic questions
|
74 |
+
skip_keywords = [
|
75 |
+
'attached', '.mp3', '.wav', '.png', '.jpg', '.jpeg',
|
76 |
+
'youtube', '.mp4', 'video', 'listen', 'watch'
|
77 |
+
]
|
78 |
if not task_id or not question_text:
|
79 |
continue
|
80 |
if len(question_text) > MAX_QUESTION_LENGTH:
|
81 |
print(f"Skipping long question: {task_id}")
|
82 |
continue
|
83 |
+
if any(keyword in question_text.lower() for keyword in skip_keywords):
|
84 |
+
print(f"Skipping unsupported question ({task_id}): {question_text[:60]}...")
|
85 |
continue
|
86 |
|
87 |
try:
|
|
|
113 |
response.raise_for_status()
|
114 |
result_data = response.json()
|
115 |
final_status = (
|
116 |
+
f"Submission Successful!\n"
|
117 |
+
f"User: {result_data.get('username')}\n"
|
118 |
f"Score: {result_data.get('score')}% "
|
119 |
+
f"({result_data.get('correct_count')}/{result_data.get('total_attempted')})\n"
|
120 |
f"Message: {result_data.get('message')}"
|
121 |
)
|
122 |
return final_status, pd.DataFrame(results_log)
|
123 |
except Exception as e:
|
124 |
return f"Submission failed: {e}", pd.DataFrame(results_log)
|
125 |
|
126 |
+
# --- Gradio UI ---
|
127 |
with gr.Blocks() as demo:
|
128 |
+
gr.Markdown("# 🧠 GAIA Agent Evaluation")
|
129 |
gr.Markdown("""
|
|
|
130 |
1. Log in to Hugging Face
|
131 |
+
2. Click 'Run Evaluation & Submit All Answers'
|
132 |
+
3. View your score on the leaderboard
|
133 |
""")
|
134 |
gr.LoginButton()
|
135 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
136 |
status_output = gr.Textbox(label="Submission Status", lines=5)
|
137 |
+
results_table = gr.DataFrame(label="Evaluation Results")
|
138 |
|
139 |
run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
|
140 |
|