hassenhamdi committed on
Commit
4ca33ca
·
verified ·
1 Parent(s): a959e70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -286
app.py CHANGED
@@ -1,326 +1,149 @@
1
  import os
2
- import gradio as gr
3
- import requests
4
  import pandas as pd
5
- from smolagents import OpenAIServerModel
6
- from smolagents import CodeAgent, Tool, tool
7
- from smolagents import DuckDuckGoSearchTool, VisitWebpageTool
8
- from smolagents import PythonInterpreterTool
9
  import time
10
- from requests.exceptions import HTTPError
11
-
12
- # --- Constants ---
13
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
-
15
- # --- Tool Definitions ---
16
-
17
class GaiaFileTool(Tool):
    """
    A smolagents.Tool subclass for downloading files from the GAIA API.

    The tool issues a GET request to ``{api_base_url}/files/{task_id}`` and
    returns the response body decoded as text. Network failures are reported
    to the agent as an error string instead of being raised.
    """
    name = "download_gaia_file"
    description = "Downloads a file associated with a given GAIA task ID and returns its content. It takes 'task_id' as input and returns the file content as a string. Use this when a question refers to an external file."
    inputs = {"task_id": {"type": "string", "description": "The task ID for which to download the file (e.g., '2345')."}}
    output_type = "string"

    def __init__(self, api_base_url=DEFAULT_API_URL, timeout=30):
        """
        Args:
            api_base_url: Base URL of the GAIA scoring API.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block the agent forever.
        """
        super().__init__()
        self.api_base_url = api_base_url
        self.timeout = timeout
        print(f"GaiaFileTool initialized with API base URL: {self.api_base_url}")

    def forward(self, task_id: str) -> str:
        """
        Download the file attached to a GAIA task.

        Args:
            task_id: Identifier of the task whose file should be fetched.

        Returns:
            The response body as text, or a string starting with
            "Error downloading file:" when the request fails (including a
            timeout or a non-2xx status).
        """
        file_url = f"{self.api_base_url}/files/{task_id}"
        print(f"Attempting to download file from: {file_url}")
        try:
            response = requests.get(file_url, timeout=self.timeout)
            # Turn 4xx/5xx responses into exceptions handled below.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error downloading file for task_id {task_id}: {e}")
            return f"Error downloading file: {e}"
        print(f"Successfully downloaded file for task_id {task_id}")
        return response.text
46
 
47
- # --- Custom GAIA Agent Definition ---
48
class GaiaAgent(CodeAgent):
    """
    A smolagents-based agent designed to tackle GAIA Level 1 benchmark questions.
    It uses Gemini Flash for reasoning and integrates a Python Interpreter, a
    GAIA file download tool, and web browsing/searching tools.
    """

    def __init__(self):
        """Build the Gemini-backed model and register the agent's tools."""
        print("GaiaAgent initializing...")
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            print("WARNING: GEMINI_API_KEY environment variable not set.")
            print("Please set GEMINI_API_KEY for Gemini Flash to work.")

        # Gemini is reached through its OpenAI-compatible endpoint.
        self.llm_model = OpenAIServerModel(
            model_id="gemini-2.0-flash",
            api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
            api_key=gemini_api_key,
            temperature=0.1,
        )

        # Tools available to the agent: code execution, GAIA file download,
        # web search and web page reading.
        agent_tools = [
            PythonInterpreterTool(),
            GaiaFileTool(),
            DuckDuckGoSearchTool(),
            VisitWebpageTool(),
        ]
        # verbosity_level=2 enables DEBUG-style agent logs.
        super().__init__(model=self.llm_model, tools=agent_tools, verbosity_level=2)
        print("GaiaAgent initialized successfully with Gemini Flash and built-in tools.")

    def __call__(self, question: str) -> str:
        """
        Process a question and return the agent's answer as a string.

        The agent run is retried up to five times with exponential backoff
        (30 s, 60 s, ...). HTTP 429 responses and generic exceptions are
        retried; any other HTTPError is re-raised to the caller.

        Args:
            question: The GAIA question text.

        Returns:
            The cleaned answer, or a fixed failure message when every attempt
            failed.
        """
        print(f"\n--- Agent received question (first 100 chars): {question[:100]}...")

        prompt = (
            "You are an AI agent designed to solve GAIA benchmark questions. "
            "Your goal is to provide the exact answer as a string, without any additional text, "
            "explanation, or the phrase 'FINAL ANSWER:'. "
            "Break down the problem, use the available tools (python_interpreter, download_gaia_file, "
            "duckduckgo_search_tool, visit_webpage_tool) as needed, and think step-by-step. "
            "When using web search or webpage visit tools, be highly efficient. "
            "Formulate comprehensive search queries to get as much relevant information as possible in one go. "
            "Only visit a webpage if absolutely necessary and when you expect it to contain the direct answer or crucial data. "
            "Avoid redundant searches or visiting multiple pages for the same piece of information. "
            "Use 'python_interpreter' for any calculations or code execution. "
            "Use 'duckduckgo_search_tool' to find information on the web. "
            "Use 'visit_webpage_tool' to read the content of a specific URL. "
            "When you have the final answer, output ONLY the answer string.\n\n"
            f"Question: {question}"
        )

        print(f"Agent running with prompt (first 200 chars): {prompt[:200]}...")

        max_retries = 5
        retry_delay = 30
        result = None

        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            try:
                result = self.run(prompt)
                print(f"Agent raw output from self.run():\n{result}")
                break  # Success: stop retrying.
            except HTTPError as e:
                # e.response can be None when the error was raised without a
                # response object; treat that like any non-429 failure.
                status_code = getattr(e.response, "status_code", None)
                if status_code != 429:
                    raise
                error_details = self._describe_quota_error(e.response)
                retry_hint = (
                    "No retries left. "
                    if is_last_attempt
                    else f"Retrying in {retry_delay} seconds... "
                )
                print(
                    f"Gemini API Rate limit hit (429) on attempt {attempt + 1}/{max_retries}. "
                    f"{error_details}"
                    f"{retry_hint}"
                    f"This could be due to the 15 RPM or 200 RPD free tier limits. "
                    f"If this persists, your daily quota might be exhausted."
                )
                if is_last_attempt:
                    # Sleeping after the final attempt would only delay the
                    # failure message by several minutes.
                    break
                time.sleep(retry_delay)
                retry_delay *= 2
            except Exception as e:
                import traceback
                print(f"--- Error during agent execution on attempt {attempt + 1}/{max_retries}: {e}")
                traceback.print_exc()
                if is_last_attempt:
                    return "Agent encountered an error and could not provide an answer after multiple retries."
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2

        if result is None:
            return "Agent failed after multiple retries due to an unknown error or persistent rate limits."

        final_answer = self._extract_exact_answer(result)
        print(f"--- Agent returning final answer (first 100 chars): {final_answer[:100]}...")
        return final_answer

    @staticmethod
    def _describe_quota_error(response) -> str:
        """Return a short description of the QuotaFailure detail in a 429 response, if any."""
        try:
            error_json = response.json()
            if 'error' in error_json and 'details' in error_json['error']:
                for detail in error_json['error']['details']:
                    if detail.get('@type') == 'type.googleapis.com/google.rpc.QuotaFailure':
                        quota_metric = detail.get('quotaMetric', 'N/A')
                        quota_id = detail.get('quotaId', 'N/A')
                        quota_value = detail.get('quotaValue', 'N/A')
                        return f"Quota Metric: {quota_metric}, Quota ID: {quota_id}, Value: {quota_value}. "
        except Exception as parse_error:
            print(f"Could not parse detailed error from 429 response: {parse_error}")
            return "Check Google Cloud Console for details. "
        return ""

    def _extract_exact_answer(self, raw_output: str) -> str:
        """
        Extracts and formats the exact answer from the agent's raw output.
        Ensures no "FINAL ANSWER" text is included and handles any
        extraneous formatting. This function is crucial for GAIA's exact match scoring.

        Args:
            raw_output: Whatever the agent run returned. Non-string values
                (for example a number) are converted with ``str()`` first.

        Returns:
            The last meaningful line of the output with answer prefixes, code
            fences and one pair of surrounding quotes removed.
        """
        # The agent may hand back a non-string final answer; normalise it so
        # the string operations below cannot fail.
        text = raw_output if isinstance(raw_output, str) else str(raw_output)
        print(f"Attempting to extract exact answer from raw output (first 200 chars):\n{text[:200]}...")

        cleaned_output = text.replace("FINAL ANSWER:", "").strip()
        cleaned_output = cleaned_output.replace("Answer:", "").strip()
        cleaned_output = cleaned_output.replace("The answer is:", "").strip()
        cleaned_output = cleaned_output.replace("```python", "").replace("```", "").strip()

        lines = cleaned_output.split('\n')
        potential_answer = lines[-1].strip()
        # Fall back to an earlier line only when the last one is empty or is
        # tool-call residue. Short last lines such as "42" or "Yes" are valid
        # answers and must not be replaced.
        if not potential_answer or "tool_code" in potential_answer.lower():
            for line in reversed(lines[:-1]):
                if line.strip() and "tool_code" not in line.lower():
                    potential_answer = line.strip()
                    break
        cleaned_output = potential_answer

        # Strip one pair of matching quotes; the length check keeps a lone
        # quote character from being treated as a pair.
        if len(cleaned_output) >= 2 and cleaned_output.startswith('"') and cleaned_output.endswith('"'):
            cleaned_output = cleaned_output[1:-1]
        if len(cleaned_output) >= 2 and cleaned_output.startswith("'") and cleaned_output.endswith("'"):
            cleaned_output = cleaned_output[1:-1]

        print(f"Extracted and cleaned answer: {cleaned_output[:100]}...")
        return cleaned_output.strip()
203
-
204
-
205
- # --- Gradio Application Logic ---
206
-
207
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the GaiaAgent on them, submits all answers,
    and displays the results.

    Args:
        profile: The logged-in Hugging Face profile, or None when the user
            has not logged in.

    Returns:
        A ``(status_message, results)`` tuple. ``results`` is a DataFrame of
        task id / question / agent answer, or None when the run stopped
        before any question was processed.
    """
    space_id = os.getenv("SPACE_ID")

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    # Bound every HTTP call so a stalled server cannot hang the UI callback.
    request_timeout = 60

    try:
        agent = GaiaAgent()
    except Exception as e:
        print(f"Error during agent initialization in run_and_submit_all: {e}")
        import traceback
        traceback.print_exc()
        return f"Error initializing agent: {e}", None

    try:
        print(f"Fetching questions from: {questions_url}")
        questions_response = requests.get(questions_url, timeout=request_timeout)
        questions_response.raise_for_status()
        questions = questions_response.json()
        print(f"Fetched {len(questions)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None

    all_answers = []
    results_data = []

    for i, q_data in enumerate(questions):
        task_id = q_data.get("task_id", f"unknown_{i}")
        question_text = q_data.get("question", "No question text found.")
        print(f"\n--- Processing Task ID: {task_id} ---")
        print(f"Question: {question_text[:100]}...")

        try:
            agent_answer = agent(question_text)
        except Exception as e:
            # One failing question must not discard the answers already
            # collected; record the failure and keep going.
            import traceback
            print(f"Error running agent on task {task_id}: {e}")
            traceback.print_exc()
            agent_answer = f"AGENT ERROR: {e}"

        all_answers.append({"task_id": task_id, "answer": agent_answer})
        results_data.append({
            "Task ID": task_id,
            "Question": question_text,
            "Agent Answer": agent_answer
        })
        print(f"--- Finished processing Task ID: {task_id} ---")

    try:
        print(f"\nSubmitting {len(all_answers)} answers to: {submit_url}")
        submission_payload = {
            "username": username,
            "code_link": f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local_execution",
            "answers": all_answers
        }
        submit_response = requests.post(submit_url, json=submission_payload, timeout=request_timeout)
        submit_response.raise_for_status()
        submission_result = submit_response.json()
        print(f"Submission successful: {submission_result}")
        status_message = f"Submission successful!\nScore: {submission_result.get('score', 'N/A')}\nDetails: {submission_result.get('message', 'No message')}"
    except requests.exceptions.RequestException as e:
        print(f"Error submitting answers: {e}")
        status_message = f"Error submitting answers: {e}"

    results_df = pd.DataFrame(results_data)
    return status_message, results_df
279
 
280
- # --- Gradio UI ---
281
# Gradio layout: intro text, login button, one action button, and the two
# output widgets that the button's callback fills in.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # GAIA Level 1 Agent Evaluation
        This application allows you to run your `smolagents`-based agent on the GAIA Level 1 benchmark
        and submit your answers to the leaderboard.

        **Important:**
        1. **Login to Hugging Face** using the button below to submit your score.
        2. **Set `GEMINI_API_KEY`**: Ensure your `GEMINI_API_KEY` is set as a Space Secret
        in Hugging Face Spaces (or as an environment variable if running locally)
        for the Gemini Flash model to function.
        """
    )

    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    # Outputs are listed in the order run_and_submit_all returns them:
    # status text first, results table second.
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No explicit inputs are wired; run_and_submit_all takes only a
    # gr.OAuthProfile | None parameter.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )
307
 
308
if __name__ == "__main__":
    # Read the Space environment up front; both values are informational only.
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    print("\n" + "-"*30 + " App Starting " + "-"*30)

    # Report the runtime host, if the Space provides one.
    if not space_host_startup:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
    else:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f" Runtime URL should be: https://{space_host_startup}.hf.space")

    # Report the repository links derived from the Space id.
    if not space_id_startup:
        print("ℹ️ SPACE_ID environment variable not found. Code link might be incorrect for submission.")
    else:
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")

    demo.launch()
 
1
  import os
 
 
2
  import pandas as pd
3
+ import gradio as gr
4
+ import logging
 
 
5
  import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Import the new Settings, Evaluator, and Runner classes
9
+ from settings import Settings
10
+ from evaluator import Evaluator
11
+ from runner import Runner
12
 
13
# Configure logging
# force=True removes any handlers already attached to the root logger so this
# configuration takes effect even if logging was configured earlier.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Initialize settings, evaluator, and runner
# These module-level singletons are shared by every UI callback below.
# NOTE(review): Settings() presumably reads its configuration from the
# environment at import time - confirm against settings.py.
settings = Settings()
evaluator = Evaluator(settings)
runner = Runner(settings)

# Status text returned by the callbacks when no user is logged in.
LOGIN_MESSAGE = "Please Login to Hugging Face with the button."
# Empty placeholder table returned alongside login/error messages.
EMPTY_RESULTS_TABLE = pd.DataFrame(columns=['task_id', 'question', 'answer'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ def _format_elapsed_time(elapsed_time):
28
+ """Formats elapsed time into minutes and seconds."""
29
+ minutes = int(elapsed_time // 60)
30
+ seconds = elapsed_time % 60
31
+ if minutes > 0:
32
+ return f"Elapsed time: {minutes} minutes {seconds:.2f} seconds"
33
+ else:
34
+ return f"Elapsed time: {seconds:.2f} seconds"
35
+
36
def _run_agent_on_questions(questions_list: list, username: str) -> tuple[str, pd.DataFrame]:
    """
    Run the agent over the given questions and time the whole run.

    Args:
        questions_list: Questions to hand to ``runner.run_agent``.
        username: Name of the logged-in user, passed through to the runner.

    Returns:
        A ``(message, results)`` tuple: a completion message that includes
        the elapsed time, and whatever ``runner.run_agent`` returned.
    """
    started_at = time.time()
    logger.info(f"Starting agent run for user: {username} on {len(questions_list)} questions.")

    # Execution is delegated entirely to the runner.
    results = runner.run_agent(questions_list, username)

    duration_text = _format_elapsed_time(time.time() - started_at)
    message = f"Agent run complete. {duration_text}"
    logger.info(message)
    return message, results
51
+
52
def run_one(profile: gr.OAuthProfile | None) -> tuple[str, pd.DataFrame]:
    """Run the agent on the single question returned by ``evaluator.get_one_question()``.

    Args:
        profile: The logged-in Hugging Face profile, or None when the user
            has not logged in.

    Returns:
        A ``(status_message, results_table)`` tuple. When the user is not
        logged in or an error occurs, the table is ``EMPTY_RESULTS_TABLE``.
    """
    if not profile:
        return LOGIN_MESSAGE, EMPTY_RESULTS_TABLE
    try:
        question = evaluator.get_one_question()
        return _run_agent_on_questions([question], profile.username)
    except Exception as e:
        # This handler also covers failures raised while the agent runs, so
        # log the full traceback to tell the two failure sources apart.
        logger.exception(f"Error getting one question: {e}")
        return f"Error getting question: {e}", EMPTY_RESULTS_TABLE
 
 
 
 
 
63
 
64
def run_all(profile: gr.OAuthProfile | None) -> tuple[str, pd.DataFrame]:
    """Run the agent on every question returned by ``evaluator.get_questions()``.

    Args:
        profile: The logged-in Hugging Face profile, or None when the user
            has not logged in.

    Returns:
        A ``(status_message, results_table)`` tuple. When the user is not
        logged in or an error occurs, the table is ``EMPTY_RESULTS_TABLE``.
    """
    if not profile:
        return LOGIN_MESSAGE, EMPTY_RESULTS_TABLE
    try:
        questions = evaluator.get_questions()
        return _run_agent_on_questions(questions, profile.username)
    except Exception as e:
        # This handler also covers failures raised while the agent runs, so
        # log the full traceback to tell the two failure sources apart.
        logger.exception(f"Error getting all questions: {e}")
        return f"Error getting questions: {e}", EMPTY_RESULTS_TABLE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
def submit(profile: gr.OAuthProfile | None) -> str:
    """Submit the answers held by the evaluator for the logged-in user.

    Returns the result of ``evaluator.submit_answers`` for the user's
    username, or the login prompt when no profile is present.
    """
    if not profile:
        return LOGIN_MESSAGE
    return evaluator.submit_answers(profile.username)
 
 
 
 
 
 
 
 
 
82
 
 
 
83
 
84
+ # --- Build Gradio Interface using Blocks ---
85
# Gradio layout: title, instructions, login button, three action buttons and
# the two output widgets the callbacks write to.
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Agent Evaluation Runner")
    # NOTE(review): the disclaimer below mentions DEBUG-level logs, while this
    # module configures logging at INFO - confirm where the agent's own
    # verbosity is set.
    gr.Markdown(
        """
        **Instructions:**

        1. Log in to your Hugging Face account using the button below.
        2. Click 'Get One Answer' to run the agent on a random question or 'Get All Answers' to run all.
        3. Click 'Submit Answers' to submit answers for evaluation. **Your HF username will be submitted for leaderboard tracking.**

        ---
        **Disclaimers:**
        * Running 'Get All Answers' can take significant time as the agent processes all 20 questions.
        * Agent logs are detailed (DEBUG level) and may appear interleaved due to parallel execution.
        * The 'Submit Answers' button uses the most recent agent answers cached locally for your username.
        * **API Keys Required:** Ensure `GEMINI_API_KEY` is set as a Space Secret (or environment variable if running locally).
        """
    )

    gr.LoginButton()

    run_one_button = gr.Button("Get One Answer")
    run_all_button = gr.Button("Get All Answers")
    submit_button = gr.Button("Submit Answers")

    status_output = gr.Textbox(
        label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(
        label="Questions and Agent Answers", wrap=True)

    # run_one / run_all return (status, table); submit returns only a status
    # string, so it is wired to the textbox alone.
    run_one_button.click(
        fn=run_one, outputs=[status_output, results_table]
    )
    run_all_button.click(
        fn=run_all, outputs=[status_output, results_table]
    )
    submit_button.click(
        fn=submit, outputs=[status_output]
    )
124
 
125
if __name__ == "__main__":
    # Read the Space environment up front; both values are informational only.
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    logger.info("\n" + "-"*30 + " App Starting " + "-"*30)

    # Report the runtime host, if the Space provides one.
    if not space_host_startup:
        logger.info("ℹ️ SPACE_HOST environment variable not found (running locally?).")
    else:
        logger.info(f"✅ SPACE_HOST found: {space_host_startup}")
        logger.info(f" Runtime URL should be: https://{space_host_startup}.hf.space")

    # Report the repository links derived from the Space id.
    if not space_id_startup:
        logger.info("ℹ️ SPACE_ID environment variable not found. Repo URL cannot be determined.")
    else:
        logger.info(f"✅ SPACE_ID found: {space_id_startup}")
        logger.info(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        logger.info(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")

    # Closing rule, as wide as the opening banner.
    logger.info("-"*(60 + len(" App Starting ")) + "\n")

    logger.info("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=True, share=False)
149