phucdev committed on
Commit
9c49c2c
·
1 Parent(s): 81917a3

Implement basic agent and tools to solve GAIA questions

Files changed (8)
  1. .env.example +15 -0
  2. .gitignore +174 -0
  3. README.md +1 -1
  4. agent.py +65 -0
  5. app.py +153 -65
  6. prompt.json +5 -0
  7. requirements.txt +24 -2
  8. tools.py +359 -0
.env.example ADDED
@@ -0,0 +1,15 @@
1
+ # example of file for storing private and user specific environment variables, like keys or system paths
2
+ # rename it to ".env" (excluded from version control by default)
3
+
4
+ # example, uncomment and adapt for your needs!
5
+ # MY_VAR="/home/user/my/system/path"
6
+
7
+ # OPENWEATHERMAP_API_KEY=YOUR_OPENWEATHERMAP_API_KEY
8
+ # OPENAI_API_KEY=YOUR_OPENAI_API_KEY
9
+ # LANGFUSE_PUBLIC_KEY=YOUR_LANGFUSE_PUBLIC_KEY
10
+ # LANGFUSE_SECRET_KEY=YOUR_LANGFUSE_SECRET_KEY
11
+ # OTEL_EXPORTER_OTLP_ENDPOINT=https://cloud.langfuse.com/api/public/otel
12
+ # LANGFUSE_HOST=https://cloud.langfuse.com
13
+ # HUGGINGFACE_API_KEY=YOUR_HUGGINGFACE_API_KEY
14
+ # ANTHROPIC_API_KEY=YOUR_ANTHROPIC_API_KEY
15
+ # GROQ_API_KEY=YOUR_GROQ_API_KEY
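
These variables are read at runtime via python-dotenv (the same pattern agent.py and tools.py use below). A minimal sketch of how a local run would pick them up, assuming a populated `.env` next to the code:

```python
import os

from dotenv import find_dotenv, load_dotenv

# Load key/value pairs from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# Each tool only needs the keys it actually uses, e.g. get_weather_info reads
# OPENWEATHERMAP_API_KEY, while the chat model needs GROQ_API_KEY.
print("GROQ_API_KEY configured:", bool(os.getenv("GROQ_API_KEY")))
```
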
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
README.md CHANGED
@@ -12,4 +12,4 @@ hf_oauth: true
12
  hf_oauth_expiration_minutes: 480
13
  ---
14
 
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
12
  hf_oauth_expiration_minutes: 480
13
  ---
14
 
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
agent.py ADDED
@@ -0,0 +1,65 @@
1
+ from typing import Annotated, Optional, TypedDict
2
+
3
+ from dotenv import find_dotenv, load_dotenv
4
+ from langchain.chat_models import init_chat_model
5
+ from langchain_core.messages import AnyMessage, HumanMessage
6
+ from langgraph.graph.message import add_messages
7
+ from langgraph.prebuilt import create_react_agent
8
+
9
+ from tools import (add, ask_about_image, divide, get_current_time_and_date,
10
+ get_sum, get_weather_info, get_youtube_transcript,
11
+ get_youtube_video_info, inspect_file_as_text, multiply,
12
+ reverse_text, subtract, visit_website, web_search,
13
+ wiki_search)
14
+
15
+
16
+ class AgentState(TypedDict):
17
+ input_file: Optional[str] # Contains file path
18
+ messages: Annotated[list[AnyMessage], add_messages]
19
+
20
+
21
+ class BasicAgent:
22
+ def __init__(self):
23
+ load_dotenv(find_dotenv())
24
+ model = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
25
+ system_prompt = (
26
+ "You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer "
27
+ "with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR "
28
+ "as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a "
29
+ "number, don't use comma to write your number neither use units such as $ or percent sign unless specified "
30
+ "otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), "
31
+ "and write the digits in plain text unless specified otherwise. If you are asked for a comma separated "
32
+ "list, apply the above rules depending of whether the element to be put in the list is a number or a string."
33
+ "Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find "
34
+ "the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be "
35
+ "tolerated, success will be rewarded. Run verification steps if that's needed, you must make sure you find "
36
+ "the correct answer! "
37
+ )
38
+ tools = [
39
+ get_weather_info,
40
+ add,
41
+ get_sum,
42
+ subtract,
43
+ multiply,
44
+ divide,
45
+ get_current_time_and_date,
46
+ wiki_search,
47
+ web_search,
48
+ visit_website,
49
+ inspect_file_as_text,
50
+ ask_about_image,
51
+ reverse_text,
52
+ get_youtube_video_info,
53
+ get_youtube_transcript,
54
+ ]
55
+
56
+ self.agent = create_react_agent(model=model, tools=tools, prompt=system_prompt)
57
+ print("BasicAgent initialized.")
58
+
59
+ def __call__(self, question: str) -> str:
60
+ print(f"Agent received question (first 50 chars): {question[:50]}...")
61
+ messages = [HumanMessage(content=question)]
62
+ response = self.agent.invoke({"messages": messages})
63
+ response_string = response["messages"][-1].content
64
+ print(f"Agent's response: {response_string}")
65
+ return response_string
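
A minimal local smoke test for the agent, assuming `GROQ_API_KEY` is set in `.env`; the question text is illustrative only:

```python
from agent import BasicAgent

agent = BasicAgent()
# The system prompt instructs the model to end with "FINAL ANSWER: ...",
# so the raw string still needs the post-processing done in app.py.
raw_answer = agent("What is 17 multiplied by 23?")
print(raw_answer)
```
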
app.py CHANGED
@@ -1,34 +1,89 @@
 
 
1
  import os
 
 
 
 
2
  import gradio as gr
3
- import requests
4
- import inspect
5
  import pandas as pd
 
 
 
6
 
7
  # (Keep Constants as is)
8
  # --- Constants ---
9
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
 
11
- # --- Basic Agent Definition ---
12
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
13
- class BasicAgent:
14
- def __init__(self):
15
- print("BasicAgent initialized.")
16
- def __call__(self, question: str) -> str:
17
- print(f"Agent received question (first 50 chars): {question[:50]}...")
18
- fixed_answer = "This is a default answer."
19
- print(f"Agent returning fixed answer: {fixed_answer}")
20
- return fixed_answer
21
-
22
- def run_and_submit_all( profile: gr.OAuthProfile | None):
23
- """
24
- Fetches all questions, runs the BasicAgent on them, submits all answers,
25
- and displays the results.
26
- """
27
  # --- Determine HF Space Runtime URL and Repo URL ---
28
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
 
29
 
30
  if profile:
31
- username= f"{profile.username}"
32
  print(f"User logged in: {username}")
33
  else:
34
  print("User not logged in.")
@@ -38,62 +93,89 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
38
  questions_url = f"{api_url}/questions"
39
  submit_url = f"{api_url}/submit"
40
 
41
- # 1. Instantiate Agent ( modify this part to create your agent)
42
- try:
43
- agent = BasicAgent()
44
- except Exception as e:
45
- print(f"Error instantiating agent: {e}")
46
- return f"Error initializing agent: {e}", None
47
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
48
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
49
  print(agent_code)
50
 
51
- # 2. Fetch Questions
52
  print(f"Fetching questions from: {questions_url}")
53
  try:
54
  response = requests.get(questions_url, timeout=15)
55
  response.raise_for_status()
56
  questions_data = response.json()
57
  if not questions_data:
58
- print("Fetched questions list is empty.")
59
- return "Fetched questions list is empty or invalid format.", None
60
  print(f"Fetched {len(questions_data)} questions.")
61
  except requests.exceptions.RequestException as e:
62
  print(f"Error fetching questions: {e}")
63
  return f"Error fetching questions: {e}", None
64
  except requests.exceptions.JSONDecodeError as e:
65
- print(f"Error decoding JSON response from questions endpoint: {e}")
66
- print(f"Response text: {response.text[:500]}")
67
- return f"Error decoding server response for questions: {e}", None
68
  except Exception as e:
69
  print(f"An unexpected error occurred fetching questions: {e}")
70
  return f"An unexpected error occurred fetching questions: {e}", None
71
 
72
- # 3. Run your Agent
73
  results_log = []
74
  answers_payload = []
75
- print(f"Running agent on {len(questions_data)} questions...")
76
  for item in questions_data:
77
- task_id = item.get("task_id")
78
- question_text = item.get("question")
79
- if not task_id or question_text is None:
80
- print(f"Skipping item with missing task_id or question: {item}")
81
- continue
82
- try:
83
- submitted_answer = agent(question_text)
84
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
85
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
86
- except Exception as e:
87
- print(f"Error running agent on task {task_id}: {e}")
88
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
89
 
90
  if not answers_payload:
91
  print("Agent did not produce any answers to submit.")
92
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
 
 
93
 
94
- # 4. Prepare Submission
95
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
96
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
 
 
 
 
 
 
 
97
  print(status_update)
98
 
99
  # 5. Submit
@@ -106,7 +188,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
106
  f"Submission Successful!\n"
107
  f"User: {result_data.get('username')}\n"
108
  f"Overall Score: {result_data.get('score', 'N/A')}% "
109
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
 
110
  f"Message: {result_data.get('message', 'No message received.')}"
111
  )
112
  print("Submission successful.")
@@ -154,7 +237,7 @@ with gr.Blocks() as demo:
154
  ---
155
  **Disclaimers:**
156
  Once you click the "Submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
157
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
158
  """
159
  )
160
 
@@ -162,20 +245,19 @@ with gr.Blocks() as demo:
162
 
163
  run_button = gr.Button("Run Evaluation & Submit All Answers")
164
 
165
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
 
166
  # Removed max_rows=10 from DataFrame constructor
167
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
168
 
169
- run_button.click(
170
- fn=run_and_submit_all,
171
- outputs=[status_output, results_table]
172
- )
173
 
174
  if __name__ == "__main__":
175
- print("\n" + "-"*30 + " App Starting " + "-"*30)
176
  # Check for SPACE_HOST and SPACE_ID at startup for information
177
  space_host_startup = os.getenv("SPACE_HOST")
178
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
179
 
180
  if space_host_startup:
181
  print(f"✅ SPACE_HOST found: {space_host_startup}")
@@ -183,14 +265,20 @@ if __name__ == "__main__":
183
  else:
184
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
185
 
186
- if space_id_startup: # Print repo URLs if SPACE_ID is found
187
  print(f"✅ SPACE_ID found: {space_id_startup}")
188
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
189
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
 
 
 
190
  else:
191
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
 
 
 
192
 
193
- print("-"*(60 + len(" App Starting ")) + "\n")
194
 
195
  print("Launching Gradio Interface for Basic Agent Evaluation...")
196
- demo.launch(debug=True, share=False)
 
1
+ # import inspect
2
+ import json
3
  import os
4
+ from pathlib import Path
5
+ from typing import Dict
6
+ from zipfile import ZipFile
7
+
8
  import gradio as gr
 
 
9
  import pandas as pd
10
+ import requests
11
+
12
+ from agent import BasicAgent
13
 
14
  # (Keep Constants as is)
15
  # --- Constants ---
16
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
17
 
18
+ with open("prompt.json", mode="r") as f:
19
+ prompt_template = json.load(f)
20
+
21
+
22
+ def post_process_answer(answer: str) -> str:
23
+ """Post-process the answer to extract the final answer."""
24
+ if "FINAL ANSWER:" in answer:
25
+ answer = answer.split("FINAL ANSWER:")[-1].strip()
26
+ return answer
27
+
28
+
29
+ def solve_question(question: Dict[str, str]) -> Dict[str, str]:
30
+ """Solve the question using the BasicAgent."""
31
+ agent = BasicAgent()
32
+ question_text = question.get("question")
33
+ task_id = question.get("task_id")
34
+ if not question_text:
35
+ raise ValueError("Question text is empty.")
36
+ augmented_question = prompt_template["user_prompt"] + question_text
37
+ if question.get("file_name"):
38
+ file_url = DEFAULT_API_URL + "/files"
39
+ response = requests.get(f"{file_url}/{question['file_name']}", timeout=15)
40
+ file_path = Path("files") / question["file_name"]
41
+ # Create files directory if it doesn't exist
42
+ file_path.parent.mkdir(parents=True, exist_ok=True)
43
+ with open(file_path, "wb") as f:
44
+ f.write(response.content)
45
+ if file_path.suffix == "zip":
46
+ # If the file is a zip, we need to extract the files and give the LLM the list of files
47
+ file_paths = []
48
+ with ZipFile(file_path, "r") as zip_ref:
49
+ for file_info in zip_ref.infolist():
50
+ # Read file content
51
+ file_data = zip_ref.read(file_info.filename)
52
+ extracted_file_path = file_path / file_info.filename
53
+ with open(extracted_file_path, "wb") as extracted_file:
54
+ extracted_file.write(file_data)
55
+ file_paths.append(str(extracted_file_path))
56
+ augmented_question += prompt_template["use_files_prompt"] + str(file_paths)
57
+ else:
58
+ augmented_question += prompt_template["use_file_prompt"] + str(file_path)
59
+ try:
60
+ agent_response = agent(augmented_question)
61
+ submitted_answer = post_process_answer(agent_response)
62
+ return {
63
+ "Task ID": task_id,
64
+ "Question": augmented_question,
65
+ "Submitted Answer": submitted_answer,
66
+ "Full Answer": agent_response,
67
+ }
68
+ except Exception as e:
69
+ print(f"Error running agent on task {task_id}: {e}")
70
+ return {
71
+ "Task ID": task_id,
72
+ "Question": augmented_question,
73
+ "Submitted Answer": f"AGENT ERROR: {e}",
74
+ "Full Answer": "",
75
+ }
76
+
77
+
78
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
79
+ """Fetches all questions, runs the BasicAgent on them, submits all answers, and displays the
80
+ results."""
81
  # --- Determine HF Space Runtime URL and Repo URL ---
82
+ # Get the SPACE_ID for sending link to the code
83
+ space_id = os.getenv("SPACE_ID")
84
 
85
  if profile:
86
+ username = f"{profile.username}"
87
  print(f"User logged in: {username}")
88
  else:
89
  print("User not logged in.")
 
93
  questions_url = f"{api_url}/questions"
94
  submit_url = f"{api_url}/submit"
95
 
96
+ # In the case of an app running as a Hugging Face Space, this link points
97
+ # toward your codebase (useful for others, so please keep it public)
 
 
 
 
 
98
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
99
  print(agent_code)
100
 
101
+ # Fetch Questions
102
  print(f"Fetching questions from: {questions_url}")
103
  try:
104
  response = requests.get(questions_url, timeout=15)
105
  response.raise_for_status()
106
  questions_data = response.json()
107
  if not questions_data:
108
+ print("Fetched questions list is empty.")
109
+ return "Fetched questions list is empty or invalid format.", None
110
  print(f"Fetched {len(questions_data)} questions.")
111
  except requests.exceptions.RequestException as e:
112
  print(f"Error fetching questions: {e}")
113
  return f"Error fetching questions: {e}", None
114
  except requests.exceptions.JSONDecodeError as e:
115
+ print(f"Error decoding JSON response from questions endpoint: {e}")
116
+ print(f"Response text: {response.text[:500]}")
117
+ return f"Error decoding server response for questions: {e}", None
118
  except Exception as e:
119
  print(f"An unexpected error occurred fetching questions: {e}")
120
  return f"An unexpected error occurred fetching questions: {e}", None
121
 
122
+ # Run your Agent
123
  results_log = []
124
  answers_payload = []
125
+
126
+ results_file_path = Path("files/results_log.jsonl")
127
+ results_file_path.parent.mkdir(parents=True, exist_ok=True)
128
+ solved_task_ids = []
129
+ if results_file_path.exists():
130
+ print(f"Results file already exists: {results_file_path}")
131
+ with open(results_file_path, "r") as results_file:
132
+ for line in results_file:
133
+ result = json.loads(line)
134
+ results_log.append(result)
135
+ solved_task_ids.append(result["Task ID"])
136
+ filtered_questions_data = [
137
+ question
138
+ for question in questions_data
139
+ if question["task_id"] not in solved_task_ids
140
+ ]
141
+ if solved_task_ids:
142
+ print(
143
+ f"Found {len(solved_task_ids)} solved questions. "
144
+ f"Running agent on remaining {len(filtered_questions_data)} questions."
145
+ )
146
+ else:
147
+ print(f"Running agent on {len(questions_data)} questions...")
148
  for item in questions_data:
149
+ result = solve_question(item)
150
+ results_log.append(result)
151
+ with open(results_file_path, "w") as results_file:
152
+ for result in results_log:
153
+ results_file.write(json.dumps(result) + "\n")
154
+ for result in results_log:
155
+ answers_payload.append(
156
+ {
157
+ "task_id": result["Task ID"],
158
+ "submitted_answer": result["Submitted Answer"],
159
+ }
160
+ )
161
 
162
  if not answers_payload:
163
  print("Agent did not produce any answers to submit.")
164
+ return (
165
+ "Agent did not produce any answers to submit.",
166
+ pd.DataFrame(results_log),
167
+ )
168
 
169
+ # 4. Prepare Submission
170
+ submission_data = {
171
+ "username": username.strip(),
172
+ "agent_code": agent_code,
173
+ "answers": answers_payload,
174
+ }
175
+ status_update = (
176
+ f"Agent finished. Submitting {len(answers_payload)} "
177
+ f"answers for user '{username}'..."
178
+ )
179
  print(status_update)
180
 
181
  # 5. Submit
 
188
  f"Submission Successful!\n"
189
  f"User: {result_data.get('username')}\n"
190
  f"Overall Score: {result_data.get('score', 'N/A')}% "
191
+ f"({result_data.get('correct_count', '?')}/"
192
+ f"{result_data.get('total_attempted', '?')} correct)\n"
193
  f"Message: {result_data.get('message', 'No message received.')}"
194
  )
195
  print("Submission successful.")
 
237
  ---
238
  **Disclaimers:**
239
  Once you click the "Submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
240
+ This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a separate action or even to answer the questions in async.
241
  """
242
  )
243
 
 
245
 
246
  run_button = gr.Button("Run Evaluation & Submit All Answers")
247
 
248
+ status_output = gr.Textbox(
249
+ label="Run Status / Submission Result", lines=5, interactive=False
250
+ )
251
  # Removed max_rows=10 from DataFrame constructor
252
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
253
 
254
+ run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 
 
255
 
256
  if __name__ == "__main__":
257
+ print("\n" + "-" * 30 + " App Starting " + "-" * 30)
258
  # Check for SPACE_HOST and SPACE_ID at startup for information
259
  space_host_startup = os.getenv("SPACE_HOST")
260
+ space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
261
 
262
  if space_host_startup:
263
  print(f"✅ SPACE_HOST found: {space_host_startup}")
 
265
  else:
266
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
267
 
268
+ if space_id_startup: # Print repo URLs if SPACE_ID is found
269
  print(f"✅ SPACE_ID found: {space_id_startup}")
270
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
271
+ print(
272
+ f" Repo Tree URL: https://huggingface.co/spaces/"
273
+ f"{space_id_startup}/tree/main"
274
+ )
275
  else:
276
+ print(
277
+ "ℹ️ SPACE_ID environment variable not found (running locally?). "
278
+ "Repo URL cannot be determined."
279
+ )
280
 
281
+ print("-" * (60 + len(" App Starting ")) + "\n")
282
 
283
  print("Launching Gradio Interface for Basic Agent Evaluation...")
284
+ demo.launch(debug=True, share=False)
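
The submitted answer is the text after the last "FINAL ANSWER:" marker. A standalone sketch of that extraction (mirroring post_process_answer above, with an illustrative response string):

```python
def post_process_answer(answer: str) -> str:
    """Keep only the text after the final "FINAL ANSWER:" marker."""
    if "FINAL ANSWER:" in answer:
        answer = answer.split("FINAL ANSWER:")[-1].strip()
    return answer


raw = "I summed the numeric column of the spreadsheet. FINAL ANSWER: 42"
assert post_process_answer(raw) == "42"
```
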
prompt.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "user_prompt": "Here is the task:\n",
3
+ "use_files_prompt": "\n\nTo solve the task above, you will have to use the appropriate tools to extract the relevant information from these attached files:\n",
4
+ "use_file_prompt": "\n\nTo solve the task above, you will have to use the appropriate tools to extract the relevant information from the attached file:\n"
5
+ }
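
These fragments are concatenated around the question text in solve_question; a short sketch of the resulting prompt, with an illustrative question and file path:

```python
import json
from pathlib import Path

with open("prompt.json") as f:
    prompt_template = json.load(f)

question_text = "What is the total of the 'sales' column?"  # illustrative
file_path = Path("files") / "example.xlsx"                  # illustrative

augmented_question = prompt_template["user_prompt"] + question_text
augmented_question += prompt_template["use_file_prompt"] + str(file_path)
print(augmented_question)
```
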
requirements.txt CHANGED
@@ -1,2 +1,24 @@
1
- gradio
2
- requests
1
+ beautifulsoup4==4.13.4
2
+ datasets==3.5.1
3
+ duckduckgo-search==8.0.1
4
+ gradio==5.29.0
5
+ huggingface-hub==0.30.2
6
+ langchain==0.3.25
7
+ langchain-community==0.3.23
8
+ langchain-core==0.3.58
9
+ langchain_groq==0.3.2
10
+ langchain-huggingface==0.1.2
11
+ langchain-openai==0.3.16
12
+ langgraph==0.4.1
13
+ numpy==2.2.5
14
+ openai-whisper==20240930
15
+ openpyxl==3.1.5
16
+ pandas==2.2.3
17
+ pyrootutils~=1.0.4
18
+ python-dotenv~=1.1.0
19
+ requests==2.32.3
20
+ tabulate==0.9.0
21
+ unstructured[pdf,docx,pptx]==0.17.2
22
+ wikipedia~=1.4.0
23
+ youtube-transcript-api==1.0.3
24
+ yt-dlp==2025.4.30
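
A quick import check after `pip install -r requirements.txt`; note that openai-whisper additionally expects the ffmpeg binary on the system PATH, a system dependency not captured here:

```python
# Verifies the core libraries resolve; these are import names, not PyPI names.
import gradio      # noqa: F401
import langchain   # noqa: F401
import langgraph   # noqa: F401
import whisper     # noqa: F401
import wikipedia   # noqa: F401
import yt_dlp      # noqa: F401

print("Core dependencies import cleanly.")
```
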
tools.py ADDED
@@ -0,0 +1,359 @@
1
+ import base64
2
+ import os
3
+ from datetime import datetime
4
+
5
+ import pandas as pd
6
+ import requests
7
+ import whisper
8
+ import wikipedia
9
+ from dotenv import find_dotenv, load_dotenv
10
+ from langchain.chat_models import init_chat_model
11
+ from langchain_community.document_loaders import (
12
+ UnstructuredPDFLoader, UnstructuredPowerPointLoader,
13
+ UnstructuredWordDocumentLoader, WebBaseLoader)
14
+ from langchain_community.tools import DuckDuckGoSearchRun
15
+ from langchain_core.prompts import ChatPromptTemplate
16
+ from langchain_core.tools import tool
17
+ from youtube_transcript_api import YouTubeTranscriptApi
18
+ from yt_dlp import YoutubeDL
19
+
20
+
21
+ @tool
22
+ def get_weather_info(location: str) -> str:
23
+ """Fetches dummy weather information for a given location.
24
+
25
+ Usage:
26
+ ```
27
+ # Initialize the tool
28
+ weather_info_tool = Tool(
29
+ name="get_weather_info",
30
+ func=get_weather_info,
31
+ description="Fetches weather information for a given location.")
32
+ ```
33
+ """
34
+ load_dotenv(find_dotenv())
35
+ api_key = os.getenv("OPENWEATHERMAP_API_KEY")
36
+ url = (
37
+ f"https://api.openweathermap.org/data/2.5/"
38
+ f"weather?q={location}&appid={api_key}&units=metric"
39
+ )
40
+
41
+ res = requests.get(url, timeout=15)
42
+ data = res.json()
43
+ humidity = data["main"]["humidity"]
44
+ pressure = data["main"]["pressure"]
45
+ wind = data["wind"]["speed"]
46
+ description = data["weather"][0]["description"]
47
+ temp = data["main"]["temp"]
48
+ min_temp = data["main"]["temp_min"]
49
+ max_temp = data["main"]["temp_max"]
50
+ return (
51
+ f"Weather in {location}: {description}, "
52
+ f"Temperature: {temp}°C, Min: {min_temp}°C, Max: {max_temp}°C, "
53
+ f"Humidity: {humidity}%, Pressure: {pressure} hPa, "
54
+ f"Wind Speed: {wind} m/s"
55
+ )
56
+
57
+
58
+ @tool
59
+ def add(a: int, b: int) -> int:
60
+ """Adds two numbers together.
61
+
62
+ Args:
63
+ a (int): The first number.
64
+ b (int): The second number.
65
+ """
66
+ return a + b
67
+
68
+
69
+ @tool
70
+ def get_sum(list_of_numbers: list[int]) -> int:
71
+ """Sums a list of numbers.
72
+
73
+ Args:
74
+ list_of_numbers (list[int]): The list of numbers to sum.
75
+ """
76
+ return sum(list_of_numbers)
77
+
78
+
79
+ @tool
80
+ def subtract(a: int, b: int) -> int:
81
+ """Subtracts the second number from the first.
82
+
83
+ Args:
84
+ a (int): The first number.
85
+ b (int): The second number.
86
+ """
87
+ return a - b
88
+
89
+
90
+ @tool
91
+ def multiply(a: int, b: int) -> int:
92
+ """Multiplies two numbers together.
93
+
94
+ Args:
95
+ a (int): The first number.
96
+ b (int): The second number.
97
+ """
98
+ return a * b
99
+
100
+
101
+ @tool
102
+ def divide(a: int, b: int) -> float:
103
+ """Divides the first number by the second.
104
+
105
+ Args:
106
+ a (int): The first number.
107
+ b (int): The second number.
108
+ """
109
+ if b == 0:
110
+ raise ValueError("Cannot divide by zero.")
111
+ return a / b
112
+
113
+
114
+ @tool
115
+ def get_current_time_and_date() -> str:
116
+ """Returns the current time and date in ISO format."""
117
+ return datetime.now().isoformat()
118
+
119
+
120
+ @tool
121
+ def reverse_text(text: str) -> str:
122
+ """Reverses the given text.
123
+
124
+ Args:
125
+ text (str): The text to reverse.
126
+ """
127
+ return text[::-1]
128
+
129
+
130
+ @tool
131
+ def wiki_search(query: str) -> str:
132
+ """Searches Wikipedia for a given query and returns the summary.
133
+
134
+ Args:
135
+ query (str): The search query.
136
+ """
137
+ search_results = wikipedia.search(query)
138
+ if not search_results:
139
+ return "No results found."
140
+ page_title = search_results[0]
141
+ summary = wikipedia.summary(page_title)
142
+ # Alternatively wikipedia.page(page_title).content[:max_length]
143
+ return f"Title: {page_title}\n\nSummary: {summary}"
144
+
145
+
146
+ @tool
147
+ def web_search(query: str) -> str:
148
+ """Searches the web for a given query and returns the first result.
149
+
150
+ Args:
151
+ query (str): The search query.
152
+ """
153
+ search_tool = DuckDuckGoSearchRun()
154
+ results = search_tool.invoke(query)
155
+ if results:
156
+ return results
157
+ else:
158
+ return "No results found."
159
+
160
+
161
+ @tool
162
+ def visit_website(url: str) -> str:
163
+ """Visits a website and returns the content.
164
+
165
+ Args:
166
+ url (str): The URL of the website to visit.
167
+ """
168
+ loader = WebBaseLoader(url)
169
+ documents = loader.load()
170
+ if documents:
171
+ return documents[0].page_content
172
+ else:
173
+ return "No content found."
174
+
175
+
176
+ @tool
177
+ def get_youtube_transcript(video_url: str, return_timestamps: bool = False) -> str:
178
+ """Fetches the transcript of a YouTube video.
179
+
180
+ Args:
181
+ video_url (str): The URL of the YouTube video.
182
+ return_timestamps (bool): If True, returns timestamps with the transcript. Otherwise, returns only the text.
183
+ """
184
+ try:
185
+ video_id = video_url.split("v=")[-1]
186
+ transcript = YouTubeTranscriptApi.get_transcript(video_id)
187
+ if return_timestamps:
188
+ sentences = []
189
+ for t in transcript:
190
+ start = t["start"]
191
+ end = start + t["duration"]
192
+ sentences.append(f"{start:.2f} - {end:.2f}: {t['text']}")
193
+ return "\n".join(sentences)
194
+ else:
195
+ return "\n".join([t["text"] for t in transcript])
196
+ except Exception as e:
197
+ return f"Error fetching transcript: {e}"
198
+
199
+
200
+ @tool
201
+ def get_youtube_video_info(video_url: str) -> str:
202
+ """Fetches information about a YouTube video.
203
+
204
+ Args:
205
+ video_url (str): The URL of the YouTube video.
206
+ """
207
+ try:
208
+ ydl_opts = {
209
+ "quiet": True,
210
+ "skip_download": True,
211
+ }
212
+ with YoutubeDL(ydl_opts) as ydl:
213
+ info = ydl.extract_info(video_url, download=False)
214
+ video_info = {
215
+ "Title": info.get("title"),
216
+ "Description": info.get("description"),
217
+ "Uploader": info.get("uploader"),
218
+ "Upload date": info.get("upload_date"),
219
+ "Duration": info.get("duration"),
220
+ "View count": info.get("view_count"),
221
+ "Like count": info.get("like_count"),
222
+ }
223
+ video_info_filtered = {k: v for k, v in video_info.items() if v is not None}
224
+ video_info_str = "\n".join(
225
+ [f"{k}: {v}" for k, v in video_info_filtered.items()]
226
+ )
227
+ return video_info_str
228
+ except Exception as e:
229
+ return f"Error fetching video info: {e}"
230
+
231
+
232
+ def encode_image(image_path):
233
+ with open(image_path, "rb") as image_file:
234
+ return base64.b64encode(image_file.read()).decode("utf-8")
235
+
236
+
237
+ @tool
238
+ def ask_about_image(image_path: str, question: str) -> str:
239
+ """Performs vision-based question answering on an image.
240
+
241
+ Args:
242
+ image_path (str): The path to the image file.
243
+ question (str): Your question about the image, as a natural language sentence. Provide as much context as possible.
244
+ """
245
+ load_dotenv(find_dotenv())
246
+ llm = init_chat_model("groq:meta-llama/llama-4-scout-17b-16e-instruct")
247
+ prompt = ChatPromptTemplate(
248
+ [
249
+ {
250
+ "role": "user",
251
+ "content": [
252
+ {
253
+ "type": "text",
254
+ "text": "Please write a concise caption for the image that helps answer the following question: {question}",
255
+ },
256
+ {
257
+ "type": "image_url",
258
+ "image_url": {
259
+ "url": "data:image/jpeg;base64,{base64_image}",
260
+ },
261
+ },
262
+ ],
263
+ }
264
+ ]
265
+ )
266
+ chain = prompt | llm
267
+ response = chain.invoke(
268
+ {"question": question, "base64_image": encode_image(image_path)}
269
+ )
270
+ return response.text()
271
+
272
+
273
+ def transcribe_audio(audio_path: str) -> str:
274
+ """Transcribes audio to text.
275
+
276
+ Args:
277
+ audio_path (str): The path to the audio file.
278
+ """
279
+ model = whisper.load_model("base")
280
+ result = model.transcribe(audio_path)
281
+ text = result["text"]
282
+ return text
283
+
284
+
285
+ def get_table_description(table: pd.DataFrame) -> str:
286
+ """Generates a description of the table. If applicable, calculates sum and mean of numeric
287
+ columns.
288
+
289
+ Args:
290
+ table (pd.DataFrame): The table to describe.
291
+ """
292
+ if table.empty:
293
+ return "The table is empty."
294
+ description = []
295
+ total_sum = 0
296
+ for column in table.select_dtypes(include=[int, float]).columns:
297
+ column_sum = table[column].sum()
298
+ column_mean = table[column].mean()
299
+ description.append(
300
+ f"Column '{column}': Sum = {column_sum}, Mean = {column_mean:.2f}"
301
+ )
302
+ total_sum += column_sum
303
+ if total_sum:
304
+ description.append(f"Total Sum of all numeric columns: {total_sum}")
305
+ if description:
306
+ description = "\n".join(description)
307
+ else:
308
+ description = "No numeric columns to summarize."
309
+ # Add the number of rows and columns
310
+ description += f"\n\nTable has {table.shape[0]} rows and {table.shape[1]} columns."
311
+ df_as_markdown = table.to_markdown()
312
+ description += f"\n\nTable:\n{df_as_markdown}"
313
+ return description
314
+
315
+
316
+ @tool
317
+ def inspect_file_as_text(file_path: str) -> str:
318
+ """This tool reads a file as markdown text. It handles [".csv", ".xlsx", ".pptx", ".wav",
319
+ ".mp3", ".m4a", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT
320
+ HANDLE IMAGES.
321
+
322
+ Args:
323
+ file_path (str): The path to the file you want to read as text. If it is an image, use the `ask_about_image` tool.
324
+ """
325
+ try:
326
+ suffix = os.path.splitext(file_path)[-1]
327
+ if suffix in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"]:
328
+ raise Exception(
329
+ "Cannot use inspect_file_as_text tool with images: use `vision_qa` tool instead!"
330
+ )
331
+ if suffix in [".csv", ".tsv", ".xlsx"]:
332
+ if suffix == ".csv":
333
+ df = pd.read_csv(file_path)
334
+ elif suffix == ".tsv":
335
+ df = pd.read_csv(file_path, sep="\t")
336
+ elif suffix == ".xlsx":
337
+ df = pd.read_excel(file_path)
338
+ else:
339
+ raise Exception(f"Unsupported file type: {suffix}")
340
+ table_description = get_table_description(df)
341
+ return table_description
342
+ elif suffix == ".pptx":
343
+ doc = UnstructuredPowerPointLoader(file_path)
344
+ return doc.load()[0].page_content
345
+ elif suffix == ".pdf":
346
+ doc = UnstructuredPDFLoader(file_path)
347
+ return doc.load()[0].page_content
348
+ elif suffix == ".docx":
349
+ doc = UnstructuredWordDocumentLoader(file_path)
350
+ return doc.load()[0].page_content
351
+ elif suffix in [".wav", ".mp3", ".m4a", ".flac"]:
352
+ return transcribe_audio(file_path)
353
+ else:
354
+ # All other text files
355
+ with open(file_path, "r", encoding="utf-8") as file:
356
+ content = file.read()
357
+ return content
358
+ except Exception as e:
359
+ return f"Error file: {e}"