Shamik commited on
Commit
2e53410
·
verified ·
1 Parent(s): b827254

feat: adding an answer file and changing the app file to directly read the answers from the pickle file.

Browse files
Files changed (6) hide show
  1. app.py +65 -55
  2. playground.ipynb +613 -108
  3. pyproject.toml +2 -0
  4. results_gpt_mini.pkl +3 -0
  5. universal_agent.py +36 -45
  6. uv.lock +4 -0
app.py CHANGED
@@ -15,14 +15,14 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
16
  # --- Basic Agent Definition ---
17
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
18
- class BasicAgent:
19
- def __init__(self):
20
- logger.info("BasicAgent initialized.")
21
- def __call__(self, question: str) -> str:
22
- logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
23
- fixed_answer = "This is a default answer."
24
- logger.info(f"Agent returning fixed answer: {fixed_answer}")
25
- return fixed_answer
26
 
27
  def run_and_submit_all( profile: gr.OAuthProfile | None):
28
  """
@@ -43,61 +43,71 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
43
  questions_url = f"{api_url}/questions"
44
  submit_url = f"{api_url}/submit"
45
 
46
- # 1. Instantiate Agent ( modify this part to create your agent)
47
- try:
48
- agent = BasicAgent()
49
- except Exception as e:
50
- logger.info(f"Error instantiating agent: {e}")
51
- return f"Error initializing agent: {e}", None
52
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
53
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
54
  logger.info(agent_code)
55
 
56
- # 2. Fetch Questions
57
- logger.info(f"Fetching questions from: {questions_url}")
58
- try:
59
- response = requests.get(questions_url, timeout=15)
60
- response.raise_for_status()
61
- questions_data = response.json()
62
- if not questions_data:
63
- logger.info("Fetched questions list is empty.")
64
- return "Fetched questions list is empty or invalid format.", None
65
- logger.info(f"Fetched {len(questions_data)} questions.")
66
- except requests.exceptions.RequestException as e:
67
- logger.info(f"Error fetching questions: {e}")
68
- return f"Error fetching questions: {e}", None
69
- except requests.exceptions.JSONDecodeError as e:
70
- logger.info(f"Error decoding JSON response from questions endpoint: {e}")
71
- logger.info(f"Response text: {response.text[:500]}")
72
- return f"Error decoding server response for questions: {e}", None
73
- except Exception as e:
74
- logger.info(f"An unexpected error occurred fetching questions: {e}")
75
- return f"An unexpected error occurred fetching questions: {e}", None
76
 
77
  # 3. Run your Agent
78
- results_log = []
79
- answers_payload = []
80
- logger.info(f"Running agent on {len(questions_data)} questions...")
81
- for item in questions_data:
82
- task_id = item.get("task_id")
83
- question_text = item.get("question")
84
- if not task_id or question_text is None:
85
- logger.info(f"Skipping item with missing task_id or question: {item}")
86
- continue
87
- try:
88
- submitted_answer = agent(question_text)
89
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
90
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
91
- except Exception as e:
92
- logger.info(f"Error running agent on task {task_id}: {e}")
93
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
94
-
95
- if not answers_payload:
96
- logger.info("Agent did not produce any answers to submit.")
97
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
98
 
99
  # 4. Prepare Submission
100
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
 
101
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
102
  logger.info(status_update)
103
 
 
15
 
16
  # --- Basic Agent Definition ---
17
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
18
+ # class BasicAgent:
19
+ # def __init__(self):
20
+ # logger.info("BasicAgent initialized.")
21
+ # def __call__(self, question: str) -> str:
22
+ # logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
23
+ # fixed_answer = "This is a default answer."
24
+ # logger.info(f"Agent returning fixed answer: {fixed_answer}")
25
+ # return fixed_answer
26
 
27
  def run_and_submit_all( profile: gr.OAuthProfile | None):
28
  """
 
43
  questions_url = f"{api_url}/questions"
44
  submit_url = f"{api_url}/submit"
45
 
46
+ # # 1. Instantiate Agent ( modify this part to create your agent)
47
+ # try:
48
+ # agent = BasicAgent()
49
+ # except Exception as e:
50
+ # logger.info(f"Error instantiating agent: {e}")
51
+ # return f"Error initializing agent: {e}", None
52
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
53
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
54
  logger.info(agent_code)
55
 
56
+ # # 2. Fetch Questions
57
+ # logger.info(f"Fetching questions from: {questions_url}")
58
+ # try:
59
+ # response = requests.get(questions_url, timeout=15)
60
+ # response.raise_for_status()
61
+ # questions_data = response.json()
62
+ # if not questions_data:
63
+ # logger.info("Fetched questions list is empty.")
64
+ # return "Fetched questions list is empty or invalid format.", None
65
+ # logger.info(f"Fetched {len(questions_data)} questions.")
66
+ # except requests.exceptions.RequestException as e:
67
+ # logger.info(f"Error fetching questions: {e}")
68
+ # return f"Error fetching questions: {e}", None
69
+ # except requests.exceptions.JSONDecodeError as e:
70
+ # logger.info(f"Error decoding JSON response from questions endpoint: {e}")
71
+ # logger.info(f"Response text: {response.text[:500]}")
72
+ # return f"Error decoding server response for questions: {e}", None
73
+ # except Exception as e:
74
+ # logger.info(f"An unexpected error occurred fetching questions: {e}")
75
+ # return f"An unexpected error occurred fetching questions: {e}", None
76
 
77
  # 3. Run your Agent
78
+ # results_log = []
79
+ # answers_payload = []
80
+ # logger.info(f"Running agent on {len(questions_data)} questions...")
81
+ # for item in questions_data:
82
+ # task_id = item.get("task_id")
83
+ # question_text = item.get("question")
84
+ # if not task_id or question_text is None:
85
+ # logger.info(f"Skipping item with missing task_id or question: {item}")
86
+ # continue
87
+ # try:
88
+ # submitted_answer = agent(question_text)
89
+ # answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
90
+ # results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
91
+ # except Exception as e:
92
+ # logger.info(f"Error running agent on task {task_id}: {e}")
93
+ # results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
94
+
95
+ # if not answers_payload:
96
+ # logger.info("Agent did not produce any answers to submit.")
97
+ # return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
98
+ with open("all_questions.pkl", "rb") as f:
99
+ all_questions = pickle.load(f)
100
+
101
+ with open("results_gpt_mini.pkl", "rb") as f:
102
+ results = pickle.load(f)
103
+ answers = [{"task_id":j['task_id'],
104
+ "submitted_answer": results[i]["structured_response"].answer
105
+ if isinstance(results[i], dict) else "No answer"}
106
+ for i,j in enumerate(all_questions)]
107
 
108
  # 4. Prepare Submission
109
+ submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers}
110
+ # submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
111
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
112
  logger.info(status_update)
113
 
playground.ipynb CHANGED
@@ -1,5 +1,15 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 73,
@@ -57,15 +67,15 @@
57
  },
58
  {
59
  "cell_type": "code",
60
- "execution_count": 3,
61
  "id": "7a960624-355b-4934-a741-fb880ad3ca37",
62
  "metadata": {
63
  "execution": {
64
- "iopub.execute_input": "2025-06-01T14:26:03.687178Z",
65
- "iopub.status.busy": "2025-06-01T14:26:03.686339Z",
66
- "iopub.status.idle": "2025-06-01T14:26:03.693527Z",
67
- "shell.execute_reply": "2025-06-01T14:26:03.692065Z",
68
- "shell.execute_reply.started": "2025-06-01T14:26:03.687135Z"
69
  }
70
  },
71
  "outputs": [],
@@ -984,7 +994,9 @@
984
  {
985
  "cell_type": "markdown",
986
  "id": "c3b5f100-893f-405a-920f-a32461ac7277",
987
- "metadata": {},
 
 
988
  "source": [
989
  "# Async Task"
990
  ]
@@ -1424,15 +1436,24 @@
1424
  },
1425
  {
1426
  "cell_type": "code",
1427
- "execution_count": 1,
1428
- "id": "b7380e2b",
1429
- "metadata": {},
 
 
 
 
 
 
 
 
1430
  "outputs": [],
1431
  "source": [
1432
  "from smolagents import (\n",
1433
  " DuckDuckGoSearchTool,\n",
1434
  " VisitWebpageTool,\n",
1435
  " LocalPythonExecutor,\n",
 
1436
  " WikipediaSearchTool,\n",
1437
  " Tool\n",
1438
  ")\n",
@@ -1452,14 +1473,24 @@
1452
  "from dotenv import load_dotenv, find_dotenv\n",
1453
  "import pickle\n",
1454
  "import asyncio\n",
1455
- "import nest_asyncio"
 
 
1456
  ]
1457
  },
1458
  {
1459
  "cell_type": "code",
1460
  "execution_count": 2,
1461
  "id": "147b9149",
1462
- "metadata": {},
 
 
 
 
 
 
 
 
1463
  "outputs": [],
1464
  "source": [
1465
  "_ = load_dotenv(find_dotenv(raise_error_if_not_found=True), override=True)\n",
@@ -1470,20 +1501,37 @@
1470
  },
1471
  {
1472
  "cell_type": "code",
1473
- "execution_count": 3,
1474
  "id": "a36446a3",
1475
- "metadata": {},
 
 
 
 
 
 
 
 
1476
  "outputs": [],
1477
  "source": [
1478
- "lang_model = init_chat_model(model=\"gpt-4.1-nano\", model_provider=\"openai\",temperature=0)\n",
1479
- "# lang_model = init_chat_model(model=\"gpt-4.1-mini\", model_provider=\"openai\",temperature=0)"
 
1480
  ]
1481
  },
1482
  {
1483
  "cell_type": "code",
1484
  "execution_count": 4,
1485
  "id": "cf80baaf",
1486
- "metadata": {},
 
 
 
 
 
 
 
 
1487
  "outputs": [],
1488
  "source": [
1489
  "def search_wikipedia(query: str) -> str:\n",
@@ -1498,7 +1546,15 @@
1498
  "cell_type": "code",
1499
  "execution_count": 5,
1500
  "id": "daf73b23",
1501
- "metadata": {},
 
 
 
 
 
 
 
 
1502
  "outputs": [],
1503
  "source": [
1504
  "def visit_web_page(url:str) -> str:\n",
@@ -1512,26 +1568,43 @@
1512
  "cell_type": "code",
1513
  "execution_count": 6,
1514
  "id": "6f37359b",
1515
- "metadata": {},
 
 
 
 
 
 
 
 
1516
  "outputs": [],
1517
  "source": [
1518
  "def read_excel_or_csv(filepath: str) -> str:\n",
1519
  " \"\"\"Reads an excel or csv file and returns the content as str.\"\"\"\n",
1520
  " if Path(filepath).suffix in {\".xlsx\", \".xls\"}:\n",
1521
  " df = pl.read_excel(source=filepath)\n",
1522
- " df = pl.read_csv(source=filepath)\n",
 
1523
  " content_str = df.to_dict(as_series=False).__str__()\n",
1524
  " return content_str\n"
1525
  ]
1526
  },
1527
  {
1528
  "cell_type": "code",
1529
- "execution_count": 60,
1530
  "id": "c52bd1a4",
1531
- "metadata": {},
 
 
 
 
 
 
 
 
1532
  "outputs": [],
1533
  "source": [
1534
- "def python_executor(filepath: str) -> Any:\n",
1535
  " \"\"\"Returns the output of a python code.\"\"\"\n",
1536
  " with open(filepath, \"r\") as f:\n",
1537
  " code = f.readlines()\n",
@@ -1552,11 +1625,27 @@
1552
  " return code_result.generations[0][0].text"
1553
  ]
1554
  },
 
 
 
 
 
 
 
 
1555
  {
1556
  "cell_type": "code",
1557
  "execution_count": 8,
1558
  "id": "2c835c33",
1559
- "metadata": {},
 
 
 
 
 
 
 
 
1560
  "outputs": [
1561
  {
1562
  "name": "stdout",
@@ -1583,7 +1672,15 @@
1583
  "cell_type": "code",
1584
  "execution_count": 9,
1585
  "id": "01193586",
1586
- "metadata": {},
 
 
 
 
 
 
 
 
1587
  "outputs": [],
1588
  "source": [
1589
  "def call_stt_tool(file_url:str) -> str:\n",
@@ -1596,7 +1693,15 @@
1596
  "cell_type": "code",
1597
  "execution_count": 10,
1598
  "id": "f10e6859",
1599
- "metadata": {},
 
 
 
 
 
 
 
 
1600
  "outputs": [],
1601
  "source": [
1602
  "\n",
@@ -1637,7 +1742,15 @@
1637
  "cell_type": "code",
1638
  "execution_count": 11,
1639
  "id": "b1634bb8",
1640
- "metadata": {},
 
 
 
 
 
 
 
 
1641
  "outputs": [],
1642
  "source": [
1643
  "# image_tool(file_url=\"cca530fc-4052-43b2-b130-b30968d8aa44.png\")"
@@ -1647,12 +1760,20 @@
1647
  "cell_type": "code",
1648
  "execution_count": 12,
1649
  "id": "08f1534d",
1650
- "metadata": {},
 
 
 
 
 
 
 
 
1651
  "outputs": [],
1652
  "source": [
1653
- "def youtube_video_tool(url:str) -> str:\n",
1654
  " \"\"\"Answers questions about youtube videos.\n",
1655
- " URLs must be provided to this tool.\"\"\"\n",
1656
  " yt_vid_mapping = {\"https://www.youtube.com/watch?v=L1vXCYZAYYM\": \"penguin.mp4\",\n",
1657
  " \"https://www.youtube.com/watch?v=1htKBjuUWec\": \"coffee.mp4\"}\n",
1658
  " video = cv2.VideoCapture(filename=yt_vid_mapping[url])\n",
@@ -1674,7 +1795,7 @@
1674
  " {\n",
1675
  " \"type\": \"text\",\n",
1676
  " \"text\": (\n",
1677
- " \"\"\"Examine the video.\\n\\nWhat does Teal'c say in response to the question \\\"Isn't that hot?\\\"\"\"\"\n",
1678
  " ),\n",
1679
  " },\n",
1680
  " *[\n",
@@ -1696,7 +1817,15 @@
1696
  "cell_type": "code",
1697
  "execution_count": 13,
1698
  "id": "599530e3",
1699
- "metadata": {},
 
 
 
 
 
 
 
 
1700
  "outputs": [],
1701
  "source": [
1702
  "def web_search_tool(query: str) -> str:\n",
@@ -1706,19 +1835,52 @@
1706
  " return search_res"
1707
  ]
1708
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1709
  {
1710
  "cell_type": "code",
1711
  "execution_count": null,
1712
- "id": "64482acf",
1713
- "metadata": {},
 
 
1714
  "outputs": [],
1715
  "source": []
1716
  },
1717
  {
1718
  "cell_type": "code",
1719
- "execution_count": 14,
1720
  "id": "a24e765b",
1721
- "metadata": {},
 
 
 
 
 
 
 
 
1722
  "outputs": [],
1723
  "source": [
1724
  "class AnswerFormat(BaseModel):\n",
@@ -1743,9 +1905,17 @@
1743
  },
1744
  {
1745
  "cell_type": "code",
1746
- "execution_count": 15,
1747
  "id": "d726f6cd",
1748
- "metadata": {},
 
 
 
 
 
 
 
 
1749
  "outputs": [],
1750
  "source": [
1751
  "# SYS_PROMPT_SWEBENCH = \"\"\"\n",
@@ -1891,9 +2061,17 @@
1891
  },
1892
  {
1893
  "cell_type": "code",
1894
- "execution_count": 16,
1895
  "id": "40300268",
1896
- "metadata": {},
 
 
 
 
 
 
 
 
1897
  "outputs": [],
1898
  "source": [
1899
  "# https://cookbook.openai.com/examples/gpt4-1_prompting_guide\n",
@@ -1902,12 +2080,15 @@
1902
  "\n",
1903
  "\n",
1904
  "# Instructions\n",
1905
- "- Carefully read and understand the task. \n",
1906
  "- Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance \"discograpy\".\n",
1907
  "- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n",
1908
  "- You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.\n",
1909
  "- Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.\n",
1910
- "- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.\n",
 
 
 
1911
  "- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.\n",
1912
  "- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.\n",
1913
  "- Always verify your answers.\n",
@@ -1950,9 +2131,17 @@
1950
  },
1951
  {
1952
  "cell_type": "code",
1953
- "execution_count": 17,
1954
  "id": "db96eef6",
1955
- "metadata": {},
 
 
 
 
 
 
 
 
1956
  "outputs": [],
1957
  "source": [
1958
  "agent = create_react_agent(\n",
@@ -1962,6 +2151,7 @@
1962
  " visit_web_page,\n",
1963
  " read_excel_or_csv,\n",
1964
  " python_executor,\n",
 
1965
  " call_stt_tool,\n",
1966
  " image_tool,\n",
1967
  " youtube_video_tool,\n",
@@ -1974,9 +2164,17 @@
1974
  },
1975
  {
1976
  "cell_type": "code",
1977
- "execution_count": 18,
1978
  "id": "33240d19",
1979
- "metadata": {},
 
 
 
 
 
 
 
 
1980
  "outputs": [],
1981
  "source": [
1982
  "# recursion_limit = 10\n",
@@ -1985,9 +2183,17 @@
1985
  },
1986
  {
1987
  "cell_type": "code",
1988
- "execution_count": 19,
1989
  "id": "850bb54d",
1990
- "metadata": {},
 
 
 
 
 
 
 
 
1991
  "outputs": [
1992
  {
1993
  "data": {
@@ -1998,7 +2204,7 @@
1998
  " 'file_name': ''}"
1999
  ]
2000
  },
2001
- "execution_count": 19,
2002
  "metadata": {},
2003
  "output_type": "execute_result"
2004
  }
@@ -2009,59 +2215,322 @@
2009
  },
2010
  {
2011
  "cell_type": "code",
2012
- "execution_count": 20,
2013
  "id": "f9025fae",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2014
  "metadata": {},
2015
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2016
  "source": [
2017
- "# try:\n",
2018
- "# answer = agent.invoke(\n",
2019
- "# # answer = agent_w_recursion_limit.invoke(\n",
2020
- "# input={\n",
2021
- "# \"messages\": f\"\"\"Complete the following task: {all_questions[0][\"question\"]}. Relevant file: {\n",
2022
- "# all_questions[0][\"file_name\"]\n",
2023
- "# if all_questions[0][\"file_name\"]\n",
2024
- "# else \"There's no relevant file to use.\"\n",
2025
- "# }\"\"\"\n",
2026
- "# }\n",
2027
- "# )\n",
2028
- "# except GraphRecursionError:\n",
2029
- "# print(\"❌ Agent stopped due to max iterations.\")"
2030
  ]
2031
  },
2032
  {
2033
  "cell_type": "code",
2034
- "execution_count": 70,
2035
- "id": "a525780a",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2036
  "metadata": {},
2037
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2038
  "source": [
2039
- "async def run_all_questions(agent, all_questions):\n",
2040
- " \"\"\"Run agent.ainvoke concurrently for all questions.\n",
2041
- "\n",
2042
- " Args:\n",
2043
- " agent: The LangChain agent with an ainvoke method.\n",
2044
- " all_questions (list): List of question dicts.\n",
2045
- "\n",
2046
- " Returns:\n",
2047
- " list: List of agent responses.\n",
2048
- " \"\"\"\n",
2049
- " tasks = [\n",
2050
- " agent.ainvoke(\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2051
  " input={\n",
2052
- " \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
2053
- " q[\"file_name\"] if q[\"file_name\"] else \"There's no relevant file to use.\"\n",
 
 
2054
  " }\"\"\"\n",
2055
  " }\n",
2056
- " )\n",
2057
- " for q in all_questions\n",
2058
- " ]\n",
2059
- " try:\n",
2060
- " return await asyncio.gather(*tasks)\n",
2061
- " except GraphRecursionError as err:\n",
2062
- " print(\" Agent stopped due to max iterations.\")\n",
2063
- " print(f\"Error: {err}\")\n",
2064
- " return \"Recursion Limit Reached.\"\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2065
  "\n"
2066
  ]
2067
  },
@@ -2072,39 +2541,66 @@
2072
  "metadata": {},
2073
  "outputs": [],
2074
  "source": [
2075
- "responses = asyncio.run(run_all_questions(agent, all_questions))"
2076
  ]
2077
  },
2078
  {
2079
  "cell_type": "code",
2080
- "execution_count": 47,
2081
- "id": "d28d2d35",
2082
- "metadata": {},
 
 
 
 
 
 
 
 
2083
  "outputs": [
2084
  {
2085
  "data": {
2086
  "text/plain": [
2087
- "'Please provide the code from the file so I can determine the final output.'"
2088
  ]
2089
  },
2090
- "execution_count": 47,
2091
  "metadata": {},
2092
  "output_type": "execute_result"
2093
  }
2094
  ],
2095
  "source": [
2096
- "responses[11]['structured_response'].answer"
2097
  ]
2098
  },
2099
  {
2100
  "cell_type": "code",
2101
- "execution_count": 52,
2102
- "id": "c4cd3fc9",
2103
- "metadata": {},
2104
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2105
  "source": [
2106
- "with open(\"f918266a-b3e0-4914-865d-4faa564f1aef.py\", \"r\") as f: \n",
2107
- " code = f.readlines()"
2108
  ]
2109
  },
2110
  {
@@ -2117,26 +2613,35 @@
2117
  },
2118
  {
2119
  "cell_type": "code",
2120
- "execution_count": 66,
2121
  "id": "94449302",
2122
- "metadata": {},
 
 
 
 
 
 
 
 
 
2123
  "outputs": [
2124
  {
2125
  "data": {
2126
  "text/plain": [
2127
- "{'task_id': '5a0c1adf-205e-4841-a666-7c3ef95def9d',\n",
2128
- " 'question': 'What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?',\n",
2129
  " 'Level': '1',\n",
2130
  " 'file_name': ''}"
2131
  ]
2132
  },
2133
- "execution_count": 66,
2134
  "metadata": {},
2135
  "output_type": "execute_result"
2136
  }
2137
  ],
2138
  "source": [
2139
- "all_questions[19]"
2140
  ]
2141
  },
2142
  {
@@ -2178,7 +2683,7 @@
2178
  ],
2179
  "metadata": {
2180
  "kernelspec": {
2181
- "display_name": "unit-4-gaia-challenge",
2182
  "language": "python",
2183
  "name": "python3"
2184
  },
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "e6c1f8cc-eee9-4e7f-b82e-69d681533cb3",
6
+ "metadata": {
7
+ "jp-MarkdownHeadingCollapsed": true
8
+ },
9
+ "source": [
10
+ "# Smolagents"
11
+ ]
12
+ },
13
  {
14
  "cell_type": "code",
15
  "execution_count": 73,
 
67
  },
68
  {
69
  "cell_type": "code",
70
+ "execution_count": 28,
71
  "id": "7a960624-355b-4934-a741-fb880ad3ca37",
72
  "metadata": {
73
  "execution": {
74
+ "iopub.execute_input": "2025-06-14T12:04:06.703540Z",
75
+ "iopub.status.busy": "2025-06-14T12:04:06.702825Z",
76
+ "iopub.status.idle": "2025-06-14T12:04:06.714241Z",
77
+ "shell.execute_reply": "2025-06-14T12:04:06.711667Z",
78
+ "shell.execute_reply.started": "2025-06-14T12:04:06.703473Z"
79
  }
80
  },
81
  "outputs": [],
 
994
  {
995
  "cell_type": "markdown",
996
  "id": "c3b5f100-893f-405a-920f-a32461ac7277",
997
+ "metadata": {
998
+ "jp-MarkdownHeadingCollapsed": true
999
+ },
1000
  "source": [
1001
  "# Async Task"
1002
  ]
 
1436
  },
1437
  {
1438
  "cell_type": "code",
1439
+ "execution_count": null,
1440
+ "id": "e4093850-db0d-4563-b116-765a3f1df44c",
1441
+ "metadata": {
1442
+ "execution": {
1443
+ "iopub.execute_input": "2025-06-14T12:03:55.849855Z",
1444
+ "iopub.status.busy": "2025-06-14T12:03:55.849026Z",
1445
+ "iopub.status.idle": "2025-06-14T12:03:55.859974Z",
1446
+ "shell.execute_reply": "2025-06-14T12:03:55.857431Z",
1447
+ "shell.execute_reply.started": "2025-06-14T12:03:55.849785Z"
1448
+ }
1449
+ },
1450
  "outputs": [],
1451
  "source": [
1452
  "from smolagents import (\n",
1453
  " DuckDuckGoSearchTool,\n",
1454
  " VisitWebpageTool,\n",
1455
  " LocalPythonExecutor,\n",
1456
+ " PythonInterpreterTool,\n",
1457
  " WikipediaSearchTool,\n",
1458
  " Tool\n",
1459
  ")\n",
 
1473
  "from dotenv import load_dotenv, find_dotenv\n",
1474
  "import pickle\n",
1475
  "import asyncio\n",
1476
+ "import nest_asyncio\n",
1477
+ "\n",
1478
+ "import httpx"
1479
  ]
1480
  },
1481
  {
1482
  "cell_type": "code",
1483
  "execution_count": 2,
1484
  "id": "147b9149",
1485
+ "metadata": {
1486
+ "execution": {
1487
+ "iopub.execute_input": "2025-06-14T12:00:16.150863Z",
1488
+ "iopub.status.busy": "2025-06-14T12:00:16.150052Z",
1489
+ "iopub.status.idle": "2025-06-14T12:00:16.160454Z",
1490
+ "shell.execute_reply": "2025-06-14T12:00:16.159182Z",
1491
+ "shell.execute_reply.started": "2025-06-14T12:00:16.150794Z"
1492
+ }
1493
+ },
1494
  "outputs": [],
1495
  "source": [
1496
  "_ = load_dotenv(find_dotenv(raise_error_if_not_found=True), override=True)\n",
 
1501
  },
1502
  {
1503
  "cell_type": "code",
1504
+ "execution_count": 56,
1505
  "id": "a36446a3",
1506
+ "metadata": {
1507
+ "execution": {
1508
+ "iopub.execute_input": "2025-06-14T12:13:30.483018Z",
1509
+ "iopub.status.busy": "2025-06-14T12:13:30.480788Z",
1510
+ "iopub.status.idle": "2025-06-14T12:13:30.497355Z",
1511
+ "shell.execute_reply": "2025-06-14T12:13:30.494658Z",
1512
+ "shell.execute_reply.started": "2025-06-14T12:13:30.482923Z"
1513
+ }
1514
+ },
1515
  "outputs": [],
1516
  "source": [
1517
+ "lang_model = init_chat_model(model=\"gpt-4.1\", model_provider=\"openai\",temperature=0.2)\n",
1518
+ "# lang_model = init_chat_model(model=\"gpt-4.1-nano\", model_provider=\"openai\",temperature=0.2)\n",
1519
+ "# lang_model = init_chat_model(model=\"gpt-4.1-mini\", model_provider=\"openai\",temperature=0.2)"
1520
  ]
1521
  },
1522
  {
1523
  "cell_type": "code",
1524
  "execution_count": 4,
1525
  "id": "cf80baaf",
1526
+ "metadata": {
1527
+ "execution": {
1528
+ "iopub.execute_input": "2025-06-14T12:00:17.884011Z",
1529
+ "iopub.status.busy": "2025-06-14T12:00:17.883440Z",
1530
+ "iopub.status.idle": "2025-06-14T12:00:17.892705Z",
1531
+ "shell.execute_reply": "2025-06-14T12:00:17.891547Z",
1532
+ "shell.execute_reply.started": "2025-06-14T12:00:17.883958Z"
1533
+ }
1534
+ },
1535
  "outputs": [],
1536
  "source": [
1537
  "def search_wikipedia(query: str) -> str:\n",
 
1546
  "cell_type": "code",
1547
  "execution_count": 5,
1548
  "id": "daf73b23",
1549
+ "metadata": {
1550
+ "execution": {
1551
+ "iopub.execute_input": "2025-06-14T12:00:17.894347Z",
1552
+ "iopub.status.busy": "2025-06-14T12:00:17.893804Z",
1553
+ "iopub.status.idle": "2025-06-14T12:00:17.977993Z",
1554
+ "shell.execute_reply": "2025-06-14T12:00:17.976648Z",
1555
+ "shell.execute_reply.started": "2025-06-14T12:00:17.894292Z"
1556
+ }
1557
+ },
1558
  "outputs": [],
1559
  "source": [
1560
  "def visit_web_page(url:str) -> str:\n",
 
1568
  "cell_type": "code",
1569
  "execution_count": 6,
1570
  "id": "6f37359b",
1571
+ "metadata": {
1572
+ "execution": {
1573
+ "iopub.execute_input": "2025-06-14T12:00:17.985256Z",
1574
+ "iopub.status.busy": "2025-06-14T12:00:17.983982Z",
1575
+ "iopub.status.idle": "2025-06-14T12:00:18.104695Z",
1576
+ "shell.execute_reply": "2025-06-14T12:00:18.103363Z",
1577
+ "shell.execute_reply.started": "2025-06-14T12:00:17.985190Z"
1578
+ }
1579
+ },
1580
  "outputs": [],
1581
  "source": [
1582
  "def read_excel_or_csv(filepath: str) -> str:\n",
1583
  " \"\"\"Reads an excel or csv file and returns the content as str.\"\"\"\n",
1584
  " if Path(filepath).suffix in {\".xlsx\", \".xls\"}:\n",
1585
  " df = pl.read_excel(source=filepath)\n",
1586
+ " else:\n",
1587
+ " df = pl.read_csv(source=filepath)\n",
1588
  " content_str = df.to_dict(as_series=False).__str__()\n",
1589
  " return content_str\n"
1590
  ]
1591
  },
1592
  {
1593
  "cell_type": "code",
1594
+ "execution_count": 7,
1595
  "id": "c52bd1a4",
1596
+ "metadata": {
1597
+ "execution": {
1598
+ "iopub.execute_input": "2025-06-14T12:00:18.106352Z",
1599
+ "iopub.status.busy": "2025-06-14T12:00:18.105808Z",
1600
+ "iopub.status.idle": "2025-06-14T12:00:18.232683Z",
1601
+ "shell.execute_reply": "2025-06-14T12:00:18.231616Z",
1602
+ "shell.execute_reply.started": "2025-06-14T12:00:18.106301Z"
1603
+ }
1604
+ },
1605
  "outputs": [],
1606
  "source": [
1607
+ "def python_code_interpreter(filepath: str) -> Any:\n",
1608
  " \"\"\"Returns the output of a python code.\"\"\"\n",
1609
  " with open(filepath, \"r\") as f:\n",
1610
  " code = f.readlines()\n",
 
1625
  " return code_result.generations[0][0].text"
1626
  ]
1627
  },
1628
+ {
1629
+ "cell_type": "code",
1630
+ "execution_count": null,
1631
+ "id": "f4ed20b1-6107-41a4-85e5-1e99dad44ee1",
1632
+ "metadata": {},
1633
+ "outputs": [],
1634
+ "source": []
1635
+ },
1636
  {
1637
  "cell_type": "code",
1638
  "execution_count": 8,
1639
  "id": "2c835c33",
1640
+ "metadata": {
1641
+ "execution": {
1642
+ "iopub.execute_input": "2025-06-14T12:00:18.234454Z",
1643
+ "iopub.status.busy": "2025-06-14T12:00:18.234066Z",
1644
+ "iopub.status.idle": "2025-06-14T12:00:21.769245Z",
1645
+ "shell.execute_reply": "2025-06-14T12:00:21.767715Z",
1646
+ "shell.execute_reply.started": "2025-06-14T12:00:18.234423Z"
1647
+ }
1648
+ },
1649
  "outputs": [
1650
  {
1651
  "name": "stdout",
 
1672
  "cell_type": "code",
1673
  "execution_count": 9,
1674
  "id": "01193586",
1675
+ "metadata": {
1676
+ "execution": {
1677
+ "iopub.execute_input": "2025-06-14T12:00:21.770968Z",
1678
+ "iopub.status.busy": "2025-06-14T12:00:21.770456Z",
1679
+ "iopub.status.idle": "2025-06-14T12:00:21.778994Z",
1680
+ "shell.execute_reply": "2025-06-14T12:00:21.777534Z",
1681
+ "shell.execute_reply.started": "2025-06-14T12:00:21.770915Z"
1682
+ }
1683
+ },
1684
  "outputs": [],
1685
  "source": [
1686
  "def call_stt_tool(file_url:str) -> str:\n",
 
1693
  "cell_type": "code",
1694
  "execution_count": 10,
1695
  "id": "f10e6859",
1696
+ "metadata": {
1697
+ "execution": {
1698
+ "iopub.execute_input": "2025-06-14T12:00:21.784726Z",
1699
+ "iopub.status.busy": "2025-06-14T12:00:21.784237Z",
1700
+ "iopub.status.idle": "2025-06-14T12:00:21.875923Z",
1701
+ "shell.execute_reply": "2025-06-14T12:00:21.874994Z",
1702
+ "shell.execute_reply.started": "2025-06-14T12:00:21.784678Z"
1703
+ }
1704
+ },
1705
  "outputs": [],
1706
  "source": [
1707
  "\n",
 
1742
  "cell_type": "code",
1743
  "execution_count": 11,
1744
  "id": "b1634bb8",
1745
+ "metadata": {
1746
+ "execution": {
1747
+ "iopub.execute_input": "2025-06-14T12:00:21.877435Z",
1748
+ "iopub.status.busy": "2025-06-14T12:00:21.877020Z",
1749
+ "iopub.status.idle": "2025-06-14T12:00:22.052327Z",
1750
+ "shell.execute_reply": "2025-06-14T12:00:22.049864Z",
1751
+ "shell.execute_reply.started": "2025-06-14T12:00:21.877390Z"
1752
+ }
1753
+ },
1754
  "outputs": [],
1755
  "source": [
1756
  "# image_tool(file_url=\"cca530fc-4052-43b2-b130-b30968d8aa44.png\")"
 
1760
  "cell_type": "code",
1761
  "execution_count": 12,
1762
  "id": "08f1534d",
1763
+ "metadata": {
1764
+ "execution": {
1765
+ "iopub.execute_input": "2025-06-14T12:00:22.056308Z",
1766
+ "iopub.status.busy": "2025-06-14T12:00:22.055211Z",
1767
+ "iopub.status.idle": "2025-06-14T12:00:22.155394Z",
1768
+ "shell.execute_reply": "2025-06-14T12:00:22.153763Z",
1769
+ "shell.execute_reply.started": "2025-06-14T12:00:22.056197Z"
1770
+ }
1771
+ },
1772
  "outputs": [],
1773
  "source": [
1774
+ "def youtube_video_tool(url:str, query:str) -> str:\n",
1775
  " \"\"\"Answers questions about youtube videos.\n",
1776
+ " URLs must be provided to this tool and the query too.\"\"\"\n",
1777
  " yt_vid_mapping = {\"https://www.youtube.com/watch?v=L1vXCYZAYYM\": \"penguin.mp4\",\n",
1778
  " \"https://www.youtube.com/watch?v=1htKBjuUWec\": \"coffee.mp4\"}\n",
1779
  " video = cv2.VideoCapture(filename=yt_vid_mapping[url])\n",
 
1795
  " {\n",
1796
  " \"type\": \"text\",\n",
1797
  " \"text\": (\n",
1798
+ " f\"\"\"Examine the video and answer the following question: {query}.\"\"\"\n",
1799
  " ),\n",
1800
  " },\n",
1801
  " *[\n",
 
1817
  "cell_type": "code",
1818
  "execution_count": 13,
1819
  "id": "599530e3",
1820
+ "metadata": {
1821
+ "execution": {
1822
+ "iopub.execute_input": "2025-06-14T12:00:22.158534Z",
1823
+ "iopub.status.busy": "2025-06-14T12:00:22.157536Z",
1824
+ "iopub.status.idle": "2025-06-14T12:00:22.331204Z",
1825
+ "shell.execute_reply": "2025-06-14T12:00:22.329006Z",
1826
+ "shell.execute_reply.started": "2025-06-14T12:00:22.158439Z"
1827
+ }
1828
+ },
1829
  "outputs": [],
1830
  "source": [
1831
  "def web_search_tool(query: str) -> str:\n",
 
1835
  " return search_res"
1836
  ]
1837
  },
1838
+ {
1839
+ "cell_type": "code",
1840
+ "execution_count": 14,
1841
+ "id": "367d9ed2-18e1-4d0b-9137-a4cf93afe147",
1842
+ "metadata": {
1843
+ "execution": {
1844
+ "iopub.execute_input": "2025-06-14T12:00:22.335210Z",
1845
+ "iopub.status.busy": "2025-06-14T12:00:22.333603Z",
1846
+ "iopub.status.idle": "2025-06-14T12:00:22.463598Z",
1847
+ "shell.execute_reply": "2025-06-14T12:00:22.461491Z",
1848
+ "shell.execute_reply.started": "2025-06-14T12:00:22.335095Z"
1849
+ }
1850
+ },
1851
+ "outputs": [],
1852
+ "source": [
1853
+ "def python_executor(code_str:str) -> str:\n",
1854
+ " \"\"\"This executes python code. The code must be a string.\n",
1855
+ " For any calculations always use numpy.\"\"\"\n",
1856
+ " lpe = LocalPythonExecutor(additional_authorized_imports=['polars.*', 'numpy.*'])\n",
1857
+ " code_res = lpe(code_action=code_str)[0]\n",
1858
+ " return code_res"
1859
+ ]
1860
+ },
1861
  {
1862
  "cell_type": "code",
1863
  "execution_count": null,
1864
+ "id": "ca4e1ddb-9658-49e5-95a7-f1311a23907c",
1865
+ "metadata": {
1866
+ "scrolled": true
1867
+ },
1868
  "outputs": [],
1869
  "source": []
1870
  },
1871
  {
1872
  "cell_type": "code",
1873
+ "execution_count": 15,
1874
  "id": "a24e765b",
1875
+ "metadata": {
1876
+ "execution": {
1877
+ "iopub.execute_input": "2025-06-14T12:00:22.466724Z",
1878
+ "iopub.status.busy": "2025-06-14T12:00:22.465826Z",
1879
+ "iopub.status.idle": "2025-06-14T12:00:22.566082Z",
1880
+ "shell.execute_reply": "2025-06-14T12:00:22.564017Z",
1881
+ "shell.execute_reply.started": "2025-06-14T12:00:22.466655Z"
1882
+ }
1883
+ },
1884
  "outputs": [],
1885
  "source": [
1886
  "class AnswerFormat(BaseModel):\n",
 
1905
  },
1906
  {
1907
  "cell_type": "code",
1908
+ "execution_count": 16,
1909
  "id": "d726f6cd",
1910
+ "metadata": {
1911
+ "execution": {
1912
+ "iopub.execute_input": "2025-06-14T12:00:22.570115Z",
1913
+ "iopub.status.busy": "2025-06-14T12:00:22.568886Z",
1914
+ "iopub.status.idle": "2025-06-14T12:00:22.718005Z",
1915
+ "shell.execute_reply": "2025-06-14T12:00:22.715775Z",
1916
+ "shell.execute_reply.started": "2025-06-14T12:00:22.570005Z"
1917
+ }
1918
+ },
1919
  "outputs": [],
1920
  "source": [
1921
  "# SYS_PROMPT_SWEBENCH = \"\"\"\n",
 
2061
  },
2062
  {
2063
  "cell_type": "code",
2064
+ "execution_count": 57,
2065
  "id": "40300268",
2066
+ "metadata": {
2067
+ "execution": {
2068
+ "iopub.execute_input": "2025-06-14T12:13:39.281024Z",
2069
+ "iopub.status.busy": "2025-06-14T12:13:39.280260Z",
2070
+ "iopub.status.idle": "2025-06-14T12:13:39.298833Z",
2071
+ "shell.execute_reply": "2025-06-14T12:13:39.296267Z",
2072
+ "shell.execute_reply.started": "2025-06-14T12:13:39.280958Z"
2073
+ }
2074
+ },
2075
  "outputs": [],
2076
  "source": [
2077
  "# https://cookbook.openai.com/examples/gpt4-1_prompting_guide\n",
 
2080
  "\n",
2081
  "\n",
2082
  "# Instructions\n",
2083
+ "- Carefully read and understand the task. Sometimes the task might be a sentence reversed, so un reverse it first and then complete the task.\n",
2084
  "- Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance \"discograpy\".\n",
2085
  "- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n",
2086
  "- You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.\n",
2087
  "- Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.\n",
2088
+ "- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.\n",
2089
+ "- If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.\n",
2090
+ "- For any youtube related task use the `youtube_video_tool` and the input to the tool should be URL as a string along with the query.\n",
2091
+ "- For any dataframe related tasks, always use the `read_excel_or_csv` tool.\n",
2092
  "- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.\n",
2093
  "- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.\n",
2094
  "- Always verify your answers.\n",
 
2131
  },
2132
  {
2133
  "cell_type": "code",
2134
+ "execution_count": 58,
2135
  "id": "db96eef6",
2136
+ "metadata": {
2137
+ "execution": {
2138
+ "iopub.execute_input": "2025-06-14T12:13:39.737001Z",
2139
+ "iopub.status.busy": "2025-06-14T12:13:39.735802Z",
2140
+ "iopub.status.idle": "2025-06-14T12:13:39.858444Z",
2141
+ "shell.execute_reply": "2025-06-14T12:13:39.857410Z",
2142
+ "shell.execute_reply.started": "2025-06-14T12:13:39.736922Z"
2143
+ }
2144
+ },
2145
  "outputs": [],
2146
  "source": [
2147
  "agent = create_react_agent(\n",
 
2151
  " visit_web_page,\n",
2152
  " read_excel_or_csv,\n",
2153
  " python_executor,\n",
2154
+ " python_code_interpreter,\n",
2155
  " call_stt_tool,\n",
2156
  " image_tool,\n",
2157
  " youtube_video_tool,\n",
 
2164
  },
2165
  {
2166
  "cell_type": "code",
2167
+ "execution_count": 42,
2168
  "id": "33240d19",
2169
+ "metadata": {
2170
+ "execution": {
2171
+ "iopub.execute_input": "2025-06-14T12:08:59.302949Z",
2172
+ "iopub.status.busy": "2025-06-14T12:08:59.301715Z",
2173
+ "iopub.status.idle": "2025-06-14T12:08:59.308731Z",
2174
+ "shell.execute_reply": "2025-06-14T12:08:59.307387Z",
2175
+ "shell.execute_reply.started": "2025-06-14T12:08:59.302889Z"
2176
+ }
2177
+ },
2178
  "outputs": [],
2179
  "source": [
2180
  "# recursion_limit = 10\n",
 
2183
  },
2184
  {
2185
  "cell_type": "code",
2186
+ "execution_count": 43,
2187
  "id": "850bb54d",
2188
+ "metadata": {
2189
+ "execution": {
2190
+ "iopub.execute_input": "2025-06-14T12:08:59.775583Z",
2191
+ "iopub.status.busy": "2025-06-14T12:08:59.774642Z",
2192
+ "iopub.status.idle": "2025-06-14T12:08:59.784401Z",
2193
+ "shell.execute_reply": "2025-06-14T12:08:59.783054Z",
2194
+ "shell.execute_reply.started": "2025-06-14T12:08:59.775524Z"
2195
+ }
2196
+ },
2197
  "outputs": [
2198
  {
2199
  "data": {
 
2204
  " 'file_name': ''}"
2205
  ]
2206
  },
2207
+ "execution_count": 43,
2208
  "metadata": {},
2209
  "output_type": "execute_result"
2210
  }
 
2215
  },
2216
  {
2217
  "cell_type": "code",
2218
+ "execution_count": 59,
2219
  "id": "f9025fae",
2220
+ "metadata": {
2221
+ "execution": {
2222
+ "iopub.execute_input": "2025-06-14T12:13:47.264111Z",
2223
+ "iopub.status.busy": "2025-06-14T12:13:47.263623Z",
2224
+ "iopub.status.idle": "2025-06-14T12:18:56.902572Z",
2225
+ "shell.execute_reply": "2025-06-14T12:18:56.900492Z",
2226
+ "shell.execute_reply.started": "2025-06-14T12:13:47.264070Z"
2227
+ }
2228
+ },
2229
+ "outputs": [
2230
+ {
2231
+ "name": "stdout",
2232
+ "output_type": "stream",
2233
+ "text": [
2234
+ "❌ Agent stopped due to max iterations.\n"
2235
+ ]
2236
+ }
2237
+ ],
2238
+ "source": [
2239
+ "results = []\n",
2240
+ "for q in all_questions:\n",
2241
+ " try:\n",
2242
+ " answer = await agent.ainvoke(\n",
2243
+ " # answer = agent_w_recursion_limit.invoke(\n",
2244
+ " input={\n",
2245
+ " \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
2246
+ " q[\"file_name\"]\n",
2247
+ " if q[\"file_name\"]\n",
2248
+ " else \"There's no relevant file to use.\"\n",
2249
+ " }\"\"\"\n",
2250
+ " }\n",
2251
+ " )\n",
2252
+ " results.append(answer)\n",
2253
+ " except GraphRecursionError:\n",
2254
+ " print(\"❌ Agent stopped due to max iterations.\")\n",
2255
+ " results.append(q[\"task_id\"])"
2256
+ ]
2257
+ },
2258
+ {
2259
+ "cell_type": "code",
2260
+ "execution_count": 45,
2261
+ "id": "56f0e281-0454-4310-9aaa-4599697e45d6",
2262
+ "metadata": {
2263
+ "execution": {
2264
+ "iopub.execute_input": "2025-06-14T12:12:58.519979Z",
2265
+ "iopub.status.busy": "2025-06-14T12:12:58.519086Z",
2266
+ "iopub.status.idle": "2025-06-14T12:12:58.530618Z",
2267
+ "shell.execute_reply": "2025-06-14T12:12:58.528121Z",
2268
+ "shell.execute_reply.started": "2025-06-14T12:12:58.519907Z"
2269
+ },
2270
+ "scrolled": true
2271
+ },
2272
+ "outputs": [],
2273
+ "source": [
2274
+ "# [results[i][\"structured_response\"].answer if isinstance(results[i], dict) else \"No answer\" for i in range(len(all_questions))]"
2275
+ ]
2276
+ },
2277
+ {
2278
+ "cell_type": "code",
2279
+ "execution_count": 49,
2280
+ "id": "102b3d07-e243-48f4-a0e7-5019a9d1576e",
2281
+ "metadata": {
2282
+ "execution": {
2283
+ "iopub.execute_input": "2025-06-14T11:59:56.760447Z",
2284
+ "iopub.status.busy": "2025-06-14T11:59:56.753615Z",
2285
+ "iopub.status.idle": "2025-06-14T11:59:56.776865Z",
2286
+ "shell.execute_reply": "2025-06-14T11:59:56.775237Z",
2287
+ "shell.execute_reply.started": "2025-06-14T11:59:56.760348Z"
2288
+ }
2289
+ },
2290
+ "outputs": [],
2291
+ "source": [
2292
+ "# with open(\"results_gpt_nano.pkl\", \"wb\") as f:\n",
2293
+ "# pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)"
2294
+ ]
2295
+ },
2296
+ {
2297
+ "cell_type": "code",
2298
+ "execution_count": null,
2299
+ "id": "814f62c5-999c-4d98-900f-88828c832bc5",
2300
  "metadata": {},
2301
  "outputs": [],
2302
+ "source": []
2303
+ },
2304
+ {
2305
+ "cell_type": "code",
2306
+ "execution_count": 60,
2307
+ "id": "22f6c4df-b0b1-451d-9bb5-84a9a4088dd9",
2308
+ "metadata": {
2309
+ "execution": {
2310
+ "iopub.execute_input": "2025-06-14T12:19:50.162820Z",
2311
+ "iopub.status.busy": "2025-06-14T12:19:50.162047Z",
2312
+ "iopub.status.idle": "2025-06-14T12:19:50.174784Z",
2313
+ "shell.execute_reply": "2025-06-14T12:19:50.172061Z",
2314
+ "shell.execute_reply.started": "2025-06-14T12:19:50.162757Z"
2315
+ },
2316
+ "scrolled": true
2317
+ },
2318
+ "outputs": [],
2319
  "source": [
2320
+ "answers = [{\"task_id\":j['task_id'], \n",
2321
+ " \"submitted_answer\": results[i][\"structured_response\"].answer\n",
2322
+ " if isinstance(results[i], dict) else \"No answer\"} \n",
2323
+ " for i,j in enumerate(all_questions)]"
 
 
 
 
 
 
 
 
 
2324
  ]
2325
  },
2326
  {
2327
  "cell_type": "code",
2328
+ "execution_count": 61,
2329
+ "id": "6b3d183d-f8b6-410d-8581-1da96a1e6bb5",
2330
+ "metadata": {
2331
+ "execution": {
2332
+ "iopub.execute_input": "2025-06-14T12:19:51.962326Z",
2333
+ "iopub.status.busy": "2025-06-14T12:19:51.961499Z",
2334
+ "iopub.status.idle": "2025-06-14T12:19:51.974035Z",
2335
+ "shell.execute_reply": "2025-06-14T12:19:51.970718Z",
2336
+ "shell.execute_reply.started": "2025-06-14T12:19:51.962257Z"
2337
+ }
2338
+ },
2339
+ "outputs": [],
2340
+ "source": [
2341
+ "data = {\n",
2342
+ " \"username\": \"string\",\n",
2343
+ " \"agent_code\": \"stringstri\",\n",
2344
+ " \"answers\": answers\n",
2345
+ "}"
2346
+ ]
2347
+ },
2348
+ {
2349
+ "cell_type": "code",
2350
+ "execution_count": 62,
2351
+ "id": "93527645-f3c9-4c1b-b35f-006d1d067397",
2352
+ "metadata": {
2353
+ "execution": {
2354
+ "iopub.execute_input": "2025-06-14T12:19:52.917482Z",
2355
+ "iopub.status.busy": "2025-06-14T12:19:52.916225Z",
2356
+ "iopub.status.idle": "2025-06-14T12:19:54.900881Z",
2357
+ "shell.execute_reply": "2025-06-14T12:19:54.897934Z",
2358
+ "shell.execute_reply.started": "2025-06-14T12:19:52.917365Z"
2359
+ }
2360
+ },
2361
+ "outputs": [],
2362
+ "source": [
2363
+ "answer_check = httpx.post(json=data, url=submit_url)"
2364
+ ]
2365
+ },
2366
+ {
2367
+ "cell_type": "code",
2368
+ "execution_count": 63,
2369
+ "id": "b3f7e6eb-2f8c-4b25-a83b-d3b1e628d09c",
2370
+ "metadata": {
2371
+ "execution": {
2372
+ "iopub.execute_input": "2025-06-14T12:19:54.907705Z",
2373
+ "iopub.status.busy": "2025-06-14T12:19:54.906633Z",
2374
+ "iopub.status.idle": "2025-06-14T12:19:54.924639Z",
2375
+ "shell.execute_reply": "2025-06-14T12:19:54.922534Z",
2376
+ "shell.execute_reply.started": "2025-06-14T12:19:54.907598Z"
2377
+ },
2378
+ "scrolled": true
2379
+ },
2380
+ "outputs": [
2381
+ {
2382
+ "data": {
2383
+ "text/plain": [
2384
+ "{'username': 'string',\n",
2385
+ " 'score': 30.0,\n",
2386
+ " 'correct_count': 6,\n",
2387
+ " 'total_attempted': 20,\n",
2388
+ " 'message': 'Score calculated successfully: 6/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.',\n",
2389
+ " 'timestamp': '2025-06-14T12:19:54.753829+00:00'}"
2390
+ ]
2391
+ },
2392
+ "execution_count": 63,
2393
+ "metadata": {},
2394
+ "output_type": "execute_result"
2395
+ }
2396
+ ],
2397
+ "source": [
2398
+ "answer_check.json()"
2399
+ ]
2400
+ },
2401
+ {
2402
+ "cell_type": "code",
2403
+ "execution_count": null,
2404
+ "id": "ff87396c-e52f-4b6f-bc37-565e3ecf26f7",
2405
  "metadata": {},
2406
  "outputs": [],
2407
+ "source": []
2408
+ },
2409
+ {
2410
+ "cell_type": "code",
2411
+ "execution_count": 108,
2412
+ "id": "c98460f5-8671-4b98-b932-60f599dc7ec5",
2413
+ "metadata": {
2414
+ "execution": {
2415
+ "iopub.execute_input": "2025-06-14T10:27:48.568548Z",
2416
+ "iopub.status.busy": "2025-06-14T10:27:48.567818Z",
2417
+ "iopub.status.idle": "2025-06-14T10:27:48.581051Z",
2418
+ "shell.execute_reply": "2025-06-14T10:27:48.579486Z",
2419
+ "shell.execute_reply.started": "2025-06-14T10:27:48.568492Z"
2420
+ }
2421
+ },
2422
+ "outputs": [
2423
+ {
2424
+ "data": {
2425
+ "text/plain": [
2426
+ "{'task_id': '7bd855d8-463d-4ed5-93ca-5fe35145f733',\n",
2427
+ " 'question': 'The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.',\n",
2428
+ " 'Level': '1',\n",
2429
+ " 'file_name': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}"
2430
+ ]
2431
+ },
2432
+ "execution_count": 108,
2433
+ "metadata": {},
2434
+ "output_type": "execute_result"
2435
+ }
2436
+ ],
2437
  "source": [
2438
+ "all_questions[18]"
2439
+ ]
2440
+ },
2441
+ {
2442
+ "cell_type": "code",
2443
+ "execution_count": 25,
2444
+ "id": "550688eb-8a32-4b89-85a8-607d74a1b240",
2445
+ "metadata": {
2446
+ "execution": {
2447
+ "iopub.execute_input": "2025-06-14T11:12:34.441436Z",
2448
+ "iopub.status.busy": "2025-06-14T11:12:34.440861Z",
2449
+ "iopub.status.idle": "2025-06-14T11:13:00.795896Z",
2450
+ "shell.execute_reply": "2025-06-14T11:13:00.792624Z",
2451
+ "shell.execute_reply.started": "2025-06-14T11:12:34.441383Z"
2452
+ },
2453
+ "scrolled": true
2454
+ },
2455
+ "outputs": [
2456
+ {
2457
+ "data": {
2458
+ "text/plain": [
2459
+ "{'messages': [HumanMessage(content='Complete the following task: The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.. Relevant file: 7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx', additional_kwargs={}, response_metadata={}, id='9c810520-a1cd-40b2-914e-8fd2f3e240c5'),\n",
2460
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_zp8tBlFZ1vOSnYqFQvbsgbi6', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1104, 'total_tokens': 1142, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1024}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6JmHXTRb96TEdmDFjOpQVDXjGH', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--83bd6bab-e401-4383-9511-e4fe72002375-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_zp8tBlFZ1vOSnYqFQvbsgbi6', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1104, 'output_tokens': 38, 'total_tokens': 1142, 'input_token_details': {'audio': 0, 'cache_read': 1024}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2461
+ " ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='8b1777a8-387e-417a-a9eb-9af103d701e1', tool_call_id='call_zp8tBlFZ1vOSnYqFQvbsgbi6'),\n",
2462
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_T5jtJJqyz0zpCvMNwcRZtuqV', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1439, 'total_tokens': 1477, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6KCZFLq93YDUtQHAG8vGABGAoa', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--ad40832c-c24b-43b5-af46-4eb0a3945439-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_T5jtJJqyz0zpCvMNwcRZtuqV', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1439, 'output_tokens': 38, 'total_tokens': 1477, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2463
+ " ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='130e5ee1-fd7e-4120-8084-bcebf8d15a9c', tool_call_id='call_T5jtJJqyz0zpCvMNwcRZtuqV'),\n",
2464
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_IwlcalqVatpwkLnAbK29EVr3', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1774, 'total_tokens': 1812, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1408}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6Lz0gaVXIFsxcfcisuIKWlLNLI', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--affe2bdb-f526-4bed-aae4-35cfc208b61a-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_IwlcalqVatpwkLnAbK29EVr3', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1774, 'output_tokens': 38, 'total_tokens': 1812, 'input_token_details': {'audio': 0, 'cache_read': 1408}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2465
+ " ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='ca8399cc-6d48-43df-b781-c8f064154532', tool_call_id='call_IwlcalqVatpwkLnAbK29EVr3'),\n",
2466
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_GIEGGs5RtOIT92ajgPAJARpt', 'function': {'arguments': '{\"code_str\":\"import pandas as pd\\\\n\\\\n# Data extracted from the Excel file\\\\ndata = {\\'Location\\': [\\'Pinebrook\\', \\'Wharvton\\', \\'Sagrada\\', \\'Algrimand\\', \\'Marztep\\', \\'San Cecelia\\', \\'Pimento\\', \\'Tinseles\\', \\'Rosdale\\'],\\\\n \\'Burgers\\': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007],\\\\n \\'Hot Dogs\\': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009],\\\\n \\'Salads\\': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021],\\\\n \\'Fries\\': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989],\\\\n \\'Ice Cream\\': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005],\\\\n \\'Soda\\': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\\\\n\\\\ndf = pd.DataFrame(data)\\\\n\\\\n# Prices for each item (assuming these are the prices in USD)\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00, \\'Soda\\': 1.50}\\\\n\\\\n# Calculate total sales for food items (excluding drinks which is Soda)\\\\nfood_items = [\\'Burgers\\', \\'Hot Dogs\\', \\'Salads\\', \\'Fries\\', \\'Ice Cream\\']\\\\n\\\\n# Calculate total sales\\\\ntotal_sales = 0\\\\nfor item in food_items:\\\\n total_sales += (df[item].sum() * prices[item])\\\\n\\\\nround(total_sales, 2)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 506, 'prompt_tokens': 2109, 'total_tokens': 2615, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1792}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6MtqSxoV60bBEmbp8dzJKWyM3Q', 'service_tier': 'default', 'finish_reason': 'tool_calls', 
'logprobs': None}, id='run--33c7ee3a-f032-4f24-864b-b773673199db-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': \"import pandas as pd\\n\\n# Data extracted from the Excel file\\ndata = {'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'],\\n 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007],\\n 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009],\\n 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021],\\n 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989],\\n 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005],\\n 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\\n\\ndf = pd.DataFrame(data)\\n\\n# Prices for each item (assuming these are the prices in USD)\\nprices = {'Burgers': 5.50, 'Hot Dogs': 3.75, 'Salads': 4.25, 'Fries': 2.50, 'Ice Cream': 3.00, 'Soda': 1.50}\\n\\n# Calculate total sales for food items (excluding drinks which is Soda)\\nfood_items = ['Burgers', 'Hot Dogs', 'Salads', 'Fries', 'Ice Cream']\\n\\n# Calculate total sales\\ntotal_sales = 0\\nfor item in food_items:\\n total_sales += (df[item].sum() * prices[item])\\n\\nround(total_sales, 2)\"}, 'id': 'call_GIEGGs5RtOIT92ajgPAJARpt', 'type': 'tool_call'}], usage_metadata={'input_tokens': 2109, 'output_tokens': 506, 'total_tokens': 2615, 'input_token_details': {'audio': 0, 'cache_read': 1792}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2467
+ " ToolMessage(content='Error: InterpreterError(\"Code execution failed at line \\'import pandas as pd\\' due to: InterpreterError: Import of pandas is not allowed. Authorized imports are: [\\'re\\', \\'statistics\\', \\'queue\\', \\'datetime\\', \\'numpy.*\\', \\'itertools\\', \\'math\\', \\'unicodedata\\', \\'stat\\', \\'collections\\', \\'random\\', \\'polars.*\\', \\'time\\']\")\\n Please fix your mistakes.', name='python_executor', id='56c8bfc0-165e-4ca3-b2c8-f19a71728d3f', tool_call_id='call_GIEGGs5RtOIT92ajgPAJARpt', status='error'),\n",
2468
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_ukghn9nyQv0MpZ2lPAnpdfjK', 'function': {'arguments': '{\"code_str\":\"import numpy as np\\\\n\\\\n# Data from the Excel file\\\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\\\n\\\\n# Prices for each food item\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\\\n\\\\n# Calculate total sales for each food item\\\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\\\n\\\\n# Calculate total sales from food\\\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\\\n\\\\nround(total_sales_food, 2)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 451, 'prompt_tokens': 2707, 'total_tokens': 3158, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 2048}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6TIuM5izp01LjdF5U2znulHHxe', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--880f8528-bebd-4f78-8c4c-1dc05df990af-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': \"import numpy 
as np\\n\\n# Data from the Excel file\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\n\\n# Prices for each food item\\nprices = {'Burgers': 5.50, 'Hot Dogs': 3.75, 'Salads': 4.25, 'Fries': 2.50, 'Ice Cream': 3.00}\\n\\n# Calculate total sales for each food item\\ntotal_burgers = np.sum(burgers) * prices['Burgers']\\ntotal_hot_dogs = np.sum(hot_dogs) * prices['Hot Dogs']\\ntotal_salads = np.sum(salads) * prices['Salads']\\ntotal_fries = np.sum(fries) * prices['Fries']\\ntotal_ice_cream = np.sum(ice_cream) * prices['Ice Cream']\\n\\n# Calculate total sales from food\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\n\\nround(total_sales_food, 2)\"}, 'id': 'call_ukghn9nyQv0MpZ2lPAnpdfjK', 'type': 'tool_call'}], usage_metadata={'input_tokens': 2707, 'output_tokens': 451, 'total_tokens': 3158, 'input_token_details': {'audio': 0, 'cache_read': 2048}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2469
+ " ToolMessage(content='Error: InterpreterError(\"Code execution failed at line \\'round(total_sales_food, 2)\\' due to: InterpreterError: Forbidden function evaluation: \\'round\\' is not among the explicitly allowed tools or defined/imported in the preceding code\")\\n Please fix your mistakes.', name='python_executor', id='bae4bb38-046c-4c36-8f26-1daf4c187b61', tool_call_id='call_ukghn9nyQv0MpZ2lPAnpdfjK', status='error'),\n",
2470
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_XpqHUTDZzi8Rkuw2ubo53CZJ', 'function': {'arguments': '{\"code_str\":\"import numpy as np\\\\n\\\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\\\n\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\\\n\\\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\\\n\\\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\\\n\\\\n\\\\\"{:.2f}\\\\\".format(total_sales_food)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 427, 'prompt_tokens': 3221, 'total_tokens': 3648, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1024}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6ayTTFSltIIg72OsMWD12m59Y4', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--16bcd06a-ffb1-4adf-8528-0a2b47b046e1-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': 'import numpy as np\\n\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\nhot_dogs = np.array([1999, 2008, 2022, 
1971, 2016, 2010, 1999, 1969, 2009])\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\n\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\n\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\n\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\n\\n\"{:.2f}\".format(total_sales_food)'}, 'id': 'call_XpqHUTDZzi8Rkuw2ubo53CZJ', 'type': 'tool_call'}], usage_metadata={'input_tokens': 3221, 'output_tokens': 427, 'total_tokens': 3648, 'input_token_details': {'audio': 0, 'cache_read': 1024}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2471
+ " ToolMessage(content='340090.25', name='python_executor', id='70c25b8c-9bca-44c5-9f8e-a786d3641a68', tool_call_id='call_XpqHUTDZzi8Rkuw2ubo53CZJ'),\n",
2472
+ " AIMessage(content='340090.25', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 3660, 'total_tokens': 3665, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 2944}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6g8VihYdHuX0NzmGFcCVxAbDlh', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--663ce019-aac1-4293-b368-8d8c2a8ed302-0', usage_metadata={'input_tokens': 3660, 'output_tokens': 5, 'total_tokens': 3665, 'input_token_details': {'audio': 0, 'cache_read': 2944}, 'output_token_details': {'audio': 0, 'reasoning': 0}})],\n",
2473
+ " 'structured_response': AnswerFormat(thoughts='I calculated the total sales for each food item by summing the quantities sold across all locations and multiplying by their respective prices. Then, I summed these totals to get the total sales from food items only, excluding drinks. The final total sales amount is formatted to two decimal places as requested.', answer='340090.25')}"
2474
+ ]
2475
+ },
2476
+ "execution_count": 25,
2477
+ "metadata": {},
2478
+ "output_type": "execute_result"
2479
+ }
2480
+ ],
2481
+ "source": [
2482
+ "agent.invoke(\n",
2483
  " input={\n",
2484
+ " \"messages\": f\"\"\"Complete the following task: {all_questions[18][\"question\"]}. Relevant file: {\n",
2485
+ " all_questions[18][\"file_name\"]\n",
2486
+ " if all_questions[18][\"file_name\"]\n",
2487
+ " else \"There's no relevant file to use.\"\n",
2488
  " }\"\"\"\n",
2489
  " }\n",
2490
+ " )"
2491
+ ]
2492
+ },
2493
+ {
2494
+ "cell_type": "code",
2495
+ "execution_count": 40,
2496
+ "id": "a525780a",
2497
+ "metadata": {
2498
+ "execution": {
2499
+ "iopub.execute_input": "2025-06-14T09:41:31.611239Z",
2500
+ "iopub.status.busy": "2025-06-14T09:41:31.608132Z",
2501
+ "iopub.status.idle": "2025-06-14T09:41:31.627482Z",
2502
+ "shell.execute_reply": "2025-06-14T09:41:31.625052Z",
2503
+ "shell.execute_reply.started": "2025-06-14T09:41:31.611107Z"
2504
+ }
2505
+ },
2506
+ "outputs": [],
2507
+ "source": [
2508
+ "# async def run_all_questions(agent, all_questions):\n",
2509
+ "# \"\"\"Run agent.ainvoke concurrently for all questions.\n",
2510
+ "\n",
2511
+ "# Args:\n",
2512
+ "# agent: The LangChain agent with an ainvoke method.\n",
2513
+ "# all_questions (list): List of question dicts.\n",
2514
+ "\n",
2515
+ "# Returns:\n",
2516
+ "# list: List of agent responses.\n",
2517
+ "# \"\"\"\n",
2518
+ "# tasks = [\n",
2519
+ "# agent.ainvoke(\n",
2520
+ "# input={\n",
2521
+ "# \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
2522
+ "# q[\"file_name\"] if q[\"file_name\"] else \"There's no relevant file to use.\"\n",
2523
+ "# }\"\"\"\n",
2524
+ "# }\n",
2525
+ "# )\n",
2526
+ "# for q in all_questions\n",
2527
+ "# ]\n",
2528
+ "# try:\n",
2529
+ "# return await asyncio.gather(*tasks)\n",
2530
+ "# except GraphRecursionError as err:\n",
2531
+ "# print(\"❌ Agent stopped due to max iterations.\")\n",
2532
+ "# print(f\"Error: {err}\")\n",
2533
+ "# # return \"Recursion Limit Reached.\"\n",
2534
  "\n"
2535
  ]
2536
  },
 
2541
  "metadata": {},
2542
  "outputs": [],
2543
  "source": [
2544
+ "# responses = asyncio.run(run_all_questions(agent, all_questions))"
2545
  ]
2546
  },
2547
  {
2548
  "cell_type": "code",
2549
+ "execution_count": 43,
2550
+ "id": "12e0f74d-e565-47bb-bf6d-0ba11570b451",
2551
+ "metadata": {
2552
+ "execution": {
2553
+ "iopub.execute_input": "2025-06-14T09:42:47.636798Z",
2554
+ "iopub.status.busy": "2025-06-14T09:42:47.635229Z",
2555
+ "iopub.status.idle": "2025-06-14T09:42:47.649310Z",
2556
+ "shell.execute_reply": "2025-06-14T09:42:47.647870Z",
2557
+ "shell.execute_reply.started": "2025-06-14T09:42:47.636728Z"
2558
+ }
2559
+ },
2560
  "outputs": [
2561
  {
2562
  "data": {
2563
  "text/plain": [
2564
+ "NoneType"
2565
  ]
2566
  },
2567
+ "execution_count": 43,
2568
  "metadata": {},
2569
  "output_type": "execute_result"
2570
  }
2571
  ],
2572
  "source": [
2573
+ "type(responses)"
2574
  ]
2575
  },
2576
  {
2577
  "cell_type": "code",
2578
+ "execution_count": 35,
2579
+ "id": "d28d2d35",
2580
+ "metadata": {
2581
+ "execution": {
2582
+ "iopub.execute_input": "2025-06-14T09:38:24.184443Z",
2583
+ "iopub.status.busy": "2025-06-14T09:38:24.183596Z",
2584
+ "iopub.status.idle": "2025-06-14T09:38:24.554423Z",
2585
+ "shell.execute_reply": "2025-06-14T09:38:24.552528Z",
2586
+ "shell.execute_reply.started": "2025-06-14T09:38:24.184340Z"
2587
+ }
2588
+ },
2589
+ "outputs": [
2590
+ {
2591
+ "ename": "TypeError",
2592
+ "evalue": "string indices must be integers, not 'str'",
2593
+ "output_type": "error",
2594
+ "traceback": [
2595
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
2596
+ "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
2597
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[35]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mresponses\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mstructured_response\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m.answer\n",
2598
+ "\u001b[31mTypeError\u001b[39m: string indices must be integers, not 'str'"
2599
+ ]
2600
+ }
2601
+ ],
2602
  "source": [
2603
+ "responses[2]['structured_response'].answer"
 
2604
  ]
2605
  },
2606
  {
 
2613
  },
2614
  {
2615
  "cell_type": "code",
2616
+ "execution_count": 23,
2617
  "id": "94449302",
2618
+ "metadata": {
2619
+ "execution": {
2620
+ "iopub.execute_input": "2025-06-14T09:36:13.544492Z",
2621
+ "iopub.status.busy": "2025-06-14T09:36:13.541377Z",
2622
+ "iopub.status.idle": "2025-06-14T09:36:13.558603Z",
2623
+ "shell.execute_reply": "2025-06-14T09:36:13.557527Z",
2624
+ "shell.execute_reply.started": "2025-06-14T09:36:13.544415Z"
2625
+ },
2626
+ "scrolled": true
2627
+ },
2628
  "outputs": [
2629
  {
2630
  "data": {
2631
  "text/plain": [
2632
+ "{'task_id': '2d83110e-a098-4ebb-9987-066c06fa42d0',\n",
2633
+ " 'question': '.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',\n",
2634
  " 'Level': '1',\n",
2635
  " 'file_name': ''}"
2636
  ]
2637
  },
2638
+ "execution_count": 23,
2639
  "metadata": {},
2640
  "output_type": "execute_result"
2641
  }
2642
  ],
2643
  "source": [
2644
+ "all_questions[2]"
2645
  ]
2646
  },
2647
  {
 
2683
  ],
2684
  "metadata": {
2685
  "kernelspec": {
2686
+ "display_name": "Python 3 (ipykernel)",
2687
  "language": "python",
2688
  "name": "python3"
2689
  },
pyproject.toml CHANGED
@@ -6,6 +6,8 @@ dependencies = [
6
  "accelerate>=1.7.0",
7
  "av>=14.4.0",
8
  "fastexcel>=0.14.0",
 
 
9
  "huggingface-hub[hf-xet]>=0.32.6",
10
  "langchain-community>=0.3.25",
11
  "langchain-yt-dlp>=0.0.8",
 
6
  "accelerate>=1.7.0",
7
  "av>=14.4.0",
8
  "fastexcel>=0.14.0",
9
+ "gradio>=5.34.0",
10
+ "httpx>=0.28.1",
11
  "huggingface-hub[hf-xet]>=0.32.6",
12
  "langchain-community>=0.3.25",
13
  "langchain-yt-dlp>=0.0.8",
results_gpt_mini.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d24e0fcef15657f57ea2d5b1bc8de286cf1a0a0b1919fa19494e8c1359e66f9
3
+ size 486587
universal_agent.py CHANGED
@@ -27,7 +27,7 @@ nest_asyncio.apply()
27
  with open("all_questions.pkl", "rb") as f:
28
  all_questions = pickle.load(f)
29
  lang_model = init_chat_model(
30
- model="gpt-4.1-nano", model_provider="openai", temperature=0
31
  )
32
 
33
 
@@ -56,9 +56,9 @@ def read_excel_or_csv(filepath: str) -> str:
56
  return content_str
57
 
58
 
59
- def python_executor(filepath: str) -> Any:
60
  """Returns the output of a python code."""
61
- with open(filepath) as f:
62
  code = f.readlines()
63
  code_result = lang_model.generate(
64
  messages=[
@@ -76,6 +76,12 @@ def python_executor(filepath: str) -> Any:
76
  )
77
  return code_result.generations[0][0].text
78
 
 
 
 
 
 
 
79
 
80
  stt_tool = Tool.from_space(
81
  space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
@@ -203,12 +209,15 @@ You are an expert agent - please keep going until the user’s query is complete
203
 
204
 
205
  # Instructions
206
- - Carefully read and understand the task.
207
  - Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance "discograpy".
208
  - If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
209
  - You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.
210
  - Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
211
- - You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
 
 
 
212
  - If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
213
  - You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
214
  - Always verify your answers.
@@ -247,6 +256,8 @@ What's 2 +2 ?
247
  4
248
 
249
  """
 
 
250
  agent = create_react_agent(
251
  model=lang_model,
252
  tools=[
@@ -254,6 +265,7 @@ agent = create_react_agent(
254
  visit_web_page,
255
  read_excel_or_csv,
256
  python_executor,
 
257
  call_stt_tool,
258
  image_tool,
259
  youtube_video_tool,
@@ -262,56 +274,35 @@ agent = create_react_agent(
262
  prompt=SYSTEM_PROMPT,
263
  response_format=AnswerFormat,
264
  )
 
 
265
  # recursion_limit = 10
266
  # agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
267
- all_questions[0]
268
-
269
-
270
- # try:
271
- # answer = agent.invoke(
272
- # # answer = agent_w_recursion_limit.invoke(
273
- # input={
274
- # "messages": f"""Complete the following task: {all_questions[0]["question"]}. Relevant file: {
275
- # all_questions[0]["file_name"]
276
- # if all_questions[0]["file_name"]
277
- # else "There's no relevant file to use."
278
- # }"""
279
- # }
280
- # )
281
- # except GraphRecursionError:
282
- # print("❌ Agent stopped due to max iterations.")
283
- async def run_all_questions(agent, all_questions):
284
- """Run agent.ainvoke concurrently for all questions.
285
-
286
- Args:
287
- agent: The LangChain agent with an ainvoke method.
288
- all_questions (list): List of question dicts.
289
-
290
- Returns:
291
- list: List of agent responses.
292
- """
293
- tasks = [
294
- agent.ainvoke(
295
  input={
296
- "messages": f"""Complete the following task: {
297
- q["question"]
298
- }. Relevant file: {
299
  q["file_name"]
300
  if q["file_name"]
301
  else "There's no relevant file to use."
302
  }"""
303
  }
304
  )
305
- for q in all_questions
306
- ]
307
- try:
308
- return await asyncio.gather(*tasks)
309
- except GraphRecursionError as err:
310
  print("❌ Agent stopped due to max iterations.")
311
- print(f"Error: {err}")
312
- return "Recursion Limit Reached."
313
 
 
 
314
 
315
- responses = asyncio.run(run_all_questions(agent, all_questions))
316
- final_answers = [responses[i]["structured_response"].answer [for i in range(len(all_questions))]]
 
 
317
 
 
27
  with open("all_questions.pkl", "rb") as f:
28
  all_questions = pickle.load(f)
29
  lang_model = init_chat_model(
30
+ model="gpt-4.1-mini", model_provider="openai", temperature=0.2
31
  )
32
 
33
 
 
56
  return content_str
57
 
58
 
59
+ def python_code_interpreter(filepath: str) -> Any:
60
  """Returns the output of a python code."""
61
+ with open(filepath, "r") as f:
62
  code = f.readlines()
63
  code_result = lang_model.generate(
64
  messages=[
 
76
  )
77
  return code_result.generations[0][0].text
78
 
79
+ def python_executor(code_str:str) -> str:
80
+ """This executes python code. The code must be a string.
81
+ For any calculations always use numpy."""
82
+ lpe = LocalPythonExecutor(additional_authorized_imports=['polars.*', 'numpy.*'])
83
+ code_res = lpe(code_action=code_str)[0]
84
+ return code_res
85
 
86
  stt_tool = Tool.from_space(
87
  space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
 
209
 
210
 
211
  # Instructions
212
+ - Carefully read and understand the task. Sometimes the task might be a sentence reversed, so un reverse it first and then complete the task.
213
  - Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance "discograpy".
214
  - If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
215
  - You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.
216
  - Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
217
+ - You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
218
+ - If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.
219
+ - For any youtube related task use the `youtube_video_tool` and the input to the tool should be URL as a string along with the query.
220
+ - For any dataframe related tasks, always use the `read_excel_or_csv` tool.
221
  - If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
222
  - You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
223
  - Always verify your answers.
 
256
  4
257
 
258
  """
259
+
260
+
261
  agent = create_react_agent(
262
  model=lang_model,
263
  tools=[
 
265
  visit_web_page,
266
  read_excel_or_csv,
267
  python_executor,
268
+ python_code_interpreter,
269
  call_stt_tool,
270
  image_tool,
271
  youtube_video_tool,
 
274
  prompt=SYSTEM_PROMPT,
275
  response_format=AnswerFormat,
276
  )
277
+
278
+
279
  # recursion_limit = 10
280
  # agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
281
+ # all_questions[0]
282
+
283
+ results = []
284
+ for q in all_questions:
285
+ try:
286
+ answer = await agent.ainvoke(
287
+ # answer = agent_w_recursion_limit.invoke(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  input={
289
+ "messages": f"""Complete the following task: {q["question"]}. Relevant file: {
 
 
290
  q["file_name"]
291
  if q["file_name"]
292
  else "There's no relevant file to use."
293
  }"""
294
  }
295
  )
296
+ results.append(answer)
297
+ except GraphRecursionError:
 
 
 
298
  print("❌ Agent stopped due to max iterations.")
299
+ results.append(q["task_id"])
 
300
 
301
+ # with open("results_gpt_mini.pkl", "wb") as f:
302
+ # pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)
303
 
304
+ answers = [{"task_id":j['task_id'],
305
+ "submitted_answer": results[i]["structured_response"].answer
306
+ if isinstance(results[i], dict) else "No answer"}
307
+ for i,j in enumerate(all_questions)]
308
 
uv.lock CHANGED
@@ -3212,6 +3212,8 @@ dependencies = [
3212
  { name = "accelerate" },
3213
  { name = "av" },
3214
  { name = "fastexcel" },
 
 
3215
  { name = "huggingface-hub", extra = ["hf-xet"] },
3216
  { name = "langchain", extra = ["openai"] },
3217
  { name = "langchain-community" },
@@ -3242,6 +3244,8 @@ requires-dist = [
3242
  { name = "accelerate", specifier = ">=1.7.0" },
3243
  { name = "av", specifier = ">=14.4.0" },
3244
  { name = "fastexcel", specifier = ">=0.14.0" },
 
 
3245
  { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.32.6" },
3246
  { name = "langchain", extras = ["openai"], specifier = ">=0.3.25" },
3247
  { name = "langchain-community", specifier = ">=0.3.25" },
 
3212
  { name = "accelerate" },
3213
  { name = "av" },
3214
  { name = "fastexcel" },
3215
+ { name = "gradio" },
3216
+ { name = "httpx" },
3217
  { name = "huggingface-hub", extra = ["hf-xet"] },
3218
  { name = "langchain", extra = ["openai"] },
3219
  { name = "langchain-community" },
 
3244
  { name = "accelerate", specifier = ">=1.7.0" },
3245
  { name = "av", specifier = ">=14.4.0" },
3246
  { name = "fastexcel", specifier = ">=0.14.0" },
3247
+ { name = "gradio", specifier = ">=5.34.0" },
3248
+ { name = "httpx", specifier = ">=0.28.1" },
3249
  { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.32.6" },
3250
  { name = "langchain", extras = ["openai"], specifier = ">=0.3.25" },
3251
  { name = "langchain-community", specifier = ">=0.3.25" },