Shamik commited on
Commit
2e53410
·
verified ·
1 Parent(s): b827254

feat: adding an answer file and changing the app file to directly read the answers from the pickle file.

Browse files
Files changed (6) hide show
  1. app.py +65 -55
  2. playground.ipynb +613 -108
  3. pyproject.toml +2 -0
  4. results_gpt_mini.pkl +3 -0
  5. universal_agent.py +36 -45
  6. uv.lock +4 -0
app.py CHANGED
@@ -15,14 +15,14 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
16
  # --- Basic Agent Definition ---
17
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
18
- class BasicAgent:
19
- def __init__(self):
20
- logger.info("BasicAgent initialized.")
21
- def __call__(self, question: str) -> str:
22
- logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
23
- fixed_answer = "This is a default answer."
24
- logger.info(f"Agent returning fixed answer: {fixed_answer}")
25
- return fixed_answer
26
 
27
  def run_and_submit_all( profile: gr.OAuthProfile | None):
28
  """
@@ -43,61 +43,71 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
43
  questions_url = f"{api_url}/questions"
44
  submit_url = f"{api_url}/submit"
45
 
46
- # 1. Instantiate Agent ( modify this part to create your agent)
47
- try:
48
- agent = BasicAgent()
49
- except Exception as e:
50
- logger.info(f"Error instantiating agent: {e}")
51
- return f"Error initializing agent: {e}", None
52
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
53
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
54
  logger.info(agent_code)
55
 
56
- # 2. Fetch Questions
57
- logger.info(f"Fetching questions from: {questions_url}")
58
- try:
59
- response = requests.get(questions_url, timeout=15)
60
- response.raise_for_status()
61
- questions_data = response.json()
62
- if not questions_data:
63
- logger.info("Fetched questions list is empty.")
64
- return "Fetched questions list is empty or invalid format.", None
65
- logger.info(f"Fetched {len(questions_data)} questions.")
66
- except requests.exceptions.RequestException as e:
67
- logger.info(f"Error fetching questions: {e}")
68
- return f"Error fetching questions: {e}", None
69
- except requests.exceptions.JSONDecodeError as e:
70
- logger.info(f"Error decoding JSON response from questions endpoint: {e}")
71
- logger.info(f"Response text: {response.text[:500]}")
72
- return f"Error decoding server response for questions: {e}", None
73
- except Exception as e:
74
- logger.info(f"An unexpected error occurred fetching questions: {e}")
75
- return f"An unexpected error occurred fetching questions: {e}", None
76
 
77
  # 3. Run your Agent
78
- results_log = []
79
- answers_payload = []
80
- logger.info(f"Running agent on {len(questions_data)} questions...")
81
- for item in questions_data:
82
- task_id = item.get("task_id")
83
- question_text = item.get("question")
84
- if not task_id or question_text is None:
85
- logger.info(f"Skipping item with missing task_id or question: {item}")
86
- continue
87
- try:
88
- submitted_answer = agent(question_text)
89
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
90
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
91
- except Exception as e:
92
- logger.info(f"Error running agent on task {task_id}: {e}")
93
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
94
-
95
- if not answers_payload:
96
- logger.info("Agent did not produce any answers to submit.")
97
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
98
 
99
  # 4. Prepare Submission
100
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
 
101
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
102
  logger.info(status_update)
103
 
 
15
 
16
  # --- Basic Agent Definition ---
17
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
18
+ # class BasicAgent:
19
+ # def __init__(self):
20
+ # logger.info("BasicAgent initialized.")
21
+ # def __call__(self, question: str) -> str:
22
+ # logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
23
+ # fixed_answer = "This is a default answer."
24
+ # logger.info(f"Agent returning fixed answer: {fixed_answer}")
25
+ # return fixed_answer
26
 
27
  def run_and_submit_all( profile: gr.OAuthProfile | None):
28
  """
 
43
  questions_url = f"{api_url}/questions"
44
  submit_url = f"{api_url}/submit"
45
 
46
+ # # 1. Instantiate Agent ( modify this part to create your agent)
47
+ # try:
48
+ # agent = BasicAgent()
49
+ # except Exception as e:
50
+ # logger.info(f"Error instantiating agent: {e}")
51
+ # return f"Error initializing agent: {e}", None
52
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
53
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
54
  logger.info(agent_code)
55
 
56
+ # # 2. Fetch Questions
57
+ # logger.info(f"Fetching questions from: {questions_url}")
58
+ # try:
59
+ # response = requests.get(questions_url, timeout=15)
60
+ # response.raise_for_status()
61
+ # questions_data = response.json()
62
+ # if not questions_data:
63
+ # logger.info("Fetched questions list is empty.")
64
+ # return "Fetched questions list is empty or invalid format.", None
65
+ # logger.info(f"Fetched {len(questions_data)} questions.")
66
+ # except requests.exceptions.RequestException as e:
67
+ # logger.info(f"Error fetching questions: {e}")
68
+ # return f"Error fetching questions: {e}", None
69
+ # except requests.exceptions.JSONDecodeError as e:
70
+ # logger.info(f"Error decoding JSON response from questions endpoint: {e}")
71
+ # logger.info(f"Response text: {response.text[:500]}")
72
+ # return f"Error decoding server response for questions: {e}", None
73
+ # except Exception as e:
74
+ # logger.info(f"An unexpected error occurred fetching questions: {e}")
75
+ # return f"An unexpected error occurred fetching questions: {e}", None
76
 
77
  # 3. Run your Agent
78
+ # results_log = []
79
+ # answers_payload = []
80
+ # logger.info(f"Running agent on {len(questions_data)} questions...")
81
+ # for item in questions_data:
82
+ # task_id = item.get("task_id")
83
+ # question_text = item.get("question")
84
+ # if not task_id or question_text is None:
85
+ # logger.info(f"Skipping item with missing task_id or question: {item}")
86
+ # continue
87
+ # try:
88
+ # submitted_answer = agent(question_text)
89
+ # answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
90
+ # results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
91
+ # except Exception as e:
92
+ # logger.info(f"Error running agent on task {task_id}: {e}")
93
+ # results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
94
+
95
+ # if not answers_payload:
96
+ # logger.info("Agent did not produce any answers to submit.")
97
+ # return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
98
+ with open("all_questions.pkl", "rb") as f:
99
+ all_questions = pickle.load(f)
100
+
101
+ with open("results_gpt_mini.pkl", "rb") as f:
102
+ results = pickle.load(f)
103
+ answers = [{"task_id":j['task_id'],
104
+ "submitted_answer": results[i]["structured_response"].answer
105
+ if isinstance(results[i], dict) else "No answer"}
106
+ for i,j in enumerate(all_questions)]
107
 
108
  # 4. Prepare Submission
109
+ submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers}
110
+ # submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
111
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
112
  logger.info(status_update)
113
 
playground.ipynb CHANGED
@@ -1,5 +1,15 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 73,
@@ -57,15 +67,15 @@
57
  },
58
  {
59
  "cell_type": "code",
60
- "execution_count": 3,
61
  "id": "7a960624-355b-4934-a741-fb880ad3ca37",
62
  "metadata": {
63
  "execution": {
64
- "iopub.execute_input": "2025-06-01T14:26:03.687178Z",
65
- "iopub.status.busy": "2025-06-01T14:26:03.686339Z",
66
- "iopub.status.idle": "2025-06-01T14:26:03.693527Z",
67
- "shell.execute_reply": "2025-06-01T14:26:03.692065Z",
68
- "shell.execute_reply.started": "2025-06-01T14:26:03.687135Z"
69
  }
70
  },
71
  "outputs": [],
@@ -984,7 +994,9 @@
984
  {
985
  "cell_type": "markdown",
986
  "id": "c3b5f100-893f-405a-920f-a32461ac7277",
987
- "metadata": {},
 
 
988
  "source": [
989
  "# Async Task"
990
  ]
@@ -1424,15 +1436,24 @@
1424
  },
1425
  {
1426
  "cell_type": "code",
1427
- "execution_count": 1,
1428
- "id": "b7380e2b",
1429
- "metadata": {},
 
 
 
 
 
 
 
 
1430
  "outputs": [],
1431
  "source": [
1432
  "from smolagents import (\n",
1433
  " DuckDuckGoSearchTool,\n",
1434
  " VisitWebpageTool,\n",
1435
  " LocalPythonExecutor,\n",
 
1436
  " WikipediaSearchTool,\n",
1437
  " Tool\n",
1438
  ")\n",
@@ -1452,14 +1473,24 @@
1452
  "from dotenv import load_dotenv, find_dotenv\n",
1453
  "import pickle\n",
1454
  "import asyncio\n",
1455
- "import nest_asyncio"
 
 
1456
  ]
1457
  },
1458
  {
1459
  "cell_type": "code",
1460
  "execution_count": 2,
1461
  "id": "147b9149",
1462
- "metadata": {},
 
 
 
 
 
 
 
 
1463
  "outputs": [],
1464
  "source": [
1465
  "_ = load_dotenv(find_dotenv(raise_error_if_not_found=True), override=True)\n",
@@ -1470,20 +1501,37 @@
1470
  },
1471
  {
1472
  "cell_type": "code",
1473
- "execution_count": 3,
1474
  "id": "a36446a3",
1475
- "metadata": {},
 
 
 
 
 
 
 
 
1476
  "outputs": [],
1477
  "source": [
1478
- "lang_model = init_chat_model(model=\"gpt-4.1-nano\", model_provider=\"openai\",temperature=0)\n",
1479
- "# lang_model = init_chat_model(model=\"gpt-4.1-mini\", model_provider=\"openai\",temperature=0)"
 
1480
  ]
1481
  },
1482
  {
1483
  "cell_type": "code",
1484
  "execution_count": 4,
1485
  "id": "cf80baaf",
1486
- "metadata": {},
 
 
 
 
 
 
 
 
1487
  "outputs": [],
1488
  "source": [
1489
  "def search_wikipedia(query: str) -> str:\n",
@@ -1498,7 +1546,15 @@
1498
  "cell_type": "code",
1499
  "execution_count": 5,
1500
  "id": "daf73b23",
1501
- "metadata": {},
 
 
 
 
 
 
 
 
1502
  "outputs": [],
1503
  "source": [
1504
  "def visit_web_page(url:str) -> str:\n",
@@ -1512,26 +1568,43 @@
1512
  "cell_type": "code",
1513
  "execution_count": 6,
1514
  "id": "6f37359b",
1515
- "metadata": {},
 
 
 
 
 
 
 
 
1516
  "outputs": [],
1517
  "source": [
1518
  "def read_excel_or_csv(filepath: str) -> str:\n",
1519
  " \"\"\"Reads an excel or csv file and returns the content as str.\"\"\"\n",
1520
  " if Path(filepath).suffix in {\".xlsx\", \".xls\"}:\n",
1521
  " df = pl.read_excel(source=filepath)\n",
1522
- " df = pl.read_csv(source=filepath)\n",
 
1523
  " content_str = df.to_dict(as_series=False).__str__()\n",
1524
  " return content_str\n"
1525
  ]
1526
  },
1527
  {
1528
  "cell_type": "code",
1529
- "execution_count": 60,
1530
  "id": "c52bd1a4",
1531
- "metadata": {},
 
 
 
 
 
 
 
 
1532
  "outputs": [],
1533
  "source": [
1534
- "def python_executor(filepath: str) -> Any:\n",
1535
  " \"\"\"Returns the output of a python code.\"\"\"\n",
1536
  " with open(filepath, \"r\") as f:\n",
1537
  " code = f.readlines()\n",
@@ -1552,11 +1625,27 @@
1552
  " return code_result.generations[0][0].text"
1553
  ]
1554
  },
 
 
 
 
 
 
 
 
1555
  {
1556
  "cell_type": "code",
1557
  "execution_count": 8,
1558
  "id": "2c835c33",
1559
- "metadata": {},
 
 
 
 
 
 
 
 
1560
  "outputs": [
1561
  {
1562
  "name": "stdout",
@@ -1583,7 +1672,15 @@
1583
  "cell_type": "code",
1584
  "execution_count": 9,
1585
  "id": "01193586",
1586
- "metadata": {},
 
 
 
 
 
 
 
 
1587
  "outputs": [],
1588
  "source": [
1589
  "def call_stt_tool(file_url:str) -> str:\n",
@@ -1596,7 +1693,15 @@
1596
  "cell_type": "code",
1597
  "execution_count": 10,
1598
  "id": "f10e6859",
1599
- "metadata": {},
 
 
 
 
 
 
 
 
1600
  "outputs": [],
1601
  "source": [
1602
  "\n",
@@ -1637,7 +1742,15 @@
1637
  "cell_type": "code",
1638
  "execution_count": 11,
1639
  "id": "b1634bb8",
1640
- "metadata": {},
 
 
 
 
 
 
 
 
1641
  "outputs": [],
1642
  "source": [
1643
  "# image_tool(file_url=\"cca530fc-4052-43b2-b130-b30968d8aa44.png\")"
@@ -1647,12 +1760,20 @@
1647
  "cell_type": "code",
1648
  "execution_count": 12,
1649
  "id": "08f1534d",
1650
- "metadata": {},
 
 
 
 
 
 
 
 
1651
  "outputs": [],
1652
  "source": [
1653
- "def youtube_video_tool(url:str) -> str:\n",
1654
  " \"\"\"Answers questions about youtube videos.\n",
1655
- " URLs must be provided to this tool.\"\"\"\n",
1656
  " yt_vid_mapping = {\"https://www.youtube.com/watch?v=L1vXCYZAYYM\": \"penguin.mp4\",\n",
1657
  " \"https://www.youtube.com/watch?v=1htKBjuUWec\": \"coffee.mp4\"}\n",
1658
  " video = cv2.VideoCapture(filename=yt_vid_mapping[url])\n",
@@ -1674,7 +1795,7 @@
1674
  " {\n",
1675
  " \"type\": \"text\",\n",
1676
  " \"text\": (\n",
1677
- " \"\"\"Examine the video.\\n\\nWhat does Teal'c say in response to the question \\\"Isn't that hot?\\\"\"\"\"\n",
1678
  " ),\n",
1679
  " },\n",
1680
  " *[\n",
@@ -1696,7 +1817,15 @@
1696
  "cell_type": "code",
1697
  "execution_count": 13,
1698
  "id": "599530e3",
1699
- "metadata": {},
 
 
 
 
 
 
 
 
1700
  "outputs": [],
1701
  "source": [
1702
  "def web_search_tool(query: str) -> str:\n",
@@ -1706,19 +1835,52 @@
1706
  " return search_res"
1707
  ]
1708
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1709
  {
1710
  "cell_type": "code",
1711
  "execution_count": null,
1712
- "id": "64482acf",
1713
- "metadata": {},
 
 
1714
  "outputs": [],
1715
  "source": []
1716
  },
1717
  {
1718
  "cell_type": "code",
1719
- "execution_count": 14,
1720
  "id": "a24e765b",
1721
- "metadata": {},
 
 
 
 
 
 
 
 
1722
  "outputs": [],
1723
  "source": [
1724
  "class AnswerFormat(BaseModel):\n",
@@ -1743,9 +1905,17 @@
1743
  },
1744
  {
1745
  "cell_type": "code",
1746
- "execution_count": 15,
1747
  "id": "d726f6cd",
1748
- "metadata": {},
 
 
 
 
 
 
 
 
1749
  "outputs": [],
1750
  "source": [
1751
  "# SYS_PROMPT_SWEBENCH = \"\"\"\n",
@@ -1891,9 +2061,17 @@
1891
  },
1892
  {
1893
  "cell_type": "code",
1894
- "execution_count": 16,
1895
  "id": "40300268",
1896
- "metadata": {},
 
 
 
 
 
 
 
 
1897
  "outputs": [],
1898
  "source": [
1899
  "# https://cookbook.openai.com/examples/gpt4-1_prompting_guide\n",
@@ -1902,12 +2080,15 @@
1902
  "\n",
1903
  "\n",
1904
  "# Instructions\n",
1905
- "- Carefully read and understand the task. \n",
1906
  "- Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance \"discograpy\".\n",
1907
  "- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n",
1908
  "- You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.\n",
1909
  "- Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.\n",
1910
- "- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.\n",
 
 
 
1911
  "- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.\n",
1912
  "- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.\n",
1913
  "- Always verify your answers.\n",
@@ -1950,9 +2131,17 @@
1950
  },
1951
  {
1952
  "cell_type": "code",
1953
- "execution_count": 17,
1954
  "id": "db96eef6",
1955
- "metadata": {},
 
 
 
 
 
 
 
 
1956
  "outputs": [],
1957
  "source": [
1958
  "agent = create_react_agent(\n",
@@ -1962,6 +2151,7 @@
1962
  " visit_web_page,\n",
1963
  " read_excel_or_csv,\n",
1964
  " python_executor,\n",
 
1965
  " call_stt_tool,\n",
1966
  " image_tool,\n",
1967
  " youtube_video_tool,\n",
@@ -1974,9 +2164,17 @@
1974
  },
1975
  {
1976
  "cell_type": "code",
1977
- "execution_count": 18,
1978
  "id": "33240d19",
1979
- "metadata": {},
 
 
 
 
 
 
 
 
1980
  "outputs": [],
1981
  "source": [
1982
  "# recursion_limit = 10\n",
@@ -1985,9 +2183,17 @@
1985
  },
1986
  {
1987
  "cell_type": "code",
1988
- "execution_count": 19,
1989
  "id": "850bb54d",
1990
- "metadata": {},
 
 
 
 
 
 
 
 
1991
  "outputs": [
1992
  {
1993
  "data": {
@@ -1998,7 +2204,7 @@
1998
  " 'file_name': ''}"
1999
  ]
2000
  },
2001
- "execution_count": 19,
2002
  "metadata": {},
2003
  "output_type": "execute_result"
2004
  }
@@ -2009,59 +2215,322 @@
2009
  },
2010
  {
2011
  "cell_type": "code",
2012
- "execution_count": 20,
2013
  "id": "f9025fae",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2014
  "metadata": {},
2015
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2016
  "source": [
2017
- "# try:\n",
2018
- "# answer = agent.invoke(\n",
2019
- "# # answer = agent_w_recursion_limit.invoke(\n",
2020
- "# input={\n",
2021
- "# \"messages\": f\"\"\"Complete the following task: {all_questions[0][\"question\"]}. Relevant file: {\n",
2022
- "# all_questions[0][\"file_name\"]\n",
2023
- "# if all_questions[0][\"file_name\"]\n",
2024
- "# else \"There's no relevant file to use.\"\n",
2025
- "# }\"\"\"\n",
2026
- "# }\n",
2027
- "# )\n",
2028
- "# except GraphRecursionError:\n",
2029
- "# print(\"❌ Agent stopped due to max iterations.\")"
2030
  ]
2031
  },
2032
  {
2033
  "cell_type": "code",
2034
- "execution_count": 70,
2035
- "id": "a525780a",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2036
  "metadata": {},
2037
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2038
  "source": [
2039
- "async def run_all_questions(agent, all_questions):\n",
2040
- " \"\"\"Run agent.ainvoke concurrently for all questions.\n",
2041
- "\n",
2042
- " Args:\n",
2043
- " agent: The LangChain agent with an ainvoke method.\n",
2044
- " all_questions (list): List of question dicts.\n",
2045
- "\n",
2046
- " Returns:\n",
2047
- " list: List of agent responses.\n",
2048
- " \"\"\"\n",
2049
- " tasks = [\n",
2050
- " agent.ainvoke(\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2051
  " input={\n",
2052
- " \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
2053
- " q[\"file_name\"] if q[\"file_name\"] else \"There's no relevant file to use.\"\n",
 
 
2054
  " }\"\"\"\n",
2055
  " }\n",
2056
- " )\n",
2057
- " for q in all_questions\n",
2058
- " ]\n",
2059
- " try:\n",
2060
- " return await asyncio.gather(*tasks)\n",
2061
- " except GraphRecursionError as err:\n",
2062
- " print(\" Agent stopped due to max iterations.\")\n",
2063
- " print(f\"Error: {err}\")\n",
2064
- " return \"Recursion Limit Reached.\"\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2065
  "\n"
2066
  ]
2067
  },
@@ -2072,39 +2541,66 @@
2072
  "metadata": {},
2073
  "outputs": [],
2074
  "source": [
2075
- "responses = asyncio.run(run_all_questions(agent, all_questions))"
2076
  ]
2077
  },
2078
  {
2079
  "cell_type": "code",
2080
- "execution_count": 47,
2081
- "id": "d28d2d35",
2082
- "metadata": {},
 
 
 
 
 
 
 
 
2083
  "outputs": [
2084
  {
2085
  "data": {
2086
  "text/plain": [
2087
- "'Please provide the code from the file so I can determine the final output.'"
2088
  ]
2089
  },
2090
- "execution_count": 47,
2091
  "metadata": {},
2092
  "output_type": "execute_result"
2093
  }
2094
  ],
2095
  "source": [
2096
- "responses[11]['structured_response'].answer"
2097
  ]
2098
  },
2099
  {
2100
  "cell_type": "code",
2101
- "execution_count": 52,
2102
- "id": "c4cd3fc9",
2103
- "metadata": {},
2104
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2105
  "source": [
2106
- "with open(\"f918266a-b3e0-4914-865d-4faa564f1aef.py\", \"r\") as f: \n",
2107
- " code = f.readlines()"
2108
  ]
2109
  },
2110
  {
@@ -2117,26 +2613,35 @@
2117
  },
2118
  {
2119
  "cell_type": "code",
2120
- "execution_count": 66,
2121
  "id": "94449302",
2122
- "metadata": {},
 
 
 
 
 
 
 
 
 
2123
  "outputs": [
2124
  {
2125
  "data": {
2126
  "text/plain": [
2127
- "{'task_id': '5a0c1adf-205e-4841-a666-7c3ef95def9d',\n",
2128
- " 'question': 'What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?',\n",
2129
  " 'Level': '1',\n",
2130
  " 'file_name': ''}"
2131
  ]
2132
  },
2133
- "execution_count": 66,
2134
  "metadata": {},
2135
  "output_type": "execute_result"
2136
  }
2137
  ],
2138
  "source": [
2139
- "all_questions[19]"
2140
  ]
2141
  },
2142
  {
@@ -2178,7 +2683,7 @@
2178
  ],
2179
  "metadata": {
2180
  "kernelspec": {
2181
- "display_name": "unit-4-gaia-challenge",
2182
  "language": "python",
2183
  "name": "python3"
2184
  },
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "e6c1f8cc-eee9-4e7f-b82e-69d681533cb3",
6
+ "metadata": {
7
+ "jp-MarkdownHeadingCollapsed": true
8
+ },
9
+ "source": [
10
+ "# Smolagents"
11
+ ]
12
+ },
13
  {
14
  "cell_type": "code",
15
  "execution_count": 73,
 
67
  },
68
  {
69
  "cell_type": "code",
70
+ "execution_count": 28,
71
  "id": "7a960624-355b-4934-a741-fb880ad3ca37",
72
  "metadata": {
73
  "execution": {
74
+ "iopub.execute_input": "2025-06-14T12:04:06.703540Z",
75
+ "iopub.status.busy": "2025-06-14T12:04:06.702825Z",
76
+ "iopub.status.idle": "2025-06-14T12:04:06.714241Z",
77
+ "shell.execute_reply": "2025-06-14T12:04:06.711667Z",
78
+ "shell.execute_reply.started": "2025-06-14T12:04:06.703473Z"
79
  }
80
  },
81
  "outputs": [],
 
994
  {
995
  "cell_type": "markdown",
996
  "id": "c3b5f100-893f-405a-920f-a32461ac7277",
997
+ "metadata": {
998
+ "jp-MarkdownHeadingCollapsed": true
999
+ },
1000
  "source": [
1001
  "# Async Task"
1002
  ]
 
1436
  },
1437
  {
1438
  "cell_type": "code",
1439
+ "execution_count": null,
1440
+ "id": "e4093850-db0d-4563-b116-765a3f1df44c",
1441
+ "metadata": {
1442
+ "execution": {
1443
+ "iopub.execute_input": "2025-06-14T12:03:55.849855Z",
1444
+ "iopub.status.busy": "2025-06-14T12:03:55.849026Z",
1445
+ "iopub.status.idle": "2025-06-14T12:03:55.859974Z",
1446
+ "shell.execute_reply": "2025-06-14T12:03:55.857431Z",
1447
+ "shell.execute_reply.started": "2025-06-14T12:03:55.849785Z"
1448
+ }
1449
+ },
1450
  "outputs": [],
1451
  "source": [
1452
  "from smolagents import (\n",
1453
  " DuckDuckGoSearchTool,\n",
1454
  " VisitWebpageTool,\n",
1455
  " LocalPythonExecutor,\n",
1456
+ " PythonInterpreterTool,\n",
1457
  " WikipediaSearchTool,\n",
1458
  " Tool\n",
1459
  ")\n",
 
1473
  "from dotenv import load_dotenv, find_dotenv\n",
1474
  "import pickle\n",
1475
  "import asyncio\n",
1476
+ "import nest_asyncio\n",
1477
+ "\n",
1478
+ "import httpx"
1479
  ]
1480
  },
1481
  {
1482
  "cell_type": "code",
1483
  "execution_count": 2,
1484
  "id": "147b9149",
1485
+ "metadata": {
1486
+ "execution": {
1487
+ "iopub.execute_input": "2025-06-14T12:00:16.150863Z",
1488
+ "iopub.status.busy": "2025-06-14T12:00:16.150052Z",
1489
+ "iopub.status.idle": "2025-06-14T12:00:16.160454Z",
1490
+ "shell.execute_reply": "2025-06-14T12:00:16.159182Z",
1491
+ "shell.execute_reply.started": "2025-06-14T12:00:16.150794Z"
1492
+ }
1493
+ },
1494
  "outputs": [],
1495
  "source": [
1496
  "_ = load_dotenv(find_dotenv(raise_error_if_not_found=True), override=True)\n",
 
1501
  },
1502
  {
1503
  "cell_type": "code",
1504
+ "execution_count": 56,
1505
  "id": "a36446a3",
1506
+ "metadata": {
1507
+ "execution": {
1508
+ "iopub.execute_input": "2025-06-14T12:13:30.483018Z",
1509
+ "iopub.status.busy": "2025-06-14T12:13:30.480788Z",
1510
+ "iopub.status.idle": "2025-06-14T12:13:30.497355Z",
1511
+ "shell.execute_reply": "2025-06-14T12:13:30.494658Z",
1512
+ "shell.execute_reply.started": "2025-06-14T12:13:30.482923Z"
1513
+ }
1514
+ },
1515
  "outputs": [],
1516
  "source": [
1517
+ "lang_model = init_chat_model(model=\"gpt-4.1\", model_provider=\"openai\",temperature=0.2)\n",
1518
+ "# lang_model = init_chat_model(model=\"gpt-4.1-nano\", model_provider=\"openai\",temperature=0.2)\n",
1519
+ "# lang_model = init_chat_model(model=\"gpt-4.1-mini\", model_provider=\"openai\",temperature=0.2)"
1520
  ]
1521
  },
1522
  {
1523
  "cell_type": "code",
1524
  "execution_count": 4,
1525
  "id": "cf80baaf",
1526
+ "metadata": {
1527
+ "execution": {
1528
+ "iopub.execute_input": "2025-06-14T12:00:17.884011Z",
1529
+ "iopub.status.busy": "2025-06-14T12:00:17.883440Z",
1530
+ "iopub.status.idle": "2025-06-14T12:00:17.892705Z",
1531
+ "shell.execute_reply": "2025-06-14T12:00:17.891547Z",
1532
+ "shell.execute_reply.started": "2025-06-14T12:00:17.883958Z"
1533
+ }
1534
+ },
1535
  "outputs": [],
1536
  "source": [
1537
  "def search_wikipedia(query: str) -> str:\n",
 
1546
  "cell_type": "code",
1547
  "execution_count": 5,
1548
  "id": "daf73b23",
1549
+ "metadata": {
1550
+ "execution": {
1551
+ "iopub.execute_input": "2025-06-14T12:00:17.894347Z",
1552
+ "iopub.status.busy": "2025-06-14T12:00:17.893804Z",
1553
+ "iopub.status.idle": "2025-06-14T12:00:17.977993Z",
1554
+ "shell.execute_reply": "2025-06-14T12:00:17.976648Z",
1555
+ "shell.execute_reply.started": "2025-06-14T12:00:17.894292Z"
1556
+ }
1557
+ },
1558
  "outputs": [],
1559
  "source": [
1560
  "def visit_web_page(url:str) -> str:\n",
 
1568
  "cell_type": "code",
1569
  "execution_count": 6,
1570
  "id": "6f37359b",
1571
+ "metadata": {
1572
+ "execution": {
1573
+ "iopub.execute_input": "2025-06-14T12:00:17.985256Z",
1574
+ "iopub.status.busy": "2025-06-14T12:00:17.983982Z",
1575
+ "iopub.status.idle": "2025-06-14T12:00:18.104695Z",
1576
+ "shell.execute_reply": "2025-06-14T12:00:18.103363Z",
1577
+ "shell.execute_reply.started": "2025-06-14T12:00:17.985190Z"
1578
+ }
1579
+ },
1580
  "outputs": [],
1581
  "source": [
1582
  "def read_excel_or_csv(filepath: str) -> str:\n",
1583
  " \"\"\"Reads an excel or csv file and returns the content as str.\"\"\"\n",
1584
  " if Path(filepath).suffix in {\".xlsx\", \".xls\"}:\n",
1585
  " df = pl.read_excel(source=filepath)\n",
1586
+ " else:\n",
1587
+ " df = pl.read_csv(source=filepath)\n",
1588
  " content_str = df.to_dict(as_series=False).__str__()\n",
1589
  " return content_str\n"
1590
  ]
1591
  },
1592
  {
1593
  "cell_type": "code",
1594
+ "execution_count": 7,
1595
  "id": "c52bd1a4",
1596
+ "metadata": {
1597
+ "execution": {
1598
+ "iopub.execute_input": "2025-06-14T12:00:18.106352Z",
1599
+ "iopub.status.busy": "2025-06-14T12:00:18.105808Z",
1600
+ "iopub.status.idle": "2025-06-14T12:00:18.232683Z",
1601
+ "shell.execute_reply": "2025-06-14T12:00:18.231616Z",
1602
+ "shell.execute_reply.started": "2025-06-14T12:00:18.106301Z"
1603
+ }
1604
+ },
1605
  "outputs": [],
1606
  "source": [
1607
+ "def python_code_interpreter(filepath: str) -> Any:\n",
1608
  " \"\"\"Returns the output of a python code.\"\"\"\n",
1609
  " with open(filepath, \"r\") as f:\n",
1610
  " code = f.readlines()\n",
 
1625
  " return code_result.generations[0][0].text"
1626
  ]
1627
  },
1628
+ {
1629
+ "cell_type": "code",
1630
+ "execution_count": null,
1631
+ "id": "f4ed20b1-6107-41a4-85e5-1e99dad44ee1",
1632
+ "metadata": {},
1633
+ "outputs": [],
1634
+ "source": []
1635
+ },
1636
  {
1637
  "cell_type": "code",
1638
  "execution_count": 8,
1639
  "id": "2c835c33",
1640
+ "metadata": {
1641
+ "execution": {
1642
+ "iopub.execute_input": "2025-06-14T12:00:18.234454Z",
1643
+ "iopub.status.busy": "2025-06-14T12:00:18.234066Z",
1644
+ "iopub.status.idle": "2025-06-14T12:00:21.769245Z",
1645
+ "shell.execute_reply": "2025-06-14T12:00:21.767715Z",
1646
+ "shell.execute_reply.started": "2025-06-14T12:00:18.234423Z"
1647
+ }
1648
+ },
1649
  "outputs": [
1650
  {
1651
  "name": "stdout",
 
1672
  "cell_type": "code",
1673
  "execution_count": 9,
1674
  "id": "01193586",
1675
+ "metadata": {
1676
+ "execution": {
1677
+ "iopub.execute_input": "2025-06-14T12:00:21.770968Z",
1678
+ "iopub.status.busy": "2025-06-14T12:00:21.770456Z",
1679
+ "iopub.status.idle": "2025-06-14T12:00:21.778994Z",
1680
+ "shell.execute_reply": "2025-06-14T12:00:21.777534Z",
1681
+ "shell.execute_reply.started": "2025-06-14T12:00:21.770915Z"
1682
+ }
1683
+ },
1684
  "outputs": [],
1685
  "source": [
1686
  "def call_stt_tool(file_url:str) -> str:\n",
 
1693
  "cell_type": "code",
1694
  "execution_count": 10,
1695
  "id": "f10e6859",
1696
+ "metadata": {
1697
+ "execution": {
1698
+ "iopub.execute_input": "2025-06-14T12:00:21.784726Z",
1699
+ "iopub.status.busy": "2025-06-14T12:00:21.784237Z",
1700
+ "iopub.status.idle": "2025-06-14T12:00:21.875923Z",
1701
+ "shell.execute_reply": "2025-06-14T12:00:21.874994Z",
1702
+ "shell.execute_reply.started": "2025-06-14T12:00:21.784678Z"
1703
+ }
1704
+ },
1705
  "outputs": [],
1706
  "source": [
1707
  "\n",
 
1742
  "cell_type": "code",
1743
  "execution_count": 11,
1744
  "id": "b1634bb8",
1745
+ "metadata": {
1746
+ "execution": {
1747
+ "iopub.execute_input": "2025-06-14T12:00:21.877435Z",
1748
+ "iopub.status.busy": "2025-06-14T12:00:21.877020Z",
1749
+ "iopub.status.idle": "2025-06-14T12:00:22.052327Z",
1750
+ "shell.execute_reply": "2025-06-14T12:00:22.049864Z",
1751
+ "shell.execute_reply.started": "2025-06-14T12:00:21.877390Z"
1752
+ }
1753
+ },
1754
  "outputs": [],
1755
  "source": [
1756
  "# image_tool(file_url=\"cca530fc-4052-43b2-b130-b30968d8aa44.png\")"
 
1760
  "cell_type": "code",
1761
  "execution_count": 12,
1762
  "id": "08f1534d",
1763
+ "metadata": {
1764
+ "execution": {
1765
+ "iopub.execute_input": "2025-06-14T12:00:22.056308Z",
1766
+ "iopub.status.busy": "2025-06-14T12:00:22.055211Z",
1767
+ "iopub.status.idle": "2025-06-14T12:00:22.155394Z",
1768
+ "shell.execute_reply": "2025-06-14T12:00:22.153763Z",
1769
+ "shell.execute_reply.started": "2025-06-14T12:00:22.056197Z"
1770
+ }
1771
+ },
1772
  "outputs": [],
1773
  "source": [
1774
+ "def youtube_video_tool(url:str, query:str) -> str:\n",
1775
  " \"\"\"Answers questions about youtube videos.\n",
1776
+ " URLs must be provided to this tool and the query too.\"\"\"\n",
1777
  " yt_vid_mapping = {\"https://www.youtube.com/watch?v=L1vXCYZAYYM\": \"penguin.mp4\",\n",
1778
  " \"https://www.youtube.com/watch?v=1htKBjuUWec\": \"coffee.mp4\"}\n",
1779
  " video = cv2.VideoCapture(filename=yt_vid_mapping[url])\n",
 
1795
  " {\n",
1796
  " \"type\": \"text\",\n",
1797
  " \"text\": (\n",
1798
+ " f\"\"\"Examine the video and answer the following question: {query}.\"\"\"\n",
1799
  " ),\n",
1800
  " },\n",
1801
  " *[\n",
 
1817
  "cell_type": "code",
1818
  "execution_count": 13,
1819
  "id": "599530e3",
1820
+ "metadata": {
1821
+ "execution": {
1822
+ "iopub.execute_input": "2025-06-14T12:00:22.158534Z",
1823
+ "iopub.status.busy": "2025-06-14T12:00:22.157536Z",
1824
+ "iopub.status.idle": "2025-06-14T12:00:22.331204Z",
1825
+ "shell.execute_reply": "2025-06-14T12:00:22.329006Z",
1826
+ "shell.execute_reply.started": "2025-06-14T12:00:22.158439Z"
1827
+ }
1828
+ },
1829
  "outputs": [],
1830
  "source": [
1831
  "def web_search_tool(query: str) -> str:\n",
 
1835
  " return search_res"
1836
  ]
1837
  },
1838
+ {
1839
+ "cell_type": "code",
1840
+ "execution_count": 14,
1841
+ "id": "367d9ed2-18e1-4d0b-9137-a4cf93afe147",
1842
+ "metadata": {
1843
+ "execution": {
1844
+ "iopub.execute_input": "2025-06-14T12:00:22.335210Z",
1845
+ "iopub.status.busy": "2025-06-14T12:00:22.333603Z",
1846
+ "iopub.status.idle": "2025-06-14T12:00:22.463598Z",
1847
+ "shell.execute_reply": "2025-06-14T12:00:22.461491Z",
1848
+ "shell.execute_reply.started": "2025-06-14T12:00:22.335095Z"
1849
+ }
1850
+ },
1851
+ "outputs": [],
1852
+ "source": [
1853
+ "def python_executor(code_str:str) -> str:\n",
1854
+ " \"\"\"This executes python code. The code must be a string.\n",
1855
+ " For any calculations always use numpy.\"\"\"\n",
1856
+ " lpe = LocalPythonExecutor(additional_authorized_imports=['polars.*', 'numpy.*'])\n",
1857
+ " code_res = lpe(code_action=code_str)[0]\n",
1858
+ " return code_res"
1859
+ ]
1860
+ },
1861
  {
1862
  "cell_type": "code",
1863
  "execution_count": null,
1864
+ "id": "ca4e1ddb-9658-49e5-95a7-f1311a23907c",
1865
+ "metadata": {
1866
+ "scrolled": true
1867
+ },
1868
  "outputs": [],
1869
  "source": []
1870
  },
1871
  {
1872
  "cell_type": "code",
1873
+ "execution_count": 15,
1874
  "id": "a24e765b",
1875
+ "metadata": {
1876
+ "execution": {
1877
+ "iopub.execute_input": "2025-06-14T12:00:22.466724Z",
1878
+ "iopub.status.busy": "2025-06-14T12:00:22.465826Z",
1879
+ "iopub.status.idle": "2025-06-14T12:00:22.566082Z",
1880
+ "shell.execute_reply": "2025-06-14T12:00:22.564017Z",
1881
+ "shell.execute_reply.started": "2025-06-14T12:00:22.466655Z"
1882
+ }
1883
+ },
1884
  "outputs": [],
1885
  "source": [
1886
  "class AnswerFormat(BaseModel):\n",
 
1905
  },
1906
  {
1907
  "cell_type": "code",
1908
+ "execution_count": 16,
1909
  "id": "d726f6cd",
1910
+ "metadata": {
1911
+ "execution": {
1912
+ "iopub.execute_input": "2025-06-14T12:00:22.570115Z",
1913
+ "iopub.status.busy": "2025-06-14T12:00:22.568886Z",
1914
+ "iopub.status.idle": "2025-06-14T12:00:22.718005Z",
1915
+ "shell.execute_reply": "2025-06-14T12:00:22.715775Z",
1916
+ "shell.execute_reply.started": "2025-06-14T12:00:22.570005Z"
1917
+ }
1918
+ },
1919
  "outputs": [],
1920
  "source": [
1921
  "# SYS_PROMPT_SWEBENCH = \"\"\"\n",
 
2061
  },
2062
  {
2063
  "cell_type": "code",
2064
+ "execution_count": 57,
2065
  "id": "40300268",
2066
+ "metadata": {
2067
+ "execution": {
2068
+ "iopub.execute_input": "2025-06-14T12:13:39.281024Z",
2069
+ "iopub.status.busy": "2025-06-14T12:13:39.280260Z",
2070
+ "iopub.status.idle": "2025-06-14T12:13:39.298833Z",
2071
+ "shell.execute_reply": "2025-06-14T12:13:39.296267Z",
2072
+ "shell.execute_reply.started": "2025-06-14T12:13:39.280958Z"
2073
+ }
2074
+ },
2075
  "outputs": [],
2076
  "source": [
2077
  "# https://cookbook.openai.com/examples/gpt4-1_prompting_guide\n",
 
2080
  "\n",
2081
  "\n",
2082
  "# Instructions\n",
2083
+ "- Carefully read and understand the task. Sometimes the task might be a sentence reversed, so un reverse it first and then complete the task.\n",
2084
  "- Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance \"discograpy\".\n",
2085
  "- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n",
2086
  "- You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.\n",
2087
  "- Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.\n",
2088
+ "- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.\n",
2089
+ "- If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.\n",
2090
+ "- For any youtube related task use the `youtube_video_tool` and the input to the tool should be URL as a string along with the query.\n",
2091
+ "- For any dataframe related tasks, always use the `read_excel_or_csv` tool.\n",
2092
  "- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.\n",
2093
  "- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.\n",
2094
  "- Always verify your answers.\n",
 
2131
  },
2132
  {
2133
  "cell_type": "code",
2134
+ "execution_count": 58,
2135
  "id": "db96eef6",
2136
+ "metadata": {
2137
+ "execution": {
2138
+ "iopub.execute_input": "2025-06-14T12:13:39.737001Z",
2139
+ "iopub.status.busy": "2025-06-14T12:13:39.735802Z",
2140
+ "iopub.status.idle": "2025-06-14T12:13:39.858444Z",
2141
+ "shell.execute_reply": "2025-06-14T12:13:39.857410Z",
2142
+ "shell.execute_reply.started": "2025-06-14T12:13:39.736922Z"
2143
+ }
2144
+ },
2145
  "outputs": [],
2146
  "source": [
2147
  "agent = create_react_agent(\n",
 
2151
  " visit_web_page,\n",
2152
  " read_excel_or_csv,\n",
2153
  " python_executor,\n",
2154
+ " python_code_interpreter,\n",
2155
  " call_stt_tool,\n",
2156
  " image_tool,\n",
2157
  " youtube_video_tool,\n",
 
2164
  },
2165
  {
2166
  "cell_type": "code",
2167
+ "execution_count": 42,
2168
  "id": "33240d19",
2169
+ "metadata": {
2170
+ "execution": {
2171
+ "iopub.execute_input": "2025-06-14T12:08:59.302949Z",
2172
+ "iopub.status.busy": "2025-06-14T12:08:59.301715Z",
2173
+ "iopub.status.idle": "2025-06-14T12:08:59.308731Z",
2174
+ "shell.execute_reply": "2025-06-14T12:08:59.307387Z",
2175
+ "shell.execute_reply.started": "2025-06-14T12:08:59.302889Z"
2176
+ }
2177
+ },
2178
  "outputs": [],
2179
  "source": [
2180
  "# recursion_limit = 10\n",
 
2183
  },
2184
  {
2185
  "cell_type": "code",
2186
+ "execution_count": 43,
2187
  "id": "850bb54d",
2188
+ "metadata": {
2189
+ "execution": {
2190
+ "iopub.execute_input": "2025-06-14T12:08:59.775583Z",
2191
+ "iopub.status.busy": "2025-06-14T12:08:59.774642Z",
2192
+ "iopub.status.idle": "2025-06-14T12:08:59.784401Z",
2193
+ "shell.execute_reply": "2025-06-14T12:08:59.783054Z",
2194
+ "shell.execute_reply.started": "2025-06-14T12:08:59.775524Z"
2195
+ }
2196
+ },
2197
  "outputs": [
2198
  {
2199
  "data": {
 
2204
  " 'file_name': ''}"
2205
  ]
2206
  },
2207
+ "execution_count": 43,
2208
  "metadata": {},
2209
  "output_type": "execute_result"
2210
  }
 
2215
  },
2216
  {
2217
  "cell_type": "code",
2218
+ "execution_count": 59,
2219
  "id": "f9025fae",
2220
+ "metadata": {
2221
+ "execution": {
2222
+ "iopub.execute_input": "2025-06-14T12:13:47.264111Z",
2223
+ "iopub.status.busy": "2025-06-14T12:13:47.263623Z",
2224
+ "iopub.status.idle": "2025-06-14T12:18:56.902572Z",
2225
+ "shell.execute_reply": "2025-06-14T12:18:56.900492Z",
2226
+ "shell.execute_reply.started": "2025-06-14T12:13:47.264070Z"
2227
+ }
2228
+ },
2229
+ "outputs": [
2230
+ {
2231
+ "name": "stdout",
2232
+ "output_type": "stream",
2233
+ "text": [
2234
+ "❌ Agent stopped due to max iterations.\n"
2235
+ ]
2236
+ }
2237
+ ],
2238
+ "source": [
2239
+ "results = []\n",
2240
+ "for q in all_questions:\n",
2241
+ " try:\n",
2242
+ " answer = await agent.ainvoke(\n",
2243
+ " # answer = agent_w_recursion_limit.invoke(\n",
2244
+ " input={\n",
2245
+ " \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
2246
+ " q[\"file_name\"]\n",
2247
+ " if q[\"file_name\"]\n",
2248
+ " else \"There's no relevant file to use.\"\n",
2249
+ " }\"\"\"\n",
2250
+ " }\n",
2251
+ " )\n",
2252
+ " results.append(answer)\n",
2253
+ " except GraphRecursionError:\n",
2254
+ " print(\"❌ Agent stopped due to max iterations.\")\n",
2255
+ " results.append(q[\"task_id\"])"
2256
+ ]
2257
+ },
2258
+ {
2259
+ "cell_type": "code",
2260
+ "execution_count": 45,
2261
+ "id": "56f0e281-0454-4310-9aaa-4599697e45d6",
2262
+ "metadata": {
2263
+ "execution": {
2264
+ "iopub.execute_input": "2025-06-14T12:12:58.519979Z",
2265
+ "iopub.status.busy": "2025-06-14T12:12:58.519086Z",
2266
+ "iopub.status.idle": "2025-06-14T12:12:58.530618Z",
2267
+ "shell.execute_reply": "2025-06-14T12:12:58.528121Z",
2268
+ "shell.execute_reply.started": "2025-06-14T12:12:58.519907Z"
2269
+ },
2270
+ "scrolled": true
2271
+ },
2272
+ "outputs": [],
2273
+ "source": [
2274
+ "# [results[i][\"structured_response\"].answer if isinstance(results[i], dict) else \"No answer\" for i in range(len(all_questions))]"
2275
+ ]
2276
+ },
2277
+ {
2278
+ "cell_type": "code",
2279
+ "execution_count": 49,
2280
+ "id": "102b3d07-e243-48f4-a0e7-5019a9d1576e",
2281
+ "metadata": {
2282
+ "execution": {
2283
+ "iopub.execute_input": "2025-06-14T11:59:56.760447Z",
2284
+ "iopub.status.busy": "2025-06-14T11:59:56.753615Z",
2285
+ "iopub.status.idle": "2025-06-14T11:59:56.776865Z",
2286
+ "shell.execute_reply": "2025-06-14T11:59:56.775237Z",
2287
+ "shell.execute_reply.started": "2025-06-14T11:59:56.760348Z"
2288
+ }
2289
+ },
2290
+ "outputs": [],
2291
+ "source": [
2292
+ "# with open(\"results_gpt_nano.pkl\", \"wb\") as f:\n",
2293
+ "# pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)"
2294
+ ]
2295
+ },
2296
+ {
2297
+ "cell_type": "code",
2298
+ "execution_count": null,
2299
+ "id": "814f62c5-999c-4d98-900f-88828c832bc5",
2300
  "metadata": {},
2301
  "outputs": [],
2302
+ "source": []
2303
+ },
2304
+ {
2305
+ "cell_type": "code",
2306
+ "execution_count": 60,
2307
+ "id": "22f6c4df-b0b1-451d-9bb5-84a9a4088dd9",
2308
+ "metadata": {
2309
+ "execution": {
2310
+ "iopub.execute_input": "2025-06-14T12:19:50.162820Z",
2311
+ "iopub.status.busy": "2025-06-14T12:19:50.162047Z",
2312
+ "iopub.status.idle": "2025-06-14T12:19:50.174784Z",
2313
+ "shell.execute_reply": "2025-06-14T12:19:50.172061Z",
2314
+ "shell.execute_reply.started": "2025-06-14T12:19:50.162757Z"
2315
+ },
2316
+ "scrolled": true
2317
+ },
2318
+ "outputs": [],
2319
  "source": [
2320
+ "answers = [{\"task_id\":j['task_id'], \n",
2321
+ " \"submitted_answer\": results[i][\"structured_response\"].answer\n",
2322
+ " if isinstance(results[i], dict) else \"No answer\"} \n",
2323
+ " for i,j in enumerate(all_questions)]"
 
 
 
 
 
 
 
 
 
2324
  ]
2325
  },
2326
  {
2327
  "cell_type": "code",
2328
+ "execution_count": 61,
2329
+ "id": "6b3d183d-f8b6-410d-8581-1da96a1e6bb5",
2330
+ "metadata": {
2331
+ "execution": {
2332
+ "iopub.execute_input": "2025-06-14T12:19:51.962326Z",
2333
+ "iopub.status.busy": "2025-06-14T12:19:51.961499Z",
2334
+ "iopub.status.idle": "2025-06-14T12:19:51.974035Z",
2335
+ "shell.execute_reply": "2025-06-14T12:19:51.970718Z",
2336
+ "shell.execute_reply.started": "2025-06-14T12:19:51.962257Z"
2337
+ }
2338
+ },
2339
+ "outputs": [],
2340
+ "source": [
2341
+ "data = {\n",
2342
+ " \"username\": \"string\",\n",
2343
+ " \"agent_code\": \"stringstri\",\n",
2344
+ " \"answers\": answers\n",
2345
+ "}"
2346
+ ]
2347
+ },
2348
+ {
2349
+ "cell_type": "code",
2350
+ "execution_count": 62,
2351
+ "id": "93527645-f3c9-4c1b-b35f-006d1d067397",
2352
+ "metadata": {
2353
+ "execution": {
2354
+ "iopub.execute_input": "2025-06-14T12:19:52.917482Z",
2355
+ "iopub.status.busy": "2025-06-14T12:19:52.916225Z",
2356
+ "iopub.status.idle": "2025-06-14T12:19:54.900881Z",
2357
+ "shell.execute_reply": "2025-06-14T12:19:54.897934Z",
2358
+ "shell.execute_reply.started": "2025-06-14T12:19:52.917365Z"
2359
+ }
2360
+ },
2361
+ "outputs": [],
2362
+ "source": [
2363
+ "answer_check = httpx.post(json=data, url=submit_url)"
2364
+ ]
2365
+ },
2366
+ {
2367
+ "cell_type": "code",
2368
+ "execution_count": 63,
2369
+ "id": "b3f7e6eb-2f8c-4b25-a83b-d3b1e628d09c",
2370
+ "metadata": {
2371
+ "execution": {
2372
+ "iopub.execute_input": "2025-06-14T12:19:54.907705Z",
2373
+ "iopub.status.busy": "2025-06-14T12:19:54.906633Z",
2374
+ "iopub.status.idle": "2025-06-14T12:19:54.924639Z",
2375
+ "shell.execute_reply": "2025-06-14T12:19:54.922534Z",
2376
+ "shell.execute_reply.started": "2025-06-14T12:19:54.907598Z"
2377
+ },
2378
+ "scrolled": true
2379
+ },
2380
+ "outputs": [
2381
+ {
2382
+ "data": {
2383
+ "text/plain": [
2384
+ "{'username': 'string',\n",
2385
+ " 'score': 30.0,\n",
2386
+ " 'correct_count': 6,\n",
2387
+ " 'total_attempted': 20,\n",
2388
+ " 'message': 'Score calculated successfully: 6/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.',\n",
2389
+ " 'timestamp': '2025-06-14T12:19:54.753829+00:00'}"
2390
+ ]
2391
+ },
2392
+ "execution_count": 63,
2393
+ "metadata": {},
2394
+ "output_type": "execute_result"
2395
+ }
2396
+ ],
2397
+ "source": [
2398
+ "answer_check.json()"
2399
+ ]
2400
+ },
2401
+ {
2402
+ "cell_type": "code",
2403
+ "execution_count": null,
2404
+ "id": "ff87396c-e52f-4b6f-bc37-565e3ecf26f7",
2405
  "metadata": {},
2406
  "outputs": [],
2407
+ "source": []
2408
+ },
2409
+ {
2410
+ "cell_type": "code",
2411
+ "execution_count": 108,
2412
+ "id": "c98460f5-8671-4b98-b932-60f599dc7ec5",
2413
+ "metadata": {
2414
+ "execution": {
2415
+ "iopub.execute_input": "2025-06-14T10:27:48.568548Z",
2416
+ "iopub.status.busy": "2025-06-14T10:27:48.567818Z",
2417
+ "iopub.status.idle": "2025-06-14T10:27:48.581051Z",
2418
+ "shell.execute_reply": "2025-06-14T10:27:48.579486Z",
2419
+ "shell.execute_reply.started": "2025-06-14T10:27:48.568492Z"
2420
+ }
2421
+ },
2422
+ "outputs": [
2423
+ {
2424
+ "data": {
2425
+ "text/plain": [
2426
+ "{'task_id': '7bd855d8-463d-4ed5-93ca-5fe35145f733',\n",
2427
+ " 'question': 'The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.',\n",
2428
+ " 'Level': '1',\n",
2429
+ " 'file_name': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}"
2430
+ ]
2431
+ },
2432
+ "execution_count": 108,
2433
+ "metadata": {},
2434
+ "output_type": "execute_result"
2435
+ }
2436
+ ],
2437
  "source": [
2438
+ "all_questions[18]"
2439
+ ]
2440
+ },
2441
+ {
2442
+ "cell_type": "code",
2443
+ "execution_count": 25,
2444
+ "id": "550688eb-8a32-4b89-85a8-607d74a1b240",
2445
+ "metadata": {
2446
+ "execution": {
2447
+ "iopub.execute_input": "2025-06-14T11:12:34.441436Z",
2448
+ "iopub.status.busy": "2025-06-14T11:12:34.440861Z",
2449
+ "iopub.status.idle": "2025-06-14T11:13:00.795896Z",
2450
+ "shell.execute_reply": "2025-06-14T11:13:00.792624Z",
2451
+ "shell.execute_reply.started": "2025-06-14T11:12:34.441383Z"
2452
+ },
2453
+ "scrolled": true
2454
+ },
2455
+ "outputs": [
2456
+ {
2457
+ "data": {
2458
+ "text/plain": [
2459
+ "{'messages': [HumanMessage(content='Complete the following task: The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.. Relevant file: 7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx', additional_kwargs={}, response_metadata={}, id='9c810520-a1cd-40b2-914e-8fd2f3e240c5'),\n",
2460
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_zp8tBlFZ1vOSnYqFQvbsgbi6', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1104, 'total_tokens': 1142, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1024}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6JmHXTRb96TEdmDFjOpQVDXjGH', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--83bd6bab-e401-4383-9511-e4fe72002375-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_zp8tBlFZ1vOSnYqFQvbsgbi6', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1104, 'output_tokens': 38, 'total_tokens': 1142, 'input_token_details': {'audio': 0, 'cache_read': 1024}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2461
+ " ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='8b1777a8-387e-417a-a9eb-9af103d701e1', tool_call_id='call_zp8tBlFZ1vOSnYqFQvbsgbi6'),\n",
2462
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_T5jtJJqyz0zpCvMNwcRZtuqV', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1439, 'total_tokens': 1477, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6KCZFLq93YDUtQHAG8vGABGAoa', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--ad40832c-c24b-43b5-af46-4eb0a3945439-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_T5jtJJqyz0zpCvMNwcRZtuqV', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1439, 'output_tokens': 38, 'total_tokens': 1477, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2463
+ " ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='130e5ee1-fd7e-4120-8084-bcebf8d15a9c', tool_call_id='call_T5jtJJqyz0zpCvMNwcRZtuqV'),\n",
2464
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_IwlcalqVatpwkLnAbK29EVr3', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1774, 'total_tokens': 1812, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1408}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6Lz0gaVXIFsxcfcisuIKWlLNLI', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--affe2bdb-f526-4bed-aae4-35cfc208b61a-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_IwlcalqVatpwkLnAbK29EVr3', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1774, 'output_tokens': 38, 'total_tokens': 1812, 'input_token_details': {'audio': 0, 'cache_read': 1408}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2465
+ " ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='ca8399cc-6d48-43df-b781-c8f064154532', tool_call_id='call_IwlcalqVatpwkLnAbK29EVr3'),\n",
2466
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_GIEGGs5RtOIT92ajgPAJARpt', 'function': {'arguments': '{\"code_str\":\"import pandas as pd\\\\n\\\\n# Data extracted from the Excel file\\\\ndata = {\\'Location\\': [\\'Pinebrook\\', \\'Wharvton\\', \\'Sagrada\\', \\'Algrimand\\', \\'Marztep\\', \\'San Cecelia\\', \\'Pimento\\', \\'Tinseles\\', \\'Rosdale\\'],\\\\n \\'Burgers\\': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007],\\\\n \\'Hot Dogs\\': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009],\\\\n \\'Salads\\': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021],\\\\n \\'Fries\\': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989],\\\\n \\'Ice Cream\\': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005],\\\\n \\'Soda\\': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\\\\n\\\\ndf = pd.DataFrame(data)\\\\n\\\\n# Prices for each item (assuming these are the prices in USD)\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00, \\'Soda\\': 1.50}\\\\n\\\\n# Calculate total sales for food items (excluding drinks which is Soda)\\\\nfood_items = [\\'Burgers\\', \\'Hot Dogs\\', \\'Salads\\', \\'Fries\\', \\'Ice Cream\\']\\\\n\\\\n# Calculate total sales\\\\ntotal_sales = 0\\\\nfor item in food_items:\\\\n total_sales += (df[item].sum() * prices[item])\\\\n\\\\nround(total_sales, 2)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 506, 'prompt_tokens': 2109, 'total_tokens': 2615, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1792}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6MtqSxoV60bBEmbp8dzJKWyM3Q', 'service_tier': 'default', 'finish_reason': 'tool_calls', 
'logprobs': None}, id='run--33c7ee3a-f032-4f24-864b-b773673199db-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': \"import pandas as pd\\n\\n# Data extracted from the Excel file\\ndata = {'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'],\\n 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007],\\n 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009],\\n 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021],\\n 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989],\\n 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005],\\n 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\\n\\ndf = pd.DataFrame(data)\\n\\n# Prices for each item (assuming these are the prices in USD)\\nprices = {'Burgers': 5.50, 'Hot Dogs': 3.75, 'Salads': 4.25, 'Fries': 2.50, 'Ice Cream': 3.00, 'Soda': 1.50}\\n\\n# Calculate total sales for food items (excluding drinks which is Soda)\\nfood_items = ['Burgers', 'Hot Dogs', 'Salads', 'Fries', 'Ice Cream']\\n\\n# Calculate total sales\\ntotal_sales = 0\\nfor item in food_items:\\n total_sales += (df[item].sum() * prices[item])\\n\\nround(total_sales, 2)\"}, 'id': 'call_GIEGGs5RtOIT92ajgPAJARpt', 'type': 'tool_call'}], usage_metadata={'input_tokens': 2109, 'output_tokens': 506, 'total_tokens': 2615, 'input_token_details': {'audio': 0, 'cache_read': 1792}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2467
+ " ToolMessage(content='Error: InterpreterError(\"Code execution failed at line \\'import pandas as pd\\' due to: InterpreterError: Import of pandas is not allowed. Authorized imports are: [\\'re\\', \\'statistics\\', \\'queue\\', \\'datetime\\', \\'numpy.*\\', \\'itertools\\', \\'math\\', \\'unicodedata\\', \\'stat\\', \\'collections\\', \\'random\\', \\'polars.*\\', \\'time\\']\")\\n Please fix your mistakes.', name='python_executor', id='56c8bfc0-165e-4ca3-b2c8-f19a71728d3f', tool_call_id='call_GIEGGs5RtOIT92ajgPAJARpt', status='error'),\n",
2468
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_ukghn9nyQv0MpZ2lPAnpdfjK', 'function': {'arguments': '{\"code_str\":\"import numpy as np\\\\n\\\\n# Data from the Excel file\\\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\\\n\\\\n# Prices for each food item\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\\\n\\\\n# Calculate total sales for each food item\\\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\\\n\\\\n# Calculate total sales from food\\\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\\\n\\\\nround(total_sales_food, 2)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 451, 'prompt_tokens': 2707, 'total_tokens': 3158, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 2048}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6TIuM5izp01LjdF5U2znulHHxe', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--880f8528-bebd-4f78-8c4c-1dc05df990af-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': \"import numpy 
as np\\n\\n# Data from the Excel file\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\n\\n# Prices for each food item\\nprices = {'Burgers': 5.50, 'Hot Dogs': 3.75, 'Salads': 4.25, 'Fries': 2.50, 'Ice Cream': 3.00}\\n\\n# Calculate total sales for each food item\\ntotal_burgers = np.sum(burgers) * prices['Burgers']\\ntotal_hot_dogs = np.sum(hot_dogs) * prices['Hot Dogs']\\ntotal_salads = np.sum(salads) * prices['Salads']\\ntotal_fries = np.sum(fries) * prices['Fries']\\ntotal_ice_cream = np.sum(ice_cream) * prices['Ice Cream']\\n\\n# Calculate total sales from food\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\n\\nround(total_sales_food, 2)\"}, 'id': 'call_ukghn9nyQv0MpZ2lPAnpdfjK', 'type': 'tool_call'}], usage_metadata={'input_tokens': 2707, 'output_tokens': 451, 'total_tokens': 3158, 'input_token_details': {'audio': 0, 'cache_read': 2048}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2469
+ " ToolMessage(content='Error: InterpreterError(\"Code execution failed at line \\'round(total_sales_food, 2)\\' due to: InterpreterError: Forbidden function evaluation: \\'round\\' is not among the explicitly allowed tools or defined/imported in the preceding code\")\\n Please fix your mistakes.', name='python_executor', id='bae4bb38-046c-4c36-8f26-1daf4c187b61', tool_call_id='call_ukghn9nyQv0MpZ2lPAnpdfjK', status='error'),\n",
2470
+ " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_XpqHUTDZzi8Rkuw2ubo53CZJ', 'function': {'arguments': '{\"code_str\":\"import numpy as np\\\\n\\\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\\\n\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\\\n\\\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\\\n\\\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\\\n\\\\n\\\\\"{:.2f}\\\\\".format(total_sales_food)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 427, 'prompt_tokens': 3221, 'total_tokens': 3648, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1024}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6ayTTFSltIIg72OsMWD12m59Y4', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--16bcd06a-ffb1-4adf-8528-0a2b47b046e1-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': 'import numpy as np\\n\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\nhot_dogs = np.array([1999, 2008, 2022, 
1971, 2016, 2010, 1999, 1969, 2009])\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\n\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\n\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\n\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\n\\n\"{:.2f}\".format(total_sales_food)'}, 'id': 'call_XpqHUTDZzi8Rkuw2ubo53CZJ', 'type': 'tool_call'}], usage_metadata={'input_tokens': 3221, 'output_tokens': 427, 'total_tokens': 3648, 'input_token_details': {'audio': 0, 'cache_read': 1024}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
2471
+ " ToolMessage(content='340090.25', name='python_executor', id='70c25b8c-9bca-44c5-9f8e-a786d3641a68', tool_call_id='call_XpqHUTDZzi8Rkuw2ubo53CZJ'),\n",
2472
+ " AIMessage(content='340090.25', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 3660, 'total_tokens': 3665, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 2944}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6g8VihYdHuX0NzmGFcCVxAbDlh', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--663ce019-aac1-4293-b368-8d8c2a8ed302-0', usage_metadata={'input_tokens': 3660, 'output_tokens': 5, 'total_tokens': 3665, 'input_token_details': {'audio': 0, 'cache_read': 2944}, 'output_token_details': {'audio': 0, 'reasoning': 0}})],\n",
2473
+ " 'structured_response': AnswerFormat(thoughts='I calculated the total sales for each food item by summing the quantities sold across all locations and multiplying by their respective prices. Then, I summed these totals to get the total sales from food items only, excluding drinks. The final total sales amount is formatted to two decimal places as requested.', answer='340090.25')}"
2474
+ ]
2475
+ },
2476
+ "execution_count": 25,
2477
+ "metadata": {},
2478
+ "output_type": "execute_result"
2479
+ }
2480
+ ],
2481
+ "source": [
2482
+ "agent.invoke(\n",
2483
  " input={\n",
2484
+ " \"messages\": f\"\"\"Complete the following task: {all_questions[18][\"question\"]}. Relevant file: {\n",
2485
+ " all_questions[18][\"file_name\"]\n",
2486
+ " if all_questions[18][\"file_name\"]\n",
2487
+ " else \"There's no relevant file to use.\"\n",
2488
  " }\"\"\"\n",
2489
  " }\n",
2490
+ " )"
2491
+ ]
2492
+ },
2493
+ {
2494
+ "cell_type": "code",
2495
+ "execution_count": 40,
2496
+ "id": "a525780a",
2497
+ "metadata": {
2498
+ "execution": {
2499
+ "iopub.execute_input": "2025-06-14T09:41:31.611239Z",
2500
+ "iopub.status.busy": "2025-06-14T09:41:31.608132Z",
2501
+ "iopub.status.idle": "2025-06-14T09:41:31.627482Z",
2502
+ "shell.execute_reply": "2025-06-14T09:41:31.625052Z",
2503
+ "shell.execute_reply.started": "2025-06-14T09:41:31.611107Z"
2504
+ }
2505
+ },
2506
+ "outputs": [],
2507
+ "source": [
2508
+ "# async def run_all_questions(agent, all_questions):\n",
2509
+ "# \"\"\"Run agent.ainvoke concurrently for all questions.\n",
2510
+ "\n",
2511
+ "# Args:\n",
2512
+ "# agent: The LangChain agent with an ainvoke method.\n",
2513
+ "# all_questions (list): List of question dicts.\n",
2514
+ "\n",
2515
+ "# Returns:\n",
2516
+ "# list: List of agent responses.\n",
2517
+ "# \"\"\"\n",
2518
+ "# tasks = [\n",
2519
+ "# agent.ainvoke(\n",
2520
+ "# input={\n",
2521
+ "# \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
2522
+ "# q[\"file_name\"] if q[\"file_name\"] else \"There's no relevant file to use.\"\n",
2523
+ "# }\"\"\"\n",
2524
+ "# }\n",
2525
+ "# )\n",
2526
+ "# for q in all_questions\n",
2527
+ "# ]\n",
2528
+ "# try:\n",
2529
+ "# return await asyncio.gather(*tasks)\n",
2530
+ "# except GraphRecursionError as err:\n",
2531
+ "# print(\"❌ Agent stopped due to max iterations.\")\n",
2532
+ "# print(f\"Error: {err}\")\n",
2533
+ "# # return \"Recursion Limit Reached.\"\n",
2534
  "\n"
2535
  ]
2536
  },
 
2541
  "metadata": {},
2542
  "outputs": [],
2543
  "source": [
2544
+ "# responses = asyncio.run(run_all_questions(agent, all_questions))"
2545
  ]
2546
  },
2547
  {
2548
  "cell_type": "code",
2549
+ "execution_count": 43,
2550
+ "id": "12e0f74d-e565-47bb-bf6d-0ba11570b451",
2551
+ "metadata": {
2552
+ "execution": {
2553
+ "iopub.execute_input": "2025-06-14T09:42:47.636798Z",
2554
+ "iopub.status.busy": "2025-06-14T09:42:47.635229Z",
2555
+ "iopub.status.idle": "2025-06-14T09:42:47.649310Z",
2556
+ "shell.execute_reply": "2025-06-14T09:42:47.647870Z",
2557
+ "shell.execute_reply.started": "2025-06-14T09:42:47.636728Z"
2558
+ }
2559
+ },
2560
  "outputs": [
2561
  {
2562
  "data": {
2563
  "text/plain": [
2564
+ "NoneType"
2565
  ]
2566
  },
2567
+ "execution_count": 43,
2568
  "metadata": {},
2569
  "output_type": "execute_result"
2570
  }
2571
  ],
2572
  "source": [
2573
+ "type(responses)"
2574
  ]
2575
  },
2576
  {
2577
  "cell_type": "code",
2578
+ "execution_count": 35,
2579
+ "id": "d28d2d35",
2580
+ "metadata": {
2581
+ "execution": {
2582
+ "iopub.execute_input": "2025-06-14T09:38:24.184443Z",
2583
+ "iopub.status.busy": "2025-06-14T09:38:24.183596Z",
2584
+ "iopub.status.idle": "2025-06-14T09:38:24.554423Z",
2585
+ "shell.execute_reply": "2025-06-14T09:38:24.552528Z",
2586
+ "shell.execute_reply.started": "2025-06-14T09:38:24.184340Z"
2587
+ }
2588
+ },
2589
+ "outputs": [
2590
+ {
2591
+ "ename": "TypeError",
2592
+ "evalue": "string indices must be integers, not 'str'",
2593
+ "output_type": "error",
2594
+ "traceback": [
2595
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
2596
+ "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
2597
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[35]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mresponses\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mstructured_response\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m.answer\n",
2598
+ "\u001b[31mTypeError\u001b[39m: string indices must be integers, not 'str'"
2599
+ ]
2600
+ }
2601
+ ],
2602
  "source": [
2603
+ "responses[2]['structured_response'].answer"
 
2604
  ]
2605
  },
2606
  {
 
2613
  },
2614
  {
2615
  "cell_type": "code",
2616
+ "execution_count": 23,
2617
  "id": "94449302",
2618
+ "metadata": {
2619
+ "execution": {
2620
+ "iopub.execute_input": "2025-06-14T09:36:13.544492Z",
2621
+ "iopub.status.busy": "2025-06-14T09:36:13.541377Z",
2622
+ "iopub.status.idle": "2025-06-14T09:36:13.558603Z",
2623
+ "shell.execute_reply": "2025-06-14T09:36:13.557527Z",
2624
+ "shell.execute_reply.started": "2025-06-14T09:36:13.544415Z"
2625
+ },
2626
+ "scrolled": true
2627
+ },
2628
  "outputs": [
2629
  {
2630
  "data": {
2631
  "text/plain": [
2632
+ "{'task_id': '2d83110e-a098-4ebb-9987-066c06fa42d0',\n",
2633
+ " 'question': '.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',\n",
2634
  " 'Level': '1',\n",
2635
  " 'file_name': ''}"
2636
  ]
2637
  },
2638
+ "execution_count": 23,
2639
  "metadata": {},
2640
  "output_type": "execute_result"
2641
  }
2642
  ],
2643
  "source": [
2644
+ "all_questions[2]"
2645
  ]
2646
  },
2647
  {
 
2683
  ],
2684
  "metadata": {
2685
  "kernelspec": {
2686
+ "display_name": "Python 3 (ipykernel)",
2687
  "language": "python",
2688
  "name": "python3"
2689
  },
pyproject.toml CHANGED
@@ -6,6 +6,8 @@ dependencies = [
6
  "accelerate>=1.7.0",
7
  "av>=14.4.0",
8
  "fastexcel>=0.14.0",
 
 
9
  "huggingface-hub[hf-xet]>=0.32.6",
10
  "langchain-community>=0.3.25",
11
  "langchain-yt-dlp>=0.0.8",
 
6
  "accelerate>=1.7.0",
7
  "av>=14.4.0",
8
  "fastexcel>=0.14.0",
9
+ "gradio>=5.34.0",
10
+ "httpx>=0.28.1",
11
  "huggingface-hub[hf-xet]>=0.32.6",
12
  "langchain-community>=0.3.25",
13
  "langchain-yt-dlp>=0.0.8",
results_gpt_mini.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d24e0fcef15657f57ea2d5b1bc8de286cf1a0a0b1919fa19494e8c1359e66f9
3
+ size 486587
universal_agent.py CHANGED
@@ -27,7 +27,7 @@ nest_asyncio.apply()
27
  with open("all_questions.pkl", "rb") as f:
28
  all_questions = pickle.load(f)
29
  lang_model = init_chat_model(
30
- model="gpt-4.1-nano", model_provider="openai", temperature=0
31
  )
32
 
33
 
@@ -56,9 +56,9 @@ def read_excel_or_csv(filepath: str) -> str:
56
  return content_str
57
 
58
 
59
- def python_executor(filepath: str) -> Any:
60
  """Returns the output of a python code."""
61
- with open(filepath) as f:
62
  code = f.readlines()
63
  code_result = lang_model.generate(
64
  messages=[
@@ -76,6 +76,12 @@ def python_executor(filepath: str) -> Any:
76
  )
77
  return code_result.generations[0][0].text
78
 
 
 
 
 
 
 
79
 
80
  stt_tool = Tool.from_space(
81
  space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
@@ -203,12 +209,15 @@ You are an expert agent - please keep going until the user’s query is complete
203
 
204
 
205
  # Instructions
206
- - Carefully read and understand the task.
207
  - Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance "discograpy".
208
  - If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
209
  - You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.
210
  - Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
211
- - You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
 
 
 
212
  - If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
213
  - You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
214
  - Always verify your answers.
@@ -247,6 +256,8 @@ What's 2 +2 ?
247
  4
248
 
249
  """
 
 
250
  agent = create_react_agent(
251
  model=lang_model,
252
  tools=[
@@ -254,6 +265,7 @@ agent = create_react_agent(
254
  visit_web_page,
255
  read_excel_or_csv,
256
  python_executor,
 
257
  call_stt_tool,
258
  image_tool,
259
  youtube_video_tool,
@@ -262,56 +274,35 @@ agent = create_react_agent(
262
  prompt=SYSTEM_PROMPT,
263
  response_format=AnswerFormat,
264
  )
 
 
265
  # recursion_limit = 10
266
  # agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
267
- all_questions[0]
268
-
269
-
270
- # try:
271
- # answer = agent.invoke(
272
- # # answer = agent_w_recursion_limit.invoke(
273
- # input={
274
- # "messages": f"""Complete the following task: {all_questions[0]["question"]}. Relevant file: {
275
- # all_questions[0]["file_name"]
276
- # if all_questions[0]["file_name"]
277
- # else "There's no relevant file to use."
278
- # }"""
279
- # }
280
- # )
281
- # except GraphRecursionError:
282
- # print("❌ Agent stopped due to max iterations.")
283
- async def run_all_questions(agent, all_questions):
284
- """Run agent.ainvoke concurrently for all questions.
285
-
286
- Args:
287
- agent: The LangChain agent with an ainvoke method.
288
- all_questions (list): List of question dicts.
289
-
290
- Returns:
291
- list: List of agent responses.
292
- """
293
- tasks = [
294
- agent.ainvoke(
295
  input={
296
- "messages": f"""Complete the following task: {
297
- q["question"]
298
- }. Relevant file: {
299
  q["file_name"]
300
  if q["file_name"]
301
  else "There's no relevant file to use."
302
  }"""
303
  }
304
  )
305
- for q in all_questions
306
- ]
307
- try:
308
- return await asyncio.gather(*tasks)
309
- except GraphRecursionError as err:
310
  print("❌ Agent stopped due to max iterations.")
311
- print(f"Error: {err}")
312
- return "Recursion Limit Reached."
313
 
 
 
314
 
315
- responses = asyncio.run(run_all_questions(agent, all_questions))
316
- final_answers = [responses[i]["structured_response"].answer [for i in range(len(all_questions))]]
 
 
317
 
 
27
  with open("all_questions.pkl", "rb") as f:
28
  all_questions = pickle.load(f)
29
  lang_model = init_chat_model(
30
+ model="gpt-4.1-mini", model_provider="openai", temperature=0.2
31
  )
32
 
33
 
 
56
  return content_str
57
 
58
 
59
+ def python_code_interpreter(filepath: str) -> Any:
60
  """Returns the output of a python code."""
61
+ with open(filepath, "r") as f:
62
  code = f.readlines()
63
  code_result = lang_model.generate(
64
  messages=[
 
76
  )
77
  return code_result.generations[0][0].text
78
 
79
+ def python_executor(code_str:str) -> str:
80
+ """This executes python code. The code must be a string.
81
+ For any calculations always use numpy."""
82
+ lpe = LocalPythonExecutor(additional_authorized_imports=['polars.*', 'numpy.*'])
83
+ code_res = lpe(code_action=code_str)[0]
84
+ return code_res
85
 
86
  stt_tool = Tool.from_space(
87
  space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
 
209
 
210
 
211
  # Instructions
212
+ - Carefully read and understand the task. Sometimes the task might be a sentence reversed, so un reverse it first and then complete the task.
213
  - Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance "discograpy".
214
  - If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
215
  - You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.
216
  - Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
217
+ - You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
218
+ - If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.
219
+ - For any youtube related task use the `youtube_video_tool` and the input to the tool should be URL as a string along with the query.
220
+ - For any dataframe related tasks, always use the `read_excel_or_csv` tool.
221
  - If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
222
  - You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
223
  - Always verify your answers.
 
256
  4
257
 
258
  """
259
+
260
+
261
  agent = create_react_agent(
262
  model=lang_model,
263
  tools=[
 
265
  visit_web_page,
266
  read_excel_or_csv,
267
  python_executor,
268
+ python_code_interpreter,
269
  call_stt_tool,
270
  image_tool,
271
  youtube_video_tool,
 
274
  prompt=SYSTEM_PROMPT,
275
  response_format=AnswerFormat,
276
  )
277
+
278
+
279
  # recursion_limit = 10
280
  # agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
281
+ # all_questions[0]
282
+
283
+ results = []
284
+ for q in all_questions:
285
+ try:
286
+ answer = await agent.ainvoke(
287
+ # answer = agent_w_recursion_limit.invoke(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  input={
289
+ "messages": f"""Complete the following task: {q["question"]}. Relevant file: {
 
 
290
  q["file_name"]
291
  if q["file_name"]
292
  else "There's no relevant file to use."
293
  }"""
294
  }
295
  )
296
+ results.append(answer)
297
+ except GraphRecursionError:
 
 
 
298
  print("❌ Agent stopped due to max iterations.")
299
+ results.append(q["task_id"])
 
300
 
301
+ # with open("results_gpt_mini.pkl", "wb") as f:
302
+ # pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)
303
 
304
+ answers = [{"task_id":j['task_id'],
305
+ "submitted_answer": results[i]["structured_response"].answer
306
+ if isinstance(results[i], dict) else "No answer"}
307
+ for i,j in enumerate(all_questions)]
308
 
uv.lock CHANGED
@@ -3212,6 +3212,8 @@ dependencies = [
3212
  { name = "accelerate" },
3213
  { name = "av" },
3214
  { name = "fastexcel" },
 
 
3215
  { name = "huggingface-hub", extra = ["hf-xet"] },
3216
  { name = "langchain", extra = ["openai"] },
3217
  { name = "langchain-community" },
@@ -3242,6 +3244,8 @@ requires-dist = [
3242
  { name = "accelerate", specifier = ">=1.7.0" },
3243
  { name = "av", specifier = ">=14.4.0" },
3244
  { name = "fastexcel", specifier = ">=0.14.0" },
 
 
3245
  { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.32.6" },
3246
  { name = "langchain", extras = ["openai"], specifier = ">=0.3.25" },
3247
  { name = "langchain-community", specifier = ">=0.3.25" },
 
3212
  { name = "accelerate" },
3213
  { name = "av" },
3214
  { name = "fastexcel" },
3215
+ { name = "gradio" },
3216
+ { name = "httpx" },
3217
  { name = "huggingface-hub", extra = ["hf-xet"] },
3218
  { name = "langchain", extra = ["openai"] },
3219
  { name = "langchain-community" },
 
3244
  { name = "accelerate", specifier = ">=1.7.0" },
3245
  { name = "av", specifier = ">=14.4.0" },
3246
  { name = "fastexcel", specifier = ">=0.14.0" },
3247
+ { name = "gradio", specifier = ">=5.34.0" },
3248
+ { name = "httpx", specifier = ">=0.28.1" },
3249
  { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.32.6" },
3250
  { name = "langchain", extras = ["openai"], specifier = ">=0.3.25" },
3251
  { name = "langchain-community", specifier = ">=0.3.25" },