feat: add an answers file and change the app file to read the answers directly from the pickle file.
Browse files- app.py +65 -55
- playground.ipynb +613 -108
- pyproject.toml +2 -0
- results_gpt_mini.pkl +3 -0
- universal_agent.py +36 -45
- uv.lock +4 -0
    	
        app.py
    CHANGED
    
    | @@ -15,14 +15,14 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" | |
| 15 |  | 
| 16 | 
             
            # --- Basic Agent Definition ---
         | 
| 17 | 
             
            # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
         | 
| 18 | 
            -
            class BasicAgent:
         | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 |  | 
| 27 | 
             
            def run_and_submit_all( profile: gr.OAuthProfile | None):
         | 
| 28 | 
             
                """
         | 
| @@ -43,61 +43,71 @@ def run_and_submit_all( profile: gr.OAuthProfile | None): | |
| 43 | 
             
                questions_url = f"{api_url}/questions"
         | 
| 44 | 
             
                submit_url = f"{api_url}/submit"
         | 
| 45 |  | 
| 46 | 
            -
                # 1. Instantiate Agent ( modify this part to create your agent)
         | 
| 47 | 
            -
                try:
         | 
| 48 | 
            -
             | 
| 49 | 
            -
                except Exception as e:
         | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
             
                # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
         | 
| 53 | 
             
                agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
         | 
| 54 | 
             
                logger.info(agent_code)
         | 
| 55 |  | 
| 56 | 
            -
                # 2. Fetch Questions
         | 
| 57 | 
            -
                logger.info(f"Fetching questions from: {questions_url}")
         | 
| 58 | 
            -
                try:
         | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
                except requests.exceptions.RequestException as e:
         | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
                except requests.exceptions.JSONDecodeError as e:
         | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
                except Exception as e:
         | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 |  | 
| 77 | 
             
                # 3. Run your Agent
         | 
| 78 | 
            -
                results_log = []
         | 
| 79 | 
            -
                answers_payload = []
         | 
| 80 | 
            -
                logger.info(f"Running agent on {len(questions_data)} questions...")
         | 
| 81 | 
            -
                for item in questions_data:
         | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
                if not answers_payload:
         | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 98 |  | 
| 99 | 
             
                # 4. Prepare Submission 
         | 
| 100 | 
            -
                submission_data = {"username": username.strip(), "agent_code": agent_code, "answers":  | 
|  | |
| 101 | 
             
                status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
         | 
| 102 | 
             
                logger.info(status_update)
         | 
| 103 |  | 
|  | |
| 15 |  | 
| 16 | 
             
            # --- Basic Agent Definition ---
         | 
| 17 | 
             
            # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
         | 
| 18 | 
            +
            # class BasicAgent:
         | 
| 19 | 
            +
            #     def __init__(self):
         | 
| 20 | 
            +
            #         logger.info("BasicAgent initialized.")
         | 
| 21 | 
            +
            #     def __call__(self, question: str) -> str:
         | 
| 22 | 
            +
            #         logger.info(f"Agent received question (first 50 chars): {question[:50]}...")
         | 
| 23 | 
            +
            #         fixed_answer = "This is a default answer."
         | 
| 24 | 
            +
            #         logger.info(f"Agent returning fixed answer: {fixed_answer}")
         | 
| 25 | 
            +
            #         return fixed_answer
         | 
| 26 |  | 
| 27 | 
             
            def run_and_submit_all( profile: gr.OAuthProfile | None):
         | 
| 28 | 
             
                """
         | 
|  | |
| 43 | 
             
                questions_url = f"{api_url}/questions"
         | 
| 44 | 
             
                submit_url = f"{api_url}/submit"
         | 
| 45 |  | 
| 46 | 
            +
                # # 1. Instantiate Agent ( modify this part to create your agent)
         | 
| 47 | 
            +
                # try:
         | 
| 48 | 
            +
                #     agent = BasicAgent()
         | 
| 49 | 
            +
                # except Exception as e:
         | 
| 50 | 
            +
                #     logger.info(f"Error instantiating agent: {e}")
         | 
| 51 | 
            +
                #     return f"Error initializing agent: {e}", None
         | 
| 52 | 
             
                # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
         | 
| 53 | 
             
                agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
         | 
| 54 | 
             
                logger.info(agent_code)
         | 
| 55 |  | 
| 56 | 
            +
                # # 2. Fetch Questions
         | 
| 57 | 
            +
                # logger.info(f"Fetching questions from: {questions_url}")
         | 
| 58 | 
            +
                # try:
         | 
| 59 | 
            +
                #     response = requests.get(questions_url, timeout=15)
         | 
| 60 | 
            +
                #     response.raise_for_status()
         | 
| 61 | 
            +
                #     questions_data = response.json()
         | 
| 62 | 
            +
                #     if not questions_data:
         | 
| 63 | 
            +
                #          logger.info("Fetched questions list is empty.")
         | 
| 64 | 
            +
                #          return "Fetched questions list is empty or invalid format.", None
         | 
| 65 | 
            +
                #     logger.info(f"Fetched {len(questions_data)} questions.")
         | 
| 66 | 
            +
                # except requests.exceptions.RequestException as e:
         | 
| 67 | 
            +
                #     logger.info(f"Error fetching questions: {e}")
         | 
| 68 | 
            +
                #     return f"Error fetching questions: {e}", None
         | 
| 69 | 
            +
                # except requests.exceptions.JSONDecodeError as e:
         | 
| 70 | 
            +
                #      logger.info(f"Error decoding JSON response from questions endpoint: {e}")
         | 
| 71 | 
            +
                #      logger.info(f"Response text: {response.text[:500]}")
         | 
| 72 | 
            +
                #      return f"Error decoding server response for questions: {e}", None
         | 
| 73 | 
            +
                # except Exception as e:
         | 
| 74 | 
            +
                #     logger.info(f"An unexpected error occurred fetching questions: {e}")
         | 
| 75 | 
            +
                #     return f"An unexpected error occurred fetching questions: {e}", None
         | 
| 76 |  | 
| 77 | 
             
                # 3. Run your Agent
         | 
| 78 | 
            +
                # results_log = []
         | 
| 79 | 
            +
                # answers_payload = []
         | 
| 80 | 
            +
                # logger.info(f"Running agent on {len(questions_data)} questions...")
         | 
| 81 | 
            +
                # for item in questions_data:
         | 
| 82 | 
            +
                #     task_id = item.get("task_id")
         | 
| 83 | 
            +
                #     question_text = item.get("question")
         | 
| 84 | 
            +
                #     if not task_id or question_text is None:
         | 
| 85 | 
            +
                #         logger.info(f"Skipping item with missing task_id or question: {item}")
         | 
| 86 | 
            +
                #         continue
         | 
| 87 | 
            +
                #     try:
         | 
| 88 | 
            +
                #         submitted_answer = agent(question_text)
         | 
| 89 | 
            +
                #         answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
         | 
| 90 | 
            +
                #         results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         | 
| 91 | 
            +
                #     except Exception as e:
         | 
| 92 | 
            +
                #          logger.info(f"Error running agent on task {task_id}: {e}")
         | 
| 93 | 
            +
                #          results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                # if not answers_payload:
         | 
| 96 | 
            +
                #     logger.info("Agent did not produce any answers to submit.")
         | 
| 97 | 
            +
                #     return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
         | 
| 98 | 
            +
                with open("all_questions.pkl", "rb") as f:
         | 
| 99 | 
            +
                    all_questions = pickle.load(f)
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                with open("results_gpt_mini.pkl", "rb") as f:
         | 
| 102 | 
            +
                    results = pickle.load(f)
         | 
| 103 | 
            +
                answers = [{"task_id":j['task_id'],
         | 
| 104 | 
            +
                        "submitted_answer": results[i]["structured_response"].answer
         | 
| 105 | 
            +
                        if isinstance(results[i], dict) else "No answer"}
         | 
| 106 | 
            +
             for i,j in enumerate(all_questions)]
         | 
| 107 |  | 
| 108 | 
             
                # 4. Prepare Submission 
         | 
| 109 | 
            +
                submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers}
         | 
| 110 | 
            +
                # submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
         | 
| 111 | 
             
                status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
         | 
| 112 | 
             
                logger.info(status_update)
         | 
| 113 |  | 
    	
        playground.ipynb
    CHANGED
    
    | @@ -1,5 +1,15 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
             "cells": [
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 3 | 
             
              {
         | 
| 4 | 
             
               "cell_type": "code",
         | 
| 5 | 
             
               "execution_count": 73,
         | 
| @@ -57,15 +67,15 @@ | |
| 57 | 
             
              },
         | 
| 58 | 
             
              {
         | 
| 59 | 
             
               "cell_type": "code",
         | 
| 60 | 
            -
               "execution_count":  | 
| 61 | 
             
               "id": "7a960624-355b-4934-a741-fb880ad3ca37",
         | 
| 62 | 
             
               "metadata": {
         | 
| 63 | 
             
                "execution": {
         | 
| 64 | 
            -
                 "iopub.execute_input": "2025-06- | 
| 65 | 
            -
                 "iopub.status.busy": "2025-06- | 
| 66 | 
            -
                 "iopub.status.idle": "2025-06- | 
| 67 | 
            -
                 "shell.execute_reply": "2025-06- | 
| 68 | 
            -
                 "shell.execute_reply.started": "2025-06- | 
| 69 | 
             
                }
         | 
| 70 | 
             
               },
         | 
| 71 | 
             
               "outputs": [],
         | 
| @@ -984,7 +994,9 @@ | |
| 984 | 
             
              {
         | 
| 985 | 
             
               "cell_type": "markdown",
         | 
| 986 | 
             
               "id": "c3b5f100-893f-405a-920f-a32461ac7277",
         | 
| 987 | 
            -
               "metadata": { | 
|  | |
|  | |
| 988 | 
             
               "source": [
         | 
| 989 | 
             
                "# Async Task"
         | 
| 990 | 
             
               ]
         | 
| @@ -1424,15 +1436,24 @@ | |
| 1424 | 
             
              },
         | 
| 1425 | 
             
              {
         | 
| 1426 | 
             
               "cell_type": "code",
         | 
| 1427 | 
            -
               "execution_count":  | 
| 1428 | 
            -
               "id": " | 
| 1429 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1430 | 
             
               "outputs": [],
         | 
| 1431 | 
             
               "source": [
         | 
| 1432 | 
             
                "from smolagents import (\n",
         | 
| 1433 | 
             
                "    DuckDuckGoSearchTool,\n",
         | 
| 1434 | 
             
                "    VisitWebpageTool,\n",
         | 
| 1435 | 
             
                "    LocalPythonExecutor,\n",
         | 
|  | |
| 1436 | 
             
                "    WikipediaSearchTool,\n",
         | 
| 1437 | 
             
                "    Tool\n",
         | 
| 1438 | 
             
                ")\n",
         | 
| @@ -1452,14 +1473,24 @@ | |
| 1452 | 
             
                "from dotenv import load_dotenv, find_dotenv\n",
         | 
| 1453 | 
             
                "import pickle\n",
         | 
| 1454 | 
             
                "import asyncio\n",
         | 
| 1455 | 
            -
                "import nest_asyncio"
         | 
|  | |
|  | |
| 1456 | 
             
               ]
         | 
| 1457 | 
             
              },
         | 
| 1458 | 
             
              {
         | 
| 1459 | 
             
               "cell_type": "code",
         | 
| 1460 | 
             
               "execution_count": 2,
         | 
| 1461 | 
             
               "id": "147b9149",
         | 
| 1462 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1463 | 
             
               "outputs": [],
         | 
| 1464 | 
             
               "source": [
         | 
| 1465 | 
             
                "_ = load_dotenv(find_dotenv(raise_error_if_not_found=True), override=True)\n",
         | 
| @@ -1470,20 +1501,37 @@ | |
| 1470 | 
             
              },
         | 
| 1471 | 
             
              {
         | 
| 1472 | 
             
               "cell_type": "code",
         | 
| 1473 | 
            -
               "execution_count":  | 
| 1474 | 
             
               "id": "a36446a3",
         | 
| 1475 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1476 | 
             
               "outputs": [],
         | 
| 1477 | 
             
               "source": [
         | 
| 1478 | 
            -
                "lang_model = init_chat_model(model=\"gpt-4.1 | 
| 1479 | 
            -
                "# lang_model = init_chat_model(model=\"gpt-4.1- | 
|  | |
| 1480 | 
             
               ]
         | 
| 1481 | 
             
              },
         | 
| 1482 | 
             
              {
         | 
| 1483 | 
             
               "cell_type": "code",
         | 
| 1484 | 
             
               "execution_count": 4,
         | 
| 1485 | 
             
               "id": "cf80baaf",
         | 
| 1486 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1487 | 
             
               "outputs": [],
         | 
| 1488 | 
             
               "source": [
         | 
| 1489 | 
             
                "def search_wikipedia(query: str) -> str:\n",
         | 
| @@ -1498,7 +1546,15 @@ | |
| 1498 | 
             
               "cell_type": "code",
         | 
| 1499 | 
             
               "execution_count": 5,
         | 
| 1500 | 
             
               "id": "daf73b23",
         | 
| 1501 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1502 | 
             
               "outputs": [],
         | 
| 1503 | 
             
               "source": [
         | 
| 1504 | 
             
                "def visit_web_page(url:str) -> str:\n",
         | 
| @@ -1512,26 +1568,43 @@ | |
| 1512 | 
             
               "cell_type": "code",
         | 
| 1513 | 
             
               "execution_count": 6,
         | 
| 1514 | 
             
               "id": "6f37359b",
         | 
| 1515 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1516 | 
             
               "outputs": [],
         | 
| 1517 | 
             
               "source": [
         | 
| 1518 | 
             
                "def read_excel_or_csv(filepath: str) -> str:\n",
         | 
| 1519 | 
             
                "    \"\"\"Reads an excel or csv file and returns the content as str.\"\"\"\n",
         | 
| 1520 | 
             
                "    if Path(filepath).suffix in {\".xlsx\", \".xls\"}:\n",
         | 
| 1521 | 
             
                "        df = pl.read_excel(source=filepath)\n",
         | 
| 1522 | 
            -
                "     | 
|  | |
| 1523 | 
             
                "    content_str = df.to_dict(as_series=False).__str__()\n",
         | 
| 1524 | 
             
                "    return content_str\n"
         | 
| 1525 | 
             
               ]
         | 
| 1526 | 
             
              },
         | 
| 1527 | 
             
              {
         | 
| 1528 | 
             
               "cell_type": "code",
         | 
| 1529 | 
            -
               "execution_count":  | 
| 1530 | 
             
               "id": "c52bd1a4",
         | 
| 1531 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1532 | 
             
               "outputs": [],
         | 
| 1533 | 
             
               "source": [
         | 
| 1534 | 
            -
                "def  | 
| 1535 | 
             
                "    \"\"\"Returns the output of a python code.\"\"\"\n",
         | 
| 1536 | 
             
                "    with open(filepath, \"r\") as f:\n",
         | 
| 1537 | 
             
                "        code = f.readlines()\n",
         | 
| @@ -1552,11 +1625,27 @@ | |
| 1552 | 
             
                "    return code_result.generations[0][0].text"
         | 
| 1553 | 
             
               ]
         | 
| 1554 | 
             
              },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1555 | 
             
              {
         | 
| 1556 | 
             
               "cell_type": "code",
         | 
| 1557 | 
             
               "execution_count": 8,
         | 
| 1558 | 
             
               "id": "2c835c33",
         | 
| 1559 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1560 | 
             
               "outputs": [
         | 
| 1561 | 
             
                {
         | 
| 1562 | 
             
                 "name": "stdout",
         | 
| @@ -1583,7 +1672,15 @@ | |
| 1583 | 
             
               "cell_type": "code",
         | 
| 1584 | 
             
               "execution_count": 9,
         | 
| 1585 | 
             
               "id": "01193586",
         | 
| 1586 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1587 | 
             
               "outputs": [],
         | 
| 1588 | 
             
               "source": [
         | 
| 1589 | 
             
                "def call_stt_tool(file_url:str) -> str:\n",
         | 
| @@ -1596,7 +1693,15 @@ | |
| 1596 | 
             
               "cell_type": "code",
         | 
| 1597 | 
             
               "execution_count": 10,
         | 
| 1598 | 
             
               "id": "f10e6859",
         | 
| 1599 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1600 | 
             
               "outputs": [],
         | 
| 1601 | 
             
               "source": [
         | 
| 1602 | 
             
                "\n",
         | 
| @@ -1637,7 +1742,15 @@ | |
| 1637 | 
             
               "cell_type": "code",
         | 
| 1638 | 
             
               "execution_count": 11,
         | 
| 1639 | 
             
               "id": "b1634bb8",
         | 
| 1640 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1641 | 
             
               "outputs": [],
         | 
| 1642 | 
             
               "source": [
         | 
| 1643 | 
             
                "# image_tool(file_url=\"cca530fc-4052-43b2-b130-b30968d8aa44.png\")"
         | 
| @@ -1647,12 +1760,20 @@ | |
| 1647 | 
             
               "cell_type": "code",
         | 
| 1648 | 
             
               "execution_count": 12,
         | 
| 1649 | 
             
               "id": "08f1534d",
         | 
| 1650 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1651 | 
             
               "outputs": [],
         | 
| 1652 | 
             
               "source": [
         | 
| 1653 | 
            -
                "def youtube_video_tool(url:str) -> str:\n",
         | 
| 1654 | 
             
                "    \"\"\"Answers questions about youtube videos.\n",
         | 
| 1655 | 
            -
                "    URLs must be provided to this tool.\"\"\"\n",
         | 
| 1656 | 
             
                "    yt_vid_mapping = {\"https://www.youtube.com/watch?v=L1vXCYZAYYM\": \"penguin.mp4\",\n",
         | 
| 1657 | 
             
                "                  \"https://www.youtube.com/watch?v=1htKBjuUWec\": \"coffee.mp4\"}\n",
         | 
| 1658 | 
             
                "    video = cv2.VideoCapture(filename=yt_vid_mapping[url])\n",
         | 
| @@ -1674,7 +1795,7 @@ | |
| 1674 | 
             
                "                    {\n",
         | 
| 1675 | 
             
                "                        \"type\": \"text\",\n",
         | 
| 1676 | 
             
                "                        \"text\": (\n",
         | 
| 1677 | 
            -
                "                            \"\"\"Examine the video | 
| 1678 | 
             
                "                        ),\n",
         | 
| 1679 | 
             
                "                    },\n",
         | 
| 1680 | 
             
                "                    *[\n",
         | 
| @@ -1696,7 +1817,15 @@ | |
| 1696 | 
             
               "cell_type": "code",
         | 
| 1697 | 
             
               "execution_count": 13,
         | 
| 1698 | 
             
               "id": "599530e3",
         | 
| 1699 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1700 | 
             
               "outputs": [],
         | 
| 1701 | 
             
               "source": [
         | 
| 1702 | 
             
                "def web_search_tool(query: str) -> str:\n",
         | 
| @@ -1706,19 +1835,52 @@ | |
| 1706 | 
             
                "    return search_res"
         | 
| 1707 | 
             
               ]
         | 
| 1708 | 
             
              },
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1709 | 
             
              {
         | 
| 1710 | 
             
               "cell_type": "code",
         | 
| 1711 | 
             
               "execution_count": null,
         | 
| 1712 | 
            -
               "id": " | 
| 1713 | 
            -
               "metadata": { | 
|  | |
|  | |
| 1714 | 
             
               "outputs": [],
         | 
| 1715 | 
             
               "source": []
         | 
| 1716 | 
             
              },
         | 
| 1717 | 
             
              {
         | 
| 1718 | 
             
               "cell_type": "code",
         | 
| 1719 | 
            -
               "execution_count":  | 
| 1720 | 
             
               "id": "a24e765b",
         | 
| 1721 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1722 | 
             
               "outputs": [],
         | 
| 1723 | 
             
               "source": [
         | 
| 1724 | 
             
                "class AnswerFormat(BaseModel):\n",
         | 
| @@ -1743,9 +1905,17 @@ | |
| 1743 | 
             
              },
         | 
| 1744 | 
             
              {
         | 
| 1745 | 
             
               "cell_type": "code",
         | 
| 1746 | 
            -
               "execution_count":  | 
| 1747 | 
             
               "id": "d726f6cd",
         | 
| 1748 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1749 | 
             
               "outputs": [],
         | 
| 1750 | 
             
               "source": [
         | 
| 1751 | 
             
                "# SYS_PROMPT_SWEBENCH = \"\"\"\n",
         | 
| @@ -1891,9 +2061,17 @@ | |
| 1891 | 
             
              },
         | 
| 1892 | 
             
              {
         | 
| 1893 | 
             
               "cell_type": "code",
         | 
| 1894 | 
            -
               "execution_count":  | 
| 1895 | 
             
               "id": "40300268",
         | 
| 1896 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1897 | 
             
               "outputs": [],
         | 
| 1898 | 
             
               "source": [
         | 
| 1899 | 
             
                "# https://cookbook.openai.com/examples/gpt4-1_prompting_guide\n",
         | 
| @@ -1902,12 +2080,15 @@ | |
| 1902 | 
             
                "\n",
         | 
| 1903 | 
             
                "\n",
         | 
| 1904 | 
             
                "# Instructions\n",
         | 
| 1905 | 
            -
                "- Carefully read and understand the task.  | 
| 1906 | 
             
                "- Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance \"discograpy\".\n",
         | 
| 1907 | 
             
                "- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n",
         | 
| 1908 | 
             
                "- You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.\n",
         | 
| 1909 | 
             
                "- Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.\n",
         | 
| 1910 | 
            -
                "- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.\n",
         | 
|  | |
|  | |
|  | |
| 1911 | 
             
                "- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.\n",
         | 
| 1912 | 
             
                "- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.\n",
         | 
| 1913 | 
             
                "- Always verify your answers.\n",
         | 
| @@ -1950,9 +2131,17 @@ | |
| 1950 | 
             
              },
         | 
| 1951 | 
             
              {
         | 
| 1952 | 
             
               "cell_type": "code",
         | 
| 1953 | 
            -
               "execution_count":  | 
| 1954 | 
             
               "id": "db96eef6",
         | 
| 1955 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1956 | 
             
               "outputs": [],
         | 
| 1957 | 
             
               "source": [
         | 
| 1958 | 
             
                "agent = create_react_agent(\n",
         | 
| @@ -1962,6 +2151,7 @@ | |
| 1962 | 
             
                "        visit_web_page,\n",
         | 
| 1963 | 
             
                "        read_excel_or_csv,\n",
         | 
| 1964 | 
             
                "        python_executor,\n",
         | 
|  | |
| 1965 | 
             
                "        call_stt_tool,\n",
         | 
| 1966 | 
             
                "        image_tool,\n",
         | 
| 1967 | 
             
                "        youtube_video_tool,\n",
         | 
| @@ -1974,9 +2164,17 @@ | |
| 1974 | 
             
              },
         | 
| 1975 | 
             
              {
         | 
| 1976 | 
             
               "cell_type": "code",
         | 
| 1977 | 
            -
               "execution_count":  | 
| 1978 | 
             
               "id": "33240d19",
         | 
| 1979 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1980 | 
             
               "outputs": [],
         | 
| 1981 | 
             
               "source": [
         | 
| 1982 | 
             
                "# recursion_limit = 10\n",
         | 
| @@ -1985,9 +2183,17 @@ | |
| 1985 | 
             
              },
         | 
| 1986 | 
             
              {
         | 
| 1987 | 
             
               "cell_type": "code",
         | 
| 1988 | 
            -
               "execution_count":  | 
| 1989 | 
             
               "id": "850bb54d",
         | 
| 1990 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 1991 | 
             
               "outputs": [
         | 
| 1992 | 
             
                {
         | 
| 1993 | 
             
                 "data": {
         | 
| @@ -1998,7 +2204,7 @@ | |
| 1998 | 
             
                   " 'file_name': ''}"
         | 
| 1999 | 
             
                  ]
         | 
| 2000 | 
             
                 },
         | 
| 2001 | 
            -
                 "execution_count":  | 
| 2002 | 
             
                 "metadata": {},
         | 
| 2003 | 
             
                 "output_type": "execute_result"
         | 
| 2004 | 
             
                }
         | 
| @@ -2009,59 +2215,322 @@ | |
| 2009 | 
             
              },
         | 
| 2010 | 
             
              {
         | 
| 2011 | 
             
               "cell_type": "code",
         | 
| 2012 | 
            -
               "execution_count":  | 
| 2013 | 
             
               "id": "f9025fae",
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2014 | 
             
               "metadata": {},
         | 
| 2015 | 
             
               "outputs": [],
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2016 | 
             
               "source": [
         | 
| 2017 | 
            -
                " | 
| 2018 | 
            -
                " | 
| 2019 | 
            -
                " | 
| 2020 | 
            -
                " | 
| 2021 | 
            -
                "#             \"messages\": f\"\"\"Complete the following task: {all_questions[0][\"question\"]}. Relevant file: {\n",
         | 
| 2022 | 
            -
                "#                 all_questions[0][\"file_name\"]\n",
         | 
| 2023 | 
            -
                "#                 if all_questions[0][\"file_name\"]\n",
         | 
| 2024 | 
            -
                "#                 else \"There's no relevant file to use.\"\n",
         | 
| 2025 | 
            -
                "#             }\"\"\"\n",
         | 
| 2026 | 
            -
                "#         }\n",
         | 
| 2027 | 
            -
                "#     )\n",
         | 
| 2028 | 
            -
                "# except GraphRecursionError:\n",
         | 
| 2029 | 
            -
                "#     print(\"❌ Agent stopped due to max iterations.\")"
         | 
| 2030 | 
             
               ]
         | 
| 2031 | 
             
              },
         | 
| 2032 | 
             
              {
         | 
| 2033 | 
             
               "cell_type": "code",
         | 
| 2034 | 
            -
               "execution_count":  | 
| 2035 | 
            -
               "id": " | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2036 | 
             
               "metadata": {},
         | 
| 2037 | 
             
               "outputs": [],
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2038 | 
             
               "source": [
         | 
| 2039 | 
            -
                " | 
| 2040 | 
            -
             | 
| 2041 | 
            -
             | 
| 2042 | 
            -
             | 
| 2043 | 
            -
             | 
| 2044 | 
            -
             | 
| 2045 | 
            -
             | 
| 2046 | 
            -
             | 
| 2047 | 
            -
                " | 
| 2048 | 
            -
             | 
| 2049 | 
            -
             | 
| 2050 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2051 | 
             
                "            input={\n",
         | 
| 2052 | 
            -
                "                \"messages\": f\"\"\"Complete the following task: { | 
| 2053 | 
            -
                "                     | 
|  | |
|  | |
| 2054 | 
             
                "                }\"\"\"\n",
         | 
| 2055 | 
             
                "            }\n",
         | 
| 2056 | 
            -
                "        ) | 
| 2057 | 
            -
             | 
| 2058 | 
            -
             | 
| 2059 | 
            -
             | 
| 2060 | 
            -
             | 
| 2061 | 
            -
             | 
| 2062 | 
            -
             | 
| 2063 | 
            -
             | 
| 2064 | 
            -
                " | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2065 | 
             
                "\n"
         | 
| 2066 | 
             
               ]
         | 
| 2067 | 
             
              },
         | 
| @@ -2072,39 +2541,66 @@ | |
| 2072 | 
             
               "metadata": {},
         | 
| 2073 | 
             
               "outputs": [],
         | 
| 2074 | 
             
               "source": [
         | 
| 2075 | 
            -
                "responses = asyncio.run(run_all_questions(agent, all_questions))"
         | 
| 2076 | 
             
               ]
         | 
| 2077 | 
             
              },
         | 
| 2078 | 
             
              {
         | 
| 2079 | 
             
               "cell_type": "code",
         | 
| 2080 | 
            -
               "execution_count":  | 
| 2081 | 
            -
               "id": " | 
| 2082 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2083 | 
             
               "outputs": [
         | 
| 2084 | 
             
                {
         | 
| 2085 | 
             
                 "data": {
         | 
| 2086 | 
             
                  "text/plain": [
         | 
| 2087 | 
            -
                   " | 
| 2088 | 
             
                  ]
         | 
| 2089 | 
             
                 },
         | 
| 2090 | 
            -
                 "execution_count":  | 
| 2091 | 
             
                 "metadata": {},
         | 
| 2092 | 
             
                 "output_type": "execute_result"
         | 
| 2093 | 
             
                }
         | 
| 2094 | 
             
               ],
         | 
| 2095 | 
             
               "source": [
         | 
| 2096 | 
            -
                "responses | 
| 2097 | 
             
               ]
         | 
| 2098 | 
             
              },
         | 
| 2099 | 
             
              {
         | 
| 2100 | 
             
               "cell_type": "code",
         | 
| 2101 | 
            -
               "execution_count":  | 
| 2102 | 
            -
               "id": " | 
| 2103 | 
            -
               "metadata": { | 
| 2104 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2105 | 
             
               "source": [
         | 
| 2106 | 
            -
                " | 
| 2107 | 
            -
                "    code = f.readlines()"
         | 
| 2108 | 
             
               ]
         | 
| 2109 | 
             
              },
         | 
| 2110 | 
             
              {
         | 
| @@ -2117,26 +2613,35 @@ | |
| 2117 | 
             
              },
         | 
| 2118 | 
             
              {
         | 
| 2119 | 
             
               "cell_type": "code",
         | 
| 2120 | 
            -
               "execution_count":  | 
| 2121 | 
             
               "id": "94449302",
         | 
| 2122 | 
            -
               "metadata": { | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2123 | 
             
               "outputs": [
         | 
| 2124 | 
             
                {
         | 
| 2125 | 
             
                 "data": {
         | 
| 2126 | 
             
                  "text/plain": [
         | 
| 2127 | 
            -
                   "{'task_id': ' | 
| 2128 | 
            -
                   " 'question': ' | 
| 2129 | 
             
                   " 'Level': '1',\n",
         | 
| 2130 | 
             
                   " 'file_name': ''}"
         | 
| 2131 | 
             
                  ]
         | 
| 2132 | 
             
                 },
         | 
| 2133 | 
            -
                 "execution_count":  | 
| 2134 | 
             
                 "metadata": {},
         | 
| 2135 | 
             
                 "output_type": "execute_result"
         | 
| 2136 | 
             
                }
         | 
| 2137 | 
             
               ],
         | 
| 2138 | 
             
               "source": [
         | 
| 2139 | 
            -
                "all_questions[ | 
| 2140 | 
             
               ]
         | 
| 2141 | 
             
              },
         | 
| 2142 | 
             
              {
         | 
| @@ -2178,7 +2683,7 @@ | |
| 2178 | 
             
             ],
         | 
| 2179 | 
             
             "metadata": {
         | 
| 2180 | 
             
              "kernelspec": {
         | 
| 2181 | 
            -
               "display_name": " | 
| 2182 | 
             
               "language": "python",
         | 
| 2183 | 
             
               "name": "python3"
         | 
| 2184 | 
             
              },
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
             "cells": [
         | 
| 3 | 
            +
              {
         | 
| 4 | 
            +
               "cell_type": "markdown",
         | 
| 5 | 
            +
               "id": "e6c1f8cc-eee9-4e7f-b82e-69d681533cb3",
         | 
| 6 | 
            +
               "metadata": {
         | 
| 7 | 
            +
                "jp-MarkdownHeadingCollapsed": true
         | 
| 8 | 
            +
               },
         | 
| 9 | 
            +
               "source": [
         | 
| 10 | 
            +
                "# Smolagents"
         | 
| 11 | 
            +
               ]
         | 
| 12 | 
            +
              },
         | 
| 13 | 
             
              {
         | 
| 14 | 
             
               "cell_type": "code",
         | 
| 15 | 
             
               "execution_count": 73,
         | 
|  | |
| 67 | 
             
              },
         | 
| 68 | 
             
              {
         | 
| 69 | 
             
               "cell_type": "code",
         | 
| 70 | 
            +
               "execution_count": 28,
         | 
| 71 | 
             
               "id": "7a960624-355b-4934-a741-fb880ad3ca37",
         | 
| 72 | 
             
               "metadata": {
         | 
| 73 | 
             
                "execution": {
         | 
| 74 | 
            +
                 "iopub.execute_input": "2025-06-14T12:04:06.703540Z",
         | 
| 75 | 
            +
                 "iopub.status.busy": "2025-06-14T12:04:06.702825Z",
         | 
| 76 | 
            +
                 "iopub.status.idle": "2025-06-14T12:04:06.714241Z",
         | 
| 77 | 
            +
                 "shell.execute_reply": "2025-06-14T12:04:06.711667Z",
         | 
| 78 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:04:06.703473Z"
         | 
| 79 | 
             
                }
         | 
| 80 | 
             
               },
         | 
| 81 | 
             
               "outputs": [],
         | 
|  | |
| 994 | 
             
              {
         | 
| 995 | 
             
               "cell_type": "markdown",
         | 
| 996 | 
             
               "id": "c3b5f100-893f-405a-920f-a32461ac7277",
         | 
| 997 | 
            +
               "metadata": {
         | 
| 998 | 
            +
                "jp-MarkdownHeadingCollapsed": true
         | 
| 999 | 
            +
               },
         | 
| 1000 | 
             
               "source": [
         | 
| 1001 | 
             
                "# Async Task"
         | 
| 1002 | 
             
               ]
         | 
|  | |
| 1436 | 
             
              },
         | 
| 1437 | 
             
              {
         | 
| 1438 | 
             
               "cell_type": "code",
         | 
| 1439 | 
            +
               "execution_count": null,
         | 
| 1440 | 
            +
               "id": "e4093850-db0d-4563-b116-765a3f1df44c",
         | 
| 1441 | 
            +
               "metadata": {
         | 
| 1442 | 
            +
                "execution": {
         | 
| 1443 | 
            +
                 "iopub.execute_input": "2025-06-14T12:03:55.849855Z",
         | 
| 1444 | 
            +
                 "iopub.status.busy": "2025-06-14T12:03:55.849026Z",
         | 
| 1445 | 
            +
                 "iopub.status.idle": "2025-06-14T12:03:55.859974Z",
         | 
| 1446 | 
            +
                 "shell.execute_reply": "2025-06-14T12:03:55.857431Z",
         | 
| 1447 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:03:55.849785Z"
         | 
| 1448 | 
            +
                }
         | 
| 1449 | 
            +
               },
         | 
| 1450 | 
             
               "outputs": [],
         | 
| 1451 | 
             
               "source": [
         | 
| 1452 | 
             
                "from smolagents import (\n",
         | 
| 1453 | 
             
                "    DuckDuckGoSearchTool,\n",
         | 
| 1454 | 
             
                "    VisitWebpageTool,\n",
         | 
| 1455 | 
             
                "    LocalPythonExecutor,\n",
         | 
| 1456 | 
            +
                "    PythonInterpreterTool,\n",
         | 
| 1457 | 
             
                "    WikipediaSearchTool,\n",
         | 
| 1458 | 
             
                "    Tool\n",
         | 
| 1459 | 
             
                ")\n",
         | 
|  | |
| 1473 | 
             
                "from dotenv import load_dotenv, find_dotenv\n",
         | 
| 1474 | 
             
                "import pickle\n",
         | 
| 1475 | 
             
                "import asyncio\n",
         | 
| 1476 | 
            +
                "import nest_asyncio\n",
         | 
| 1477 | 
            +
                "\n",
         | 
| 1478 | 
            +
                "import httpx"
         | 
| 1479 | 
             
               ]
         | 
| 1480 | 
             
              },
         | 
| 1481 | 
             
              {
         | 
| 1482 | 
             
               "cell_type": "code",
         | 
| 1483 | 
             
               "execution_count": 2,
         | 
| 1484 | 
             
               "id": "147b9149",
         | 
| 1485 | 
            +
               "metadata": {
         | 
| 1486 | 
            +
                "execution": {
         | 
| 1487 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:16.150863Z",
         | 
| 1488 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:16.150052Z",
         | 
| 1489 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:16.160454Z",
         | 
| 1490 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:16.159182Z",
         | 
| 1491 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:16.150794Z"
         | 
| 1492 | 
            +
                }
         | 
| 1493 | 
            +
               },
         | 
| 1494 | 
             
               "outputs": [],
         | 
| 1495 | 
             
               "source": [
         | 
| 1496 | 
             
                "_ = load_dotenv(find_dotenv(raise_error_if_not_found=True), override=True)\n",
         | 
|  | |
| 1501 | 
             
              },
         | 
| 1502 | 
             
              {
         | 
| 1503 | 
             
               "cell_type": "code",
         | 
| 1504 | 
            +
               "execution_count": 56,
         | 
| 1505 | 
             
               "id": "a36446a3",
         | 
| 1506 | 
            +
               "metadata": {
         | 
| 1507 | 
            +
                "execution": {
         | 
| 1508 | 
            +
                 "iopub.execute_input": "2025-06-14T12:13:30.483018Z",
         | 
| 1509 | 
            +
                 "iopub.status.busy": "2025-06-14T12:13:30.480788Z",
         | 
| 1510 | 
            +
                 "iopub.status.idle": "2025-06-14T12:13:30.497355Z",
         | 
| 1511 | 
            +
                 "shell.execute_reply": "2025-06-14T12:13:30.494658Z",
         | 
| 1512 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:13:30.482923Z"
         | 
| 1513 | 
            +
                }
         | 
| 1514 | 
            +
               },
         | 
| 1515 | 
             
               "outputs": [],
         | 
| 1516 | 
             
               "source": [
         | 
| 1517 | 
            +
                "lang_model = init_chat_model(model=\"gpt-4.1\", model_provider=\"openai\",temperature=0.2)\n",
         | 
| 1518 | 
            +
                "# lang_model = init_chat_model(model=\"gpt-4.1-nano\", model_provider=\"openai\",temperature=0.2)\n",
         | 
| 1519 | 
            +
                "# lang_model = init_chat_model(model=\"gpt-4.1-mini\", model_provider=\"openai\",temperature=0.2)"
         | 
| 1520 | 
             
               ]
         | 
| 1521 | 
             
              },
         | 
| 1522 | 
             
              {
         | 
| 1523 | 
             
               "cell_type": "code",
         | 
| 1524 | 
             
               "execution_count": 4,
         | 
| 1525 | 
             
               "id": "cf80baaf",
         | 
| 1526 | 
            +
               "metadata": {
         | 
| 1527 | 
            +
                "execution": {
         | 
| 1528 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:17.884011Z",
         | 
| 1529 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:17.883440Z",
         | 
| 1530 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:17.892705Z",
         | 
| 1531 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:17.891547Z",
         | 
| 1532 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:17.883958Z"
         | 
| 1533 | 
            +
                }
         | 
| 1534 | 
            +
               },
         | 
| 1535 | 
             
               "outputs": [],
         | 
| 1536 | 
             
               "source": [
         | 
| 1537 | 
             
                "def search_wikipedia(query: str) -> str:\n",
         | 
|  | |
| 1546 | 
             
               "cell_type": "code",
         | 
| 1547 | 
             
               "execution_count": 5,
         | 
| 1548 | 
             
               "id": "daf73b23",
         | 
| 1549 | 
            +
               "metadata": {
         | 
| 1550 | 
            +
                "execution": {
         | 
| 1551 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:17.894347Z",
         | 
| 1552 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:17.893804Z",
         | 
| 1553 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:17.977993Z",
         | 
| 1554 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:17.976648Z",
         | 
| 1555 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:17.894292Z"
         | 
| 1556 | 
            +
                }
         | 
| 1557 | 
            +
               },
         | 
| 1558 | 
             
               "outputs": [],
         | 
| 1559 | 
             
               "source": [
         | 
| 1560 | 
             
                "def visit_web_page(url:str) -> str:\n",
         | 
|  | |
| 1568 | 
             
               "cell_type": "code",
         | 
| 1569 | 
             
               "execution_count": 6,
         | 
| 1570 | 
             
               "id": "6f37359b",
         | 
| 1571 | 
            +
               "metadata": {
         | 
| 1572 | 
            +
                "execution": {
         | 
| 1573 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:17.985256Z",
         | 
| 1574 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:17.983982Z",
         | 
| 1575 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:18.104695Z",
         | 
| 1576 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:18.103363Z",
         | 
| 1577 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:17.985190Z"
         | 
| 1578 | 
            +
                }
         | 
| 1579 | 
            +
               },
         | 
| 1580 | 
             
               "outputs": [],
         | 
| 1581 | 
             
               "source": [
         | 
| 1582 | 
             
                "def read_excel_or_csv(filepath: str) -> str:\n",
         | 
| 1583 | 
             
                "    \"\"\"Reads an excel or csv file and returns the content as str.\"\"\"\n",
         | 
| 1584 | 
             
                "    if Path(filepath).suffix in {\".xlsx\", \".xls\"}:\n",
         | 
| 1585 | 
             
                "        df = pl.read_excel(source=filepath)\n",
         | 
| 1586 | 
            +
                "    else:\n",
         | 
| 1587 | 
            +
                "        df = pl.read_csv(source=filepath)\n",
         | 
| 1588 | 
             
                "    content_str = df.to_dict(as_series=False).__str__()\n",
         | 
| 1589 | 
             
                "    return content_str\n"
         | 
| 1590 | 
             
               ]
         | 
| 1591 | 
             
              },
         | 
| 1592 | 
             
              {
         | 
| 1593 | 
             
               "cell_type": "code",
         | 
| 1594 | 
            +
               "execution_count": 7,
         | 
| 1595 | 
             
               "id": "c52bd1a4",
         | 
| 1596 | 
            +
               "metadata": {
         | 
| 1597 | 
            +
                "execution": {
         | 
| 1598 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:18.106352Z",
         | 
| 1599 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:18.105808Z",
         | 
| 1600 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:18.232683Z",
         | 
| 1601 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:18.231616Z",
         | 
| 1602 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:18.106301Z"
         | 
| 1603 | 
            +
                }
         | 
| 1604 | 
            +
               },
         | 
| 1605 | 
             
               "outputs": [],
         | 
| 1606 | 
             
               "source": [
         | 
| 1607 | 
            +
                "def python_code_interpreter(filepath: str) -> Any:\n",
         | 
| 1608 | 
             
                "    \"\"\"Returns the output of a python code.\"\"\"\n",
         | 
| 1609 | 
             
                "    with open(filepath, \"r\") as f:\n",
         | 
| 1610 | 
             
                "        code = f.readlines()\n",
         | 
|  | |
| 1625 | 
             
                "    return code_result.generations[0][0].text"
         | 
| 1626 | 
             
               ]
         | 
| 1627 | 
             
              },
         | 
| 1628 | 
            +
              {
         | 
| 1629 | 
            +
               "cell_type": "code",
         | 
| 1630 | 
            +
               "execution_count": null,
         | 
| 1631 | 
            +
               "id": "f4ed20b1-6107-41a4-85e5-1e99dad44ee1",
         | 
| 1632 | 
            +
               "metadata": {},
         | 
| 1633 | 
            +
               "outputs": [],
         | 
| 1634 | 
            +
               "source": []
         | 
| 1635 | 
            +
              },
         | 
| 1636 | 
             
              {
         | 
| 1637 | 
             
               "cell_type": "code",
         | 
| 1638 | 
             
               "execution_count": 8,
         | 
| 1639 | 
             
               "id": "2c835c33",
         | 
| 1640 | 
            +
               "metadata": {
         | 
| 1641 | 
            +
                "execution": {
         | 
| 1642 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:18.234454Z",
         | 
| 1643 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:18.234066Z",
         | 
| 1644 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:21.769245Z",
         | 
| 1645 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:21.767715Z",
         | 
| 1646 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:18.234423Z"
         | 
| 1647 | 
            +
                }
         | 
| 1648 | 
            +
               },
         | 
| 1649 | 
             
               "outputs": [
         | 
| 1650 | 
             
                {
         | 
| 1651 | 
             
                 "name": "stdout",
         | 
|  | |
| 1672 | 
             
               "cell_type": "code",
         | 
| 1673 | 
             
               "execution_count": 9,
         | 
| 1674 | 
             
               "id": "01193586",
         | 
| 1675 | 
            +
               "metadata": {
         | 
| 1676 | 
            +
                "execution": {
         | 
| 1677 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:21.770968Z",
         | 
| 1678 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:21.770456Z",
         | 
| 1679 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:21.778994Z",
         | 
| 1680 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:21.777534Z",
         | 
| 1681 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:21.770915Z"
         | 
| 1682 | 
            +
                }
         | 
| 1683 | 
            +
               },
         | 
| 1684 | 
             
               "outputs": [],
         | 
| 1685 | 
             
               "source": [
         | 
| 1686 | 
             
                "def call_stt_tool(file_url:str) -> str:\n",
         | 
|  | |
| 1693 | 
             
               "cell_type": "code",
         | 
| 1694 | 
             
               "execution_count": 10,
         | 
| 1695 | 
             
               "id": "f10e6859",
         | 
| 1696 | 
            +
               "metadata": {
         | 
| 1697 | 
            +
                "execution": {
         | 
| 1698 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:21.784726Z",
         | 
| 1699 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:21.784237Z",
         | 
| 1700 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:21.875923Z",
         | 
| 1701 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:21.874994Z",
         | 
| 1702 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:21.784678Z"
         | 
| 1703 | 
            +
                }
         | 
| 1704 | 
            +
               },
         | 
| 1705 | 
             
               "outputs": [],
         | 
| 1706 | 
             
               "source": [
         | 
| 1707 | 
             
                "\n",
         | 
|  | |
| 1742 | 
             
               "cell_type": "code",
         | 
| 1743 | 
             
               "execution_count": 11,
         | 
| 1744 | 
             
               "id": "b1634bb8",
         | 
| 1745 | 
            +
               "metadata": {
         | 
| 1746 | 
            +
                "execution": {
         | 
| 1747 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:21.877435Z",
         | 
| 1748 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:21.877020Z",
         | 
| 1749 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:22.052327Z",
         | 
| 1750 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:22.049864Z",
         | 
| 1751 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:21.877390Z"
         | 
| 1752 | 
            +
                }
         | 
| 1753 | 
            +
               },
         | 
| 1754 | 
             
               "outputs": [],
         | 
| 1755 | 
             
               "source": [
         | 
| 1756 | 
             
                "# image_tool(file_url=\"cca530fc-4052-43b2-b130-b30968d8aa44.png\")"
         | 
|  | |
| 1760 | 
             
               "cell_type": "code",
         | 
| 1761 | 
             
               "execution_count": 12,
         | 
| 1762 | 
             
               "id": "08f1534d",
         | 
| 1763 | 
            +
               "metadata": {
         | 
| 1764 | 
            +
                "execution": {
         | 
| 1765 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:22.056308Z",
         | 
| 1766 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:22.055211Z",
         | 
| 1767 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:22.155394Z",
         | 
| 1768 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:22.153763Z",
         | 
| 1769 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:22.056197Z"
         | 
| 1770 | 
            +
                }
         | 
| 1771 | 
            +
               },
         | 
| 1772 | 
             
               "outputs": [],
         | 
| 1773 | 
             
               "source": [
         | 
| 1774 | 
            +
                "def youtube_video_tool(url:str, query:str) -> str:\n",
         | 
| 1775 | 
             
                "    \"\"\"Answers questions about youtube videos.\n",
         | 
| 1776 | 
            +
                "    URLs must be provided to this tool and the query too.\"\"\"\n",
         | 
| 1777 | 
             
                "    yt_vid_mapping = {\"https://www.youtube.com/watch?v=L1vXCYZAYYM\": \"penguin.mp4\",\n",
         | 
| 1778 | 
             
                "                  \"https://www.youtube.com/watch?v=1htKBjuUWec\": \"coffee.mp4\"}\n",
         | 
| 1779 | 
             
                "    video = cv2.VideoCapture(filename=yt_vid_mapping[url])\n",
         | 
|  | |
| 1795 | 
             
                "                    {\n",
         | 
| 1796 | 
             
                "                        \"type\": \"text\",\n",
         | 
| 1797 | 
             
                "                        \"text\": (\n",
         | 
| 1798 | 
            +
                "                            f\"\"\"Examine the video and answer the following question: {query}.\"\"\"\n",
         | 
| 1799 | 
             
                "                        ),\n",
         | 
| 1800 | 
             
                "                    },\n",
         | 
| 1801 | 
             
                "                    *[\n",
         | 
|  | |
| 1817 | 
             
               "cell_type": "code",
         | 
| 1818 | 
             
               "execution_count": 13,
         | 
| 1819 | 
             
               "id": "599530e3",
         | 
| 1820 | 
            +
               "metadata": {
         | 
| 1821 | 
            +
                "execution": {
         | 
| 1822 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:22.158534Z",
         | 
| 1823 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:22.157536Z",
         | 
| 1824 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:22.331204Z",
         | 
| 1825 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:22.329006Z",
         | 
| 1826 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:22.158439Z"
         | 
| 1827 | 
            +
                }
         | 
| 1828 | 
            +
               },
         | 
| 1829 | 
             
               "outputs": [],
         | 
| 1830 | 
             
               "source": [
         | 
| 1831 | 
             
                "def web_search_tool(query: str) -> str:\n",
         | 
|  | |
| 1835 | 
             
                "    return search_res"
         | 
| 1836 | 
             
               ]
         | 
| 1837 | 
             
              },
         | 
| 1838 | 
            +
              {
         | 
| 1839 | 
            +
               "cell_type": "code",
         | 
| 1840 | 
            +
               "execution_count": 14,
         | 
| 1841 | 
            +
               "id": "367d9ed2-18e1-4d0b-9137-a4cf93afe147",
         | 
| 1842 | 
            +
               "metadata": {
         | 
| 1843 | 
            +
                "execution": {
         | 
| 1844 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:22.335210Z",
         | 
| 1845 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:22.333603Z",
         | 
| 1846 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:22.463598Z",
         | 
| 1847 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:22.461491Z",
         | 
| 1848 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:22.335095Z"
         | 
| 1849 | 
            +
                }
         | 
| 1850 | 
            +
               },
         | 
| 1851 | 
            +
               "outputs": [],
         | 
| 1852 | 
            +
               "source": [
         | 
| 1853 | 
            +
                "def python_executor(code_str:str) -> str:\n",
         | 
| 1854 | 
            +
                "    \"\"\"This executes python code. The code must be a string.\n",
         | 
| 1855 | 
            +
                "    For any calculations always use numpy.\"\"\"\n",
         | 
| 1856 | 
            +
                "    lpe = LocalPythonExecutor(additional_authorized_imports=['polars.*', 'numpy.*'])\n",
         | 
| 1857 | 
            +
                "    code_res = lpe(code_action=code_str)[0]\n",
         | 
| 1858 | 
            +
                "    return code_res"
         | 
| 1859 | 
            +
               ]
         | 
| 1860 | 
            +
              },
         | 
| 1861 | 
             
              {
         | 
| 1862 | 
             
               "cell_type": "code",
         | 
| 1863 | 
             
               "execution_count": null,
         | 
| 1864 | 
            +
               "id": "ca4e1ddb-9658-49e5-95a7-f1311a23907c",
         | 
| 1865 | 
            +
               "metadata": {
         | 
| 1866 | 
            +
                "scrolled": true
         | 
| 1867 | 
            +
               },
         | 
| 1868 | 
             
               "outputs": [],
         | 
| 1869 | 
             
               "source": []
         | 
| 1870 | 
             
              },
         | 
| 1871 | 
             
              {
         | 
| 1872 | 
             
               "cell_type": "code",
         | 
| 1873 | 
            +
               "execution_count": 15,
         | 
| 1874 | 
             
               "id": "a24e765b",
         | 
| 1875 | 
            +
               "metadata": {
         | 
| 1876 | 
            +
                "execution": {
         | 
| 1877 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:22.466724Z",
         | 
| 1878 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:22.465826Z",
         | 
| 1879 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:22.566082Z",
         | 
| 1880 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:22.564017Z",
         | 
| 1881 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:22.466655Z"
         | 
| 1882 | 
            +
                }
         | 
| 1883 | 
            +
               },
         | 
| 1884 | 
             
               "outputs": [],
         | 
| 1885 | 
             
               "source": [
         | 
| 1886 | 
             
                "class AnswerFormat(BaseModel):\n",
         | 
|  | |
| 1905 | 
             
              },
         | 
| 1906 | 
             
              {
         | 
| 1907 | 
             
               "cell_type": "code",
         | 
| 1908 | 
            +
               "execution_count": 16,
         | 
| 1909 | 
             
               "id": "d726f6cd",
         | 
| 1910 | 
            +
               "metadata": {
         | 
| 1911 | 
            +
                "execution": {
         | 
| 1912 | 
            +
                 "iopub.execute_input": "2025-06-14T12:00:22.570115Z",
         | 
| 1913 | 
            +
                 "iopub.status.busy": "2025-06-14T12:00:22.568886Z",
         | 
| 1914 | 
            +
                 "iopub.status.idle": "2025-06-14T12:00:22.718005Z",
         | 
| 1915 | 
            +
                 "shell.execute_reply": "2025-06-14T12:00:22.715775Z",
         | 
| 1916 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:00:22.570005Z"
         | 
| 1917 | 
            +
                }
         | 
| 1918 | 
            +
               },
         | 
| 1919 | 
             
               "outputs": [],
         | 
| 1920 | 
             
               "source": [
         | 
| 1921 | 
             
                "# SYS_PROMPT_SWEBENCH = \"\"\"\n",
         | 
|  | |
| 2061 | 
             
              },
         | 
| 2062 | 
             
              {
         | 
| 2063 | 
             
               "cell_type": "code",
         | 
| 2064 | 
            +
               "execution_count": 57,
         | 
| 2065 | 
             
               "id": "40300268",
         | 
| 2066 | 
            +
               "metadata": {
         | 
| 2067 | 
            +
                "execution": {
         | 
| 2068 | 
            +
                 "iopub.execute_input": "2025-06-14T12:13:39.281024Z",
         | 
| 2069 | 
            +
                 "iopub.status.busy": "2025-06-14T12:13:39.280260Z",
         | 
| 2070 | 
            +
                 "iopub.status.idle": "2025-06-14T12:13:39.298833Z",
         | 
| 2071 | 
            +
                 "shell.execute_reply": "2025-06-14T12:13:39.296267Z",
         | 
| 2072 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:13:39.280958Z"
         | 
| 2073 | 
            +
                }
         | 
| 2074 | 
            +
               },
         | 
| 2075 | 
             
               "outputs": [],
         | 
| 2076 | 
             
               "source": [
         | 
| 2077 | 
             
                "# https://cookbook.openai.com/examples/gpt4-1_prompting_guide\n",
         | 
|  | |
| 2080 | 
             
                "\n",
         | 
| 2081 | 
             
                "\n",
         | 
| 2082 | 
             
                "# Instructions\n",
         | 
| 2083 | 
            +
                "- Carefully read and understand the task. Sometimes the task might be a sentence reversed, so un reverse it first and then complete the task.\n",
         | 
| 2084 | 
             
                "- Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance \"discograpy\".\n",
         | 
| 2085 | 
             
                "- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n",
         | 
| 2086 | 
             
                "- You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.\n",
         | 
| 2087 | 
             
                "- Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.\n",
         | 
| 2088 | 
            +
                "- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.\n",
         | 
| 2089 | 
            +
                "- If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.\n",
         | 
| 2090 | 
            +
                "- For any youtube related task use the `youtube_video_tool` and the input to the tool should be URL as a string along with the query.\n",
         | 
| 2091 | 
            +
                "- For any dataframe related tasks, always use the `read_excel_or_csv` tool.\n",
         | 
| 2092 | 
             
                "- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.\n",
         | 
| 2093 | 
             
                "- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.\n",
         | 
| 2094 | 
             
                "- Always verify your answers.\n",
         | 
|  | |
| 2131 | 
             
              },
         | 
| 2132 | 
             
              {
         | 
| 2133 | 
             
               "cell_type": "code",
         | 
| 2134 | 
            +
               "execution_count": 58,
         | 
| 2135 | 
             
               "id": "db96eef6",
         | 
| 2136 | 
            +
               "metadata": {
         | 
| 2137 | 
            +
                "execution": {
         | 
| 2138 | 
            +
                 "iopub.execute_input": "2025-06-14T12:13:39.737001Z",
         | 
| 2139 | 
            +
                 "iopub.status.busy": "2025-06-14T12:13:39.735802Z",
         | 
| 2140 | 
            +
                 "iopub.status.idle": "2025-06-14T12:13:39.858444Z",
         | 
| 2141 | 
            +
                 "shell.execute_reply": "2025-06-14T12:13:39.857410Z",
         | 
| 2142 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:13:39.736922Z"
         | 
| 2143 | 
            +
                }
         | 
| 2144 | 
            +
               },
         | 
| 2145 | 
             
               "outputs": [],
         | 
| 2146 | 
             
               "source": [
         | 
| 2147 | 
             
                "agent = create_react_agent(\n",
         | 
|  | |
| 2151 | 
             
                "        visit_web_page,\n",
         | 
| 2152 | 
             
                "        read_excel_or_csv,\n",
         | 
| 2153 | 
             
                "        python_executor,\n",
         | 
| 2154 | 
            +
                "        python_code_interpreter,\n",
         | 
| 2155 | 
             
                "        call_stt_tool,\n",
         | 
| 2156 | 
             
                "        image_tool,\n",
         | 
| 2157 | 
             
                "        youtube_video_tool,\n",
         | 
|  | |
| 2164 | 
             
              },
         | 
| 2165 | 
             
              {
         | 
| 2166 | 
             
               "cell_type": "code",
         | 
| 2167 | 
            +
               "execution_count": 42,
         | 
| 2168 | 
             
               "id": "33240d19",
         | 
| 2169 | 
            +
               "metadata": {
         | 
| 2170 | 
            +
                "execution": {
         | 
| 2171 | 
            +
                 "iopub.execute_input": "2025-06-14T12:08:59.302949Z",
         | 
| 2172 | 
            +
                 "iopub.status.busy": "2025-06-14T12:08:59.301715Z",
         | 
| 2173 | 
            +
                 "iopub.status.idle": "2025-06-14T12:08:59.308731Z",
         | 
| 2174 | 
            +
                 "shell.execute_reply": "2025-06-14T12:08:59.307387Z",
         | 
| 2175 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:08:59.302889Z"
         | 
| 2176 | 
            +
                }
         | 
| 2177 | 
            +
               },
         | 
| 2178 | 
             
               "outputs": [],
         | 
| 2179 | 
             
               "source": [
         | 
| 2180 | 
             
                "# recursion_limit = 10\n",
         | 
|  | |
| 2183 | 
             
              },
         | 
| 2184 | 
             
              {
         | 
| 2185 | 
             
               "cell_type": "code",
         | 
| 2186 | 
            +
               "execution_count": 43,
         | 
| 2187 | 
             
               "id": "850bb54d",
         | 
| 2188 | 
            +
               "metadata": {
         | 
| 2189 | 
            +
                "execution": {
         | 
| 2190 | 
            +
                 "iopub.execute_input": "2025-06-14T12:08:59.775583Z",
         | 
| 2191 | 
            +
                 "iopub.status.busy": "2025-06-14T12:08:59.774642Z",
         | 
| 2192 | 
            +
                 "iopub.status.idle": "2025-06-14T12:08:59.784401Z",
         | 
| 2193 | 
            +
                 "shell.execute_reply": "2025-06-14T12:08:59.783054Z",
         | 
| 2194 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:08:59.775524Z"
         | 
| 2195 | 
            +
                }
         | 
| 2196 | 
            +
               },
         | 
| 2197 | 
             
               "outputs": [
         | 
| 2198 | 
             
                {
         | 
| 2199 | 
             
                 "data": {
         | 
|  | |
| 2204 | 
             
                   " 'file_name': ''}"
         | 
| 2205 | 
             
                  ]
         | 
| 2206 | 
             
                 },
         | 
| 2207 | 
            +
                 "execution_count": 43,
         | 
| 2208 | 
             
                 "metadata": {},
         | 
| 2209 | 
             
                 "output_type": "execute_result"
         | 
| 2210 | 
             
                }
         | 
|  | |
| 2215 | 
             
              },
         | 
| 2216 | 
             
              {
         | 
| 2217 | 
             
               "cell_type": "code",
         | 
| 2218 | 
            +
               "execution_count": 59,
         | 
| 2219 | 
             
               "id": "f9025fae",
         | 
| 2220 | 
            +
               "metadata": {
         | 
| 2221 | 
            +
                "execution": {
         | 
| 2222 | 
            +
                 "iopub.execute_input": "2025-06-14T12:13:47.264111Z",
         | 
| 2223 | 
            +
                 "iopub.status.busy": "2025-06-14T12:13:47.263623Z",
         | 
| 2224 | 
            +
                 "iopub.status.idle": "2025-06-14T12:18:56.902572Z",
         | 
| 2225 | 
            +
                 "shell.execute_reply": "2025-06-14T12:18:56.900492Z",
         | 
| 2226 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:13:47.264070Z"
         | 
| 2227 | 
            +
                }
         | 
| 2228 | 
            +
               },
         | 
| 2229 | 
            +
               "outputs": [
         | 
| 2230 | 
            +
                {
         | 
| 2231 | 
            +
                 "name": "stdout",
         | 
| 2232 | 
            +
                 "output_type": "stream",
         | 
| 2233 | 
            +
                 "text": [
         | 
| 2234 | 
            +
                  "❌ Agent stopped due to max iterations.\n"
         | 
| 2235 | 
            +
                 ]
         | 
| 2236 | 
            +
                }
         | 
| 2237 | 
            +
               ],
         | 
| 2238 | 
            +
               "source": [
         | 
| 2239 | 
            +
                "results = []\n",
         | 
| 2240 | 
            +
                "for q in all_questions:\n",
         | 
| 2241 | 
            +
                "    try:\n",
         | 
| 2242 | 
            +
                "        answer = await agent.ainvoke(\n",
         | 
| 2243 | 
            +
                "        # answer = agent_w_recursion_limit.invoke(\n",
         | 
| 2244 | 
            +
                "            input={\n",
         | 
| 2245 | 
            +
                "                \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
         | 
| 2246 | 
            +
                "                    q[\"file_name\"]\n",
         | 
| 2247 | 
            +
                "                    if q[\"file_name\"]\n",
         | 
| 2248 | 
            +
                "                    else \"There's no relevant file to use.\"\n",
         | 
| 2249 | 
            +
                "                }\"\"\"\n",
         | 
| 2250 | 
            +
                "            }\n",
         | 
| 2251 | 
            +
                "        )\n",
         | 
| 2252 | 
            +
                "        results.append(answer)\n",
         | 
| 2253 | 
            +
                "    except GraphRecursionError:\n",
         | 
| 2254 | 
            +
                "        print(\"❌ Agent stopped due to max iterations.\")\n",
         | 
| 2255 | 
            +
                "        results.append(q[\"task_id\"])"
         | 
| 2256 | 
            +
               ]
         | 
| 2257 | 
            +
              },
         | 
| 2258 | 
            +
              {
         | 
| 2259 | 
            +
               "cell_type": "code",
         | 
| 2260 | 
            +
               "execution_count": 45,
         | 
| 2261 | 
            +
               "id": "56f0e281-0454-4310-9aaa-4599697e45d6",
         | 
| 2262 | 
            +
               "metadata": {
         | 
| 2263 | 
            +
                "execution": {
         | 
| 2264 | 
            +
                 "iopub.execute_input": "2025-06-14T12:12:58.519979Z",
         | 
| 2265 | 
            +
                 "iopub.status.busy": "2025-06-14T12:12:58.519086Z",
         | 
| 2266 | 
            +
                 "iopub.status.idle": "2025-06-14T12:12:58.530618Z",
         | 
| 2267 | 
            +
                 "shell.execute_reply": "2025-06-14T12:12:58.528121Z",
         | 
| 2268 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:12:58.519907Z"
         | 
| 2269 | 
            +
                },
         | 
| 2270 | 
            +
                "scrolled": true
         | 
| 2271 | 
            +
               },
         | 
| 2272 | 
            +
               "outputs": [],
         | 
| 2273 | 
            +
               "source": [
         | 
| 2274 | 
            +
                "# [results[i][\"structured_response\"].answer if isinstance(results[i], dict) else \"No answer\" for i in range(len(all_questions))]"
         | 
| 2275 | 
            +
               ]
         | 
| 2276 | 
            +
              },
         | 
| 2277 | 
            +
              {
         | 
| 2278 | 
            +
               "cell_type": "code",
         | 
| 2279 | 
            +
               "execution_count": 49,
         | 
| 2280 | 
            +
               "id": "102b3d07-e243-48f4-a0e7-5019a9d1576e",
         | 
| 2281 | 
            +
               "metadata": {
         | 
| 2282 | 
            +
                "execution": {
         | 
| 2283 | 
            +
                 "iopub.execute_input": "2025-06-14T11:59:56.760447Z",
         | 
| 2284 | 
            +
                 "iopub.status.busy": "2025-06-14T11:59:56.753615Z",
         | 
| 2285 | 
            +
                 "iopub.status.idle": "2025-06-14T11:59:56.776865Z",
         | 
| 2286 | 
            +
                 "shell.execute_reply": "2025-06-14T11:59:56.775237Z",
         | 
| 2287 | 
            +
                 "shell.execute_reply.started": "2025-06-14T11:59:56.760348Z"
         | 
| 2288 | 
            +
                }
         | 
| 2289 | 
            +
               },
         | 
| 2290 | 
            +
               "outputs": [],
         | 
| 2291 | 
            +
               "source": [
         | 
| 2292 | 
            +
                "# with open(\"results_gpt_nano.pkl\", \"wb\") as f:\n",
         | 
| 2293 | 
            +
                "#     pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)"
         | 
| 2294 | 
            +
               ]
         | 
| 2295 | 
            +
              },
         | 
| 2296 | 
            +
              {
         | 
| 2297 | 
            +
               "cell_type": "code",
         | 
| 2298 | 
            +
               "execution_count": null,
         | 
| 2299 | 
            +
               "id": "814f62c5-999c-4d98-900f-88828c832bc5",
         | 
| 2300 | 
             
               "metadata": {},
         | 
| 2301 | 
             
               "outputs": [],
         | 
| 2302 | 
            +
               "source": []
         | 
| 2303 | 
            +
              },
         | 
| 2304 | 
            +
              {
         | 
| 2305 | 
            +
               "cell_type": "code",
         | 
| 2306 | 
            +
               "execution_count": 60,
         | 
| 2307 | 
            +
               "id": "22f6c4df-b0b1-451d-9bb5-84a9a4088dd9",
         | 
| 2308 | 
            +
               "metadata": {
         | 
| 2309 | 
            +
                "execution": {
         | 
| 2310 | 
            +
                 "iopub.execute_input": "2025-06-14T12:19:50.162820Z",
         | 
| 2311 | 
            +
                 "iopub.status.busy": "2025-06-14T12:19:50.162047Z",
         | 
| 2312 | 
            +
                 "iopub.status.idle": "2025-06-14T12:19:50.174784Z",
         | 
| 2313 | 
            +
                 "shell.execute_reply": "2025-06-14T12:19:50.172061Z",
         | 
| 2314 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:19:50.162757Z"
         | 
| 2315 | 
            +
                },
         | 
| 2316 | 
            +
                "scrolled": true
         | 
| 2317 | 
            +
               },
         | 
| 2318 | 
            +
               "outputs": [],
         | 
| 2319 | 
             
               "source": [
         | 
| 2320 | 
            +
                "answers = [{\"task_id\":j['task_id'], \n",
         | 
| 2321 | 
            +
                "            \"submitted_answer\": results[i][\"structured_response\"].answer\n",
         | 
| 2322 | 
            +
                "            if isinstance(results[i], dict) else \"No answer\"} \n",
         | 
| 2323 | 
            +
                " for i,j in enumerate(all_questions)]"
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 2324 | 
             
               ]
         | 
| 2325 | 
             
              },
         | 
| 2326 | 
             
              {
         | 
| 2327 | 
             
               "cell_type": "code",
         | 
| 2328 | 
            +
               "execution_count": 61,
         | 
| 2329 | 
            +
               "id": "6b3d183d-f8b6-410d-8581-1da96a1e6bb5",
         | 
| 2330 | 
            +
               "metadata": {
         | 
| 2331 | 
            +
                "execution": {
         | 
| 2332 | 
            +
                 "iopub.execute_input": "2025-06-14T12:19:51.962326Z",
         | 
| 2333 | 
            +
                 "iopub.status.busy": "2025-06-14T12:19:51.961499Z",
         | 
| 2334 | 
            +
                 "iopub.status.idle": "2025-06-14T12:19:51.974035Z",
         | 
| 2335 | 
            +
                 "shell.execute_reply": "2025-06-14T12:19:51.970718Z",
         | 
| 2336 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:19:51.962257Z"
         | 
| 2337 | 
            +
                }
         | 
| 2338 | 
            +
               },
         | 
| 2339 | 
            +
               "outputs": [],
         | 
| 2340 | 
            +
               "source": [
         | 
| 2341 | 
            +
                "data = {\n",
         | 
| 2342 | 
            +
                "  \"username\": \"string\",\n",
         | 
| 2343 | 
            +
                "  \"agent_code\": \"stringstri\",\n",
         | 
| 2344 | 
            +
                "  \"answers\": answers\n",
         | 
| 2345 | 
            +
                "}"
         | 
| 2346 | 
            +
               ]
         | 
| 2347 | 
            +
              },
         | 
| 2348 | 
            +
              {
         | 
| 2349 | 
            +
               "cell_type": "code",
         | 
| 2350 | 
            +
               "execution_count": 62,
         | 
| 2351 | 
            +
               "id": "93527645-f3c9-4c1b-b35f-006d1d067397",
         | 
| 2352 | 
            +
               "metadata": {
         | 
| 2353 | 
            +
                "execution": {
         | 
| 2354 | 
            +
                 "iopub.execute_input": "2025-06-14T12:19:52.917482Z",
         | 
| 2355 | 
            +
                 "iopub.status.busy": "2025-06-14T12:19:52.916225Z",
         | 
| 2356 | 
            +
                 "iopub.status.idle": "2025-06-14T12:19:54.900881Z",
         | 
| 2357 | 
            +
                 "shell.execute_reply": "2025-06-14T12:19:54.897934Z",
         | 
| 2358 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:19:52.917365Z"
         | 
| 2359 | 
            +
                }
         | 
| 2360 | 
            +
               },
         | 
| 2361 | 
            +
               "outputs": [],
         | 
| 2362 | 
            +
               "source": [
         | 
| 2363 | 
            +
                "answer_check = httpx.post(json=data, url=submit_url)"
         | 
| 2364 | 
            +
               ]
         | 
| 2365 | 
            +
              },
         | 
| 2366 | 
            +
              {
         | 
| 2367 | 
            +
               "cell_type": "code",
         | 
| 2368 | 
            +
               "execution_count": 63,
         | 
| 2369 | 
            +
               "id": "b3f7e6eb-2f8c-4b25-a83b-d3b1e628d09c",
         | 
| 2370 | 
            +
               "metadata": {
         | 
| 2371 | 
            +
                "execution": {
         | 
| 2372 | 
            +
                 "iopub.execute_input": "2025-06-14T12:19:54.907705Z",
         | 
| 2373 | 
            +
                 "iopub.status.busy": "2025-06-14T12:19:54.906633Z",
         | 
| 2374 | 
            +
                 "iopub.status.idle": "2025-06-14T12:19:54.924639Z",
         | 
| 2375 | 
            +
                 "shell.execute_reply": "2025-06-14T12:19:54.922534Z",
         | 
| 2376 | 
            +
                 "shell.execute_reply.started": "2025-06-14T12:19:54.907598Z"
         | 
| 2377 | 
            +
                },
         | 
| 2378 | 
            +
                "scrolled": true
         | 
| 2379 | 
            +
               },
         | 
| 2380 | 
            +
               "outputs": [
         | 
| 2381 | 
            +
                {
         | 
| 2382 | 
            +
                 "data": {
         | 
| 2383 | 
            +
                  "text/plain": [
         | 
| 2384 | 
            +
                   "{'username': 'string',\n",
         | 
| 2385 | 
            +
                   " 'score': 30.0,\n",
         | 
| 2386 | 
            +
                   " 'correct_count': 6,\n",
         | 
| 2387 | 
            +
                   " 'total_attempted': 20,\n",
         | 
| 2388 | 
            +
                   " 'message': 'Score calculated successfully: 6/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.',\n",
         | 
| 2389 | 
            +
                   " 'timestamp': '2025-06-14T12:19:54.753829+00:00'}"
         | 
| 2390 | 
            +
                  ]
         | 
| 2391 | 
            +
                 },
         | 
| 2392 | 
            +
                 "execution_count": 63,
         | 
| 2393 | 
            +
                 "metadata": {},
         | 
| 2394 | 
            +
                 "output_type": "execute_result"
         | 
| 2395 | 
            +
                }
         | 
| 2396 | 
            +
               ],
         | 
| 2397 | 
            +
               "source": [
         | 
| 2398 | 
            +
                "answer_check.json()"
         | 
| 2399 | 
            +
               ]
         | 
| 2400 | 
            +
              },
         | 
| 2401 | 
            +
              {
         | 
| 2402 | 
            +
               "cell_type": "code",
         | 
| 2403 | 
            +
               "execution_count": null,
         | 
| 2404 | 
            +
               "id": "ff87396c-e52f-4b6f-bc37-565e3ecf26f7",
         | 
| 2405 | 
             
               "metadata": {},
         | 
| 2406 | 
             
               "outputs": [],
         | 
| 2407 | 
            +
               "source": []
         | 
| 2408 | 
            +
              },
         | 
| 2409 | 
            +
              {
         | 
| 2410 | 
            +
               "cell_type": "code",
         | 
| 2411 | 
            +
               "execution_count": 108,
         | 
| 2412 | 
            +
               "id": "c98460f5-8671-4b98-b932-60f599dc7ec5",
         | 
| 2413 | 
            +
               "metadata": {
         | 
| 2414 | 
            +
                "execution": {
         | 
| 2415 | 
            +
                 "iopub.execute_input": "2025-06-14T10:27:48.568548Z",
         | 
| 2416 | 
            +
                 "iopub.status.busy": "2025-06-14T10:27:48.567818Z",
         | 
| 2417 | 
            +
                 "iopub.status.idle": "2025-06-14T10:27:48.581051Z",
         | 
| 2418 | 
            +
                 "shell.execute_reply": "2025-06-14T10:27:48.579486Z",
         | 
| 2419 | 
            +
                 "shell.execute_reply.started": "2025-06-14T10:27:48.568492Z"
         | 
| 2420 | 
            +
                }
         | 
| 2421 | 
            +
               },
         | 
| 2422 | 
            +
               "outputs": [
         | 
| 2423 | 
            +
                {
         | 
| 2424 | 
            +
                 "data": {
         | 
| 2425 | 
            +
                  "text/plain": [
         | 
| 2426 | 
            +
                   "{'task_id': '7bd855d8-463d-4ed5-93ca-5fe35145f733',\n",
         | 
| 2427 | 
            +
                   " 'question': 'The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.',\n",
         | 
| 2428 | 
            +
                   " 'Level': '1',\n",
         | 
| 2429 | 
            +
                   " 'file_name': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}"
         | 
| 2430 | 
            +
                  ]
         | 
| 2431 | 
            +
                 },
         | 
| 2432 | 
            +
                 "execution_count": 108,
         | 
| 2433 | 
            +
                 "metadata": {},
         | 
| 2434 | 
            +
                 "output_type": "execute_result"
         | 
| 2435 | 
            +
                }
         | 
| 2436 | 
            +
               ],
         | 
| 2437 | 
             
               "source": [
         | 
| 2438 | 
            +
                "all_questions[18]"
         | 
| 2439 | 
            +
               ]
         | 
| 2440 | 
            +
              },
         | 
| 2441 | 
            +
              {
         | 
| 2442 | 
            +
               "cell_type": "code",
         | 
| 2443 | 
            +
               "execution_count": 25,
         | 
| 2444 | 
            +
               "id": "550688eb-8a32-4b89-85a8-607d74a1b240",
         | 
| 2445 | 
            +
               "metadata": {
         | 
| 2446 | 
            +
                "execution": {
         | 
| 2447 | 
            +
                 "iopub.execute_input": "2025-06-14T11:12:34.441436Z",
         | 
| 2448 | 
            +
                 "iopub.status.busy": "2025-06-14T11:12:34.440861Z",
         | 
| 2449 | 
            +
                 "iopub.status.idle": "2025-06-14T11:13:00.795896Z",
         | 
| 2450 | 
            +
                 "shell.execute_reply": "2025-06-14T11:13:00.792624Z",
         | 
| 2451 | 
            +
                 "shell.execute_reply.started": "2025-06-14T11:12:34.441383Z"
         | 
| 2452 | 
            +
                },
         | 
| 2453 | 
            +
                "scrolled": true
         | 
| 2454 | 
            +
               },
         | 
| 2455 | 
            +
               "outputs": [
         | 
| 2456 | 
            +
                {
         | 
| 2457 | 
            +
                 "data": {
         | 
| 2458 | 
            +
                  "text/plain": [
         | 
| 2459 | 
            +
                   "{'messages': [HumanMessage(content='Complete the following task: The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.. Relevant file: 7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx', additional_kwargs={}, response_metadata={}, id='9c810520-a1cd-40b2-914e-8fd2f3e240c5'),\n",
         | 
| 2460 | 
            +
                   "  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_zp8tBlFZ1vOSnYqFQvbsgbi6', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1104, 'total_tokens': 1142, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1024}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6JmHXTRb96TEdmDFjOpQVDXjGH', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--83bd6bab-e401-4383-9511-e4fe72002375-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_zp8tBlFZ1vOSnYqFQvbsgbi6', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1104, 'output_tokens': 38, 'total_tokens': 1142, 'input_token_details': {'audio': 0, 'cache_read': 1024}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
         | 
| 2461 | 
            +
                   "  ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='8b1777a8-387e-417a-a9eb-9af103d701e1', tool_call_id='call_zp8tBlFZ1vOSnYqFQvbsgbi6'),\n",
         | 
| 2462 | 
            +
                   "  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_T5jtJJqyz0zpCvMNwcRZtuqV', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1439, 'total_tokens': 1477, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6KCZFLq93YDUtQHAG8vGABGAoa', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--ad40832c-c24b-43b5-af46-4eb0a3945439-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_T5jtJJqyz0zpCvMNwcRZtuqV', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1439, 'output_tokens': 38, 'total_tokens': 1477, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
         | 
| 2463 | 
            +
                   "  ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='130e5ee1-fd7e-4120-8084-bcebf8d15a9c', tool_call_id='call_T5jtJJqyz0zpCvMNwcRZtuqV'),\n",
         | 
| 2464 | 
            +
                   "  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_IwlcalqVatpwkLnAbK29EVr3', 'function': {'arguments': '{\"filepath\":\"7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx\"}', 'name': 'read_excel_or_csv'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 1774, 'total_tokens': 1812, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1408}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6Lz0gaVXIFsxcfcisuIKWlLNLI', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--affe2bdb-f526-4bed-aae4-35cfc208b61a-0', tool_calls=[{'name': 'read_excel_or_csv', 'args': {'filepath': '7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx'}, 'id': 'call_IwlcalqVatpwkLnAbK29EVr3', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1774, 'output_tokens': 38, 'total_tokens': 1812, 'input_token_details': {'audio': 0, 'cache_read': 1408}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
         | 
| 2465 | 
            +
                   "  ToolMessage(content=\"{'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'], 'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007], 'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009], 'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021], 'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989], 'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005], 'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\", name='read_excel_or_csv', id='ca8399cc-6d48-43df-b781-c8f064154532', tool_call_id='call_IwlcalqVatpwkLnAbK29EVr3'),\n",
         | 
| 2466 | 
            +
                   "  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_GIEGGs5RtOIT92ajgPAJARpt', 'function': {'arguments': '{\"code_str\":\"import pandas as pd\\\\n\\\\n# Data extracted from the Excel file\\\\ndata = {\\'Location\\': [\\'Pinebrook\\', \\'Wharvton\\', \\'Sagrada\\', \\'Algrimand\\', \\'Marztep\\', \\'San Cecelia\\', \\'Pimento\\', \\'Tinseles\\', \\'Rosdale\\'],\\\\n        \\'Burgers\\': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007],\\\\n        \\'Hot Dogs\\': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009],\\\\n        \\'Salads\\': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021],\\\\n        \\'Fries\\': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989],\\\\n        \\'Ice Cream\\': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005],\\\\n        \\'Soda\\': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\\\\n\\\\ndf = pd.DataFrame(data)\\\\n\\\\n# Prices for each item (assuming these are the prices in USD)\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00, \\'Soda\\': 1.50}\\\\n\\\\n# Calculate total sales for food items (excluding drinks which is Soda)\\\\nfood_items = [\\'Burgers\\', \\'Hot Dogs\\', \\'Salads\\', \\'Fries\\', \\'Ice Cream\\']\\\\n\\\\n# Calculate total sales\\\\ntotal_sales = 0\\\\nfor item in food_items:\\\\n    total_sales += (df[item].sum() * prices[item])\\\\n\\\\nround(total_sales, 2)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 506, 'prompt_tokens': 2109, 'total_tokens': 2615, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1792}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6MtqSxoV60bBEmbp8dzJKWyM3Q', 
'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--33c7ee3a-f032-4f24-864b-b773673199db-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': \"import pandas as pd\\n\\n# Data extracted from the Excel file\\ndata = {'Location': ['Pinebrook', 'Wharvton', 'Sagrada', 'Algrimand', 'Marztep', 'San Cecelia', 'Pimento', 'Tinseles', 'Rosdale'],\\n        'Burgers': [1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007],\\n        'Hot Dogs': [1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009],\\n        'Salads': [2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021],\\n        'Fries': [2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989],\\n        'Ice Cream': [1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005],\\n        'Soda': [1980, 2018, 2019, 2009, 2022, 2016, 2967, 2006, 2011]}\\n\\ndf = pd.DataFrame(data)\\n\\n# Prices for each item (assuming these are the prices in USD)\\nprices = {'Burgers': 5.50, 'Hot Dogs': 3.75, 'Salads': 4.25, 'Fries': 2.50, 'Ice Cream': 3.00, 'Soda': 1.50}\\n\\n# Calculate total sales for food items (excluding drinks which is Soda)\\nfood_items = ['Burgers', 'Hot Dogs', 'Salads', 'Fries', 'Ice Cream']\\n\\n# Calculate total sales\\ntotal_sales = 0\\nfor item in food_items:\\n    total_sales += (df[item].sum() * prices[item])\\n\\nround(total_sales, 2)\"}, 'id': 'call_GIEGGs5RtOIT92ajgPAJARpt', 'type': 'tool_call'}], usage_metadata={'input_tokens': 2109, 'output_tokens': 506, 'total_tokens': 2615, 'input_token_details': {'audio': 0, 'cache_read': 1792}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
         | 
| 2467 | 
            +
                   "  ToolMessage(content='Error: InterpreterError(\"Code execution failed at line \\'import pandas as pd\\' due to: InterpreterError: Import of pandas is not allowed. Authorized imports are: [\\'re\\', \\'statistics\\', \\'queue\\', \\'datetime\\', \\'numpy.*\\', \\'itertools\\', \\'math\\', \\'unicodedata\\', \\'stat\\', \\'collections\\', \\'random\\', \\'polars.*\\', \\'time\\']\")\\n Please fix your mistakes.', name='python_executor', id='56c8bfc0-165e-4ca3-b2c8-f19a71728d3f', tool_call_id='call_GIEGGs5RtOIT92ajgPAJARpt', status='error'),\n",
         | 
| 2468 | 
            +
                   "  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_ukghn9nyQv0MpZ2lPAnpdfjK', 'function': {'arguments': '{\"code_str\":\"import numpy as np\\\\n\\\\n# Data from the Excel file\\\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\\\n\\\\n# Prices for each food item\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\\\n\\\\n# Calculate total sales for each food item\\\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\\\n\\\\n# Calculate total sales from food\\\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\\\n\\\\nround(total_sales_food, 2)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 451, 'prompt_tokens': 2707, 'total_tokens': 3158, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 2048}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6TIuM5izp01LjdF5U2znulHHxe', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--880f8528-bebd-4f78-8c4c-1dc05df990af-0', tool_calls=[{'name': 'python_executor', 'args': 
{'code_str': \"import numpy as np\\n\\n# Data from the Excel file\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\n\\n# Prices for each food item\\nprices = {'Burgers': 5.50, 'Hot Dogs': 3.75, 'Salads': 4.25, 'Fries': 2.50, 'Ice Cream': 3.00}\\n\\n# Calculate total sales for each food item\\ntotal_burgers = np.sum(burgers) * prices['Burgers']\\ntotal_hot_dogs = np.sum(hot_dogs) * prices['Hot Dogs']\\ntotal_salads = np.sum(salads) * prices['Salads']\\ntotal_fries = np.sum(fries) * prices['Fries']\\ntotal_ice_cream = np.sum(ice_cream) * prices['Ice Cream']\\n\\n# Calculate total sales from food\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\n\\nround(total_sales_food, 2)\"}, 'id': 'call_ukghn9nyQv0MpZ2lPAnpdfjK', 'type': 'tool_call'}], usage_metadata={'input_tokens': 2707, 'output_tokens': 451, 'total_tokens': 3158, 'input_token_details': {'audio': 0, 'cache_read': 2048}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
         | 
| 2469 | 
            +
                   "  ToolMessage(content='Error: InterpreterError(\"Code execution failed at line \\'round(total_sales_food, 2)\\' due to: InterpreterError: Forbidden function evaluation: \\'round\\' is not among the explicitly allowed tools or defined/imported in the preceding code\")\\n Please fix your mistakes.', name='python_executor', id='bae4bb38-046c-4c36-8f26-1daf4c187b61', tool_call_id='call_ukghn9nyQv0MpZ2lPAnpdfjK', status='error'),\n",
         | 
| 2470 | 
            +
                   "  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_XpqHUTDZzi8Rkuw2ubo53CZJ', 'function': {'arguments': '{\"code_str\":\"import numpy as np\\\\n\\\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\\\nhot_dogs = np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\\\n\\\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\\\n\\\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\\\n\\\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\\\n\\\\n\\\\\"{:.2f}\\\\\".format(total_sales_food)\"}', 'name': 'python_executor'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 427, 'prompt_tokens': 3221, 'total_tokens': 3648, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 1024}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6ayTTFSltIIg72OsMWD12m59Y4', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--16bcd06a-ffb1-4adf-8528-0a2b47b046e1-0', tool_calls=[{'name': 'python_executor', 'args': {'code_str': 'import numpy as np\\n\\nburgers = np.array([1594, 1983, 2019, 1958, 2015, 2011, 2017, 1967, 2007])\\nhot_dogs = 
np.array([1999, 2008, 2022, 1971, 2016, 2010, 1999, 1969, 2009])\\nsalads = np.array([2002, 2014, 2022, 1982, 2018, 2012, 2001, 1982, 2021])\\nfries = np.array([2005, 2015, 2023, 1989, 2019, 2013, 2003, 1994, 1989])\\nice_cream = np.array([1977, 2017, 2021, 1998, 2021, 2015, 1969, 2005, 2005])\\n\\nprices = {\\'Burgers\\': 5.50, \\'Hot Dogs\\': 3.75, \\'Salads\\': 4.25, \\'Fries\\': 2.50, \\'Ice Cream\\': 3.00}\\n\\ntotal_burgers = np.sum(burgers) * prices[\\'Burgers\\']\\ntotal_hot_dogs = np.sum(hot_dogs) * prices[\\'Hot Dogs\\']\\ntotal_salads = np.sum(salads) * prices[\\'Salads\\']\\ntotal_fries = np.sum(fries) * prices[\\'Fries\\']\\ntotal_ice_cream = np.sum(ice_cream) * prices[\\'Ice Cream\\']\\n\\ntotal_sales_food = total_burgers + total_hot_dogs + total_salads + total_fries + total_ice_cream\\n\\n\"{:.2f}\".format(total_sales_food)'}, 'id': 'call_XpqHUTDZzi8Rkuw2ubo53CZJ', 'type': 'tool_call'}], usage_metadata={'input_tokens': 3221, 'output_tokens': 427, 'total_tokens': 3648, 'input_token_details': {'audio': 0, 'cache_read': 1024}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),\n",
         | 
| 2471 | 
            +
                   "  ToolMessage(content='340090.25', name='python_executor', id='70c25b8c-9bca-44c5-9f8e-a786d3641a68', tool_call_id='call_XpqHUTDZzi8Rkuw2ubo53CZJ'),\n",
         | 
| 2472 | 
            +
                   "  AIMessage(content='340090.25', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 3660, 'total_tokens': 3665, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 2944}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_6f2eabb9a5', 'id': 'chatcmpl-BiJ6g8VihYdHuX0NzmGFcCVxAbDlh', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--663ce019-aac1-4293-b368-8d8c2a8ed302-0', usage_metadata={'input_tokens': 3660, 'output_tokens': 5, 'total_tokens': 3665, 'input_token_details': {'audio': 0, 'cache_read': 2944}, 'output_token_details': {'audio': 0, 'reasoning': 0}})],\n",
         | 
| 2473 | 
            +
                   " 'structured_response': AnswerFormat(thoughts='I calculated the total sales for each food item by summing the quantities sold across all locations and multiplying by their respective prices. Then, I summed these totals to get the total sales from food items only, excluding drinks. The final total sales amount is formatted to two decimal places as requested.', answer='340090.25')}"
         | 
| 2474 | 
            +
                  ]
         | 
| 2475 | 
            +
                 },
         | 
| 2476 | 
            +
                 "execution_count": 25,
         | 
| 2477 | 
            +
                 "metadata": {},
         | 
| 2478 | 
            +
                 "output_type": "execute_result"
         | 
| 2479 | 
            +
                }
         | 
| 2480 | 
            +
               ],
         | 
| 2481 | 
            +
               "source": [
         | 
| 2482 | 
            +
                "agent.invoke(\n",
         | 
| 2483 | 
             
                "            input={\n",
         | 
| 2484 | 
            +
                "                \"messages\": f\"\"\"Complete the following task: {all_questions[18][\"question\"]}. Relevant file: {\n",
         | 
| 2485 | 
            +
                "                    all_questions[18][\"file_name\"]\n",
         | 
| 2486 | 
            +
                "                    if all_questions[18][\"file_name\"]\n",
         | 
| 2487 | 
            +
                "                    else \"There's no relevant file to use.\"\n",
         | 
| 2488 | 
             
                "                }\"\"\"\n",
         | 
| 2489 | 
             
                "            }\n",
         | 
| 2490 | 
            +
                "        )"
         | 
| 2491 | 
            +
               ]
         | 
| 2492 | 
            +
              },
         | 
| 2493 | 
            +
              {
         | 
| 2494 | 
            +
               "cell_type": "code",
         | 
| 2495 | 
            +
               "execution_count": 40,
         | 
| 2496 | 
            +
               "id": "a525780a",
         | 
| 2497 | 
            +
               "metadata": {
         | 
| 2498 | 
            +
                "execution": {
         | 
| 2499 | 
            +
                 "iopub.execute_input": "2025-06-14T09:41:31.611239Z",
         | 
| 2500 | 
            +
                 "iopub.status.busy": "2025-06-14T09:41:31.608132Z",
         | 
| 2501 | 
            +
                 "iopub.status.idle": "2025-06-14T09:41:31.627482Z",
         | 
| 2502 | 
            +
                 "shell.execute_reply": "2025-06-14T09:41:31.625052Z",
         | 
| 2503 | 
            +
                 "shell.execute_reply.started": "2025-06-14T09:41:31.611107Z"
         | 
| 2504 | 
            +
                }
         | 
| 2505 | 
            +
               },
         | 
| 2506 | 
            +
               "outputs": [],
         | 
| 2507 | 
            +
               "source": [
         | 
| 2508 | 
            +
                "# async def run_all_questions(agent, all_questions):\n",
         | 
| 2509 | 
            +
                "#     \"\"\"Run agent.ainvoke concurrently for all questions.\n",
         | 
| 2510 | 
            +
                "\n",
         | 
| 2511 | 
            +
                "#     Args:\n",
         | 
| 2512 | 
            +
                "#         agent: The LangChain agent with an ainvoke method.\n",
         | 
| 2513 | 
            +
                "#         all_questions (list): List of question dicts.\n",
         | 
| 2514 | 
            +
                "\n",
         | 
| 2515 | 
            +
                "#     Returns:\n",
         | 
| 2516 | 
            +
                "#         list: List of agent responses.\n",
         | 
| 2517 | 
            +
                "#     \"\"\"\n",
         | 
| 2518 | 
            +
                "#     tasks = [\n",
         | 
| 2519 | 
            +
                "#         agent.ainvoke(\n",
         | 
| 2520 | 
            +
                "#             input={\n",
         | 
| 2521 | 
            +
                "#                 \"messages\": f\"\"\"Complete the following task: {q[\"question\"]}. Relevant file: {\n",
         | 
| 2522 | 
            +
                "#                     q[\"file_name\"] if q[\"file_name\"] else \"There's no relevant file to use.\"\n",
         | 
| 2523 | 
            +
                "#                 }\"\"\"\n",
         | 
| 2524 | 
            +
                "#             }\n",
         | 
| 2525 | 
            +
                "#         )\n",
         | 
| 2526 | 
            +
                "#         for q in all_questions\n",
         | 
| 2527 | 
            +
                "#     ]\n",
         | 
| 2528 | 
            +
                "#     try:\n",
         | 
| 2529 | 
            +
                "#         return await asyncio.gather(*tasks)\n",
         | 
| 2530 | 
            +
                "#     except GraphRecursionError as err:\n",
         | 
| 2531 | 
            +
                "#         print(\"❌ Agent stopped due to max iterations.\")\n",
         | 
| 2532 | 
            +
                "#         print(f\"Error: {err}\")\n",
         | 
| 2533 | 
            +
                "#         # return \"Recursion Limit Reached.\"\n",
         | 
| 2534 | 
             
                "\n"
         | 
| 2535 | 
             
               ]
         | 
| 2536 | 
             
              },
         | 
|  | |
| 2541 | 
             
               "metadata": {},
         | 
| 2542 | 
             
               "outputs": [],
         | 
| 2543 | 
             
               "source": [
         | 
| 2544 | 
            +
                "# responses = asyncio.run(run_all_questions(agent, all_questions))"
         | 
| 2545 | 
             
               ]
         | 
| 2546 | 
             
              },
         | 
| 2547 | 
             
              {
         | 
| 2548 | 
             
               "cell_type": "code",
         | 
| 2549 | 
            +
               "execution_count": 43,
         | 
| 2550 | 
            +
               "id": "12e0f74d-e565-47bb-bf6d-0ba11570b451",
         | 
| 2551 | 
            +
               "metadata": {
         | 
| 2552 | 
            +
                "execution": {
         | 
| 2553 | 
            +
                 "iopub.execute_input": "2025-06-14T09:42:47.636798Z",
         | 
| 2554 | 
            +
                 "iopub.status.busy": "2025-06-14T09:42:47.635229Z",
         | 
| 2555 | 
            +
                 "iopub.status.idle": "2025-06-14T09:42:47.649310Z",
         | 
| 2556 | 
            +
                 "shell.execute_reply": "2025-06-14T09:42:47.647870Z",
         | 
| 2557 | 
            +
                 "shell.execute_reply.started": "2025-06-14T09:42:47.636728Z"
         | 
| 2558 | 
            +
                }
         | 
| 2559 | 
            +
               },
         | 
| 2560 | 
             
               "outputs": [
         | 
| 2561 | 
             
                {
         | 
| 2562 | 
             
                 "data": {
         | 
| 2563 | 
             
                  "text/plain": [
         | 
| 2564 | 
            +
                   "NoneType"
         | 
| 2565 | 
             
                  ]
         | 
| 2566 | 
             
                 },
         | 
| 2567 | 
            +
                 "execution_count": 43,
         | 
| 2568 | 
             
                 "metadata": {},
         | 
| 2569 | 
             
                 "output_type": "execute_result"
         | 
| 2570 | 
             
                }
         | 
| 2571 | 
             
               ],
         | 
| 2572 | 
             
               "source": [
         | 
| 2573 | 
            +
                "type(responses)"
         | 
| 2574 | 
             
               ]
         | 
| 2575 | 
             
              },
         | 
| 2576 | 
             
              {
         | 
| 2577 | 
             
               "cell_type": "code",
         | 
| 2578 | 
            +
               "execution_count": 35,
         | 
| 2579 | 
            +
               "id": "d28d2d35",
         | 
| 2580 | 
            +
               "metadata": {
         | 
| 2581 | 
            +
                "execution": {
         | 
| 2582 | 
            +
                 "iopub.execute_input": "2025-06-14T09:38:24.184443Z",
         | 
| 2583 | 
            +
                 "iopub.status.busy": "2025-06-14T09:38:24.183596Z",
         | 
| 2584 | 
            +
                 "iopub.status.idle": "2025-06-14T09:38:24.554423Z",
         | 
| 2585 | 
            +
                 "shell.execute_reply": "2025-06-14T09:38:24.552528Z",
         | 
| 2586 | 
            +
                 "shell.execute_reply.started": "2025-06-14T09:38:24.184340Z"
         | 
| 2587 | 
            +
                }
         | 
| 2588 | 
            +
               },
         | 
| 2589 | 
            +
               "outputs": [
         | 
| 2590 | 
            +
                {
         | 
| 2591 | 
            +
                 "ename": "TypeError",
         | 
| 2592 | 
            +
                 "evalue": "string indices must be integers, not 'str'",
         | 
| 2593 | 
            +
                 "output_type": "error",
         | 
| 2594 | 
            +
                 "traceback": [
         | 
| 2595 | 
            +
                  "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
         | 
| 2596 | 
            +
                  "\u001b[31mTypeError\u001b[39m                                 Traceback (most recent call last)",
         | 
| 2597 | 
            +
                  "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[35]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mresponses\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mstructured_response\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m.answer\n",
         | 
| 2598 | 
            +
                  "\u001b[31mTypeError\u001b[39m: string indices must be integers, not 'str'"
         | 
| 2599 | 
            +
                 ]
         | 
| 2600 | 
            +
                }
         | 
| 2601 | 
            +
               ],
         | 
| 2602 | 
             
               "source": [
         | 
| 2603 | 
            +
                "responses[2]['structured_response'].answer"
         | 
|  | |
| 2604 | 
             
               ]
         | 
| 2605 | 
             
              },
         | 
| 2606 | 
             
              {
         | 
|  | |
| 2613 | 
             
              },
         | 
| 2614 | 
             
              {
         | 
| 2615 | 
             
               "cell_type": "code",
         | 
| 2616 | 
            +
               "execution_count": 23,
         | 
| 2617 | 
             
               "id": "94449302",
         | 
| 2618 | 
            +
               "metadata": {
         | 
| 2619 | 
            +
                "execution": {
         | 
| 2620 | 
            +
                 "iopub.execute_input": "2025-06-14T09:36:13.544492Z",
         | 
| 2621 | 
            +
                 "iopub.status.busy": "2025-06-14T09:36:13.541377Z",
         | 
| 2622 | 
            +
                 "iopub.status.idle": "2025-06-14T09:36:13.558603Z",
         | 
| 2623 | 
            +
                 "shell.execute_reply": "2025-06-14T09:36:13.557527Z",
         | 
| 2624 | 
            +
                 "shell.execute_reply.started": "2025-06-14T09:36:13.544415Z"
         | 
| 2625 | 
            +
                },
         | 
| 2626 | 
            +
                "scrolled": true
         | 
| 2627 | 
            +
               },
         | 
| 2628 | 
             
               "outputs": [
         | 
| 2629 | 
             
                {
         | 
| 2630 | 
             
                 "data": {
         | 
| 2631 | 
             
                  "text/plain": [
         | 
| 2632 | 
            +
                   "{'task_id': '2d83110e-a098-4ebb-9987-066c06fa42d0',\n",
         | 
| 2633 | 
            +
                   " 'question': '.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',\n",
         | 
| 2634 | 
             
                   " 'Level': '1',\n",
         | 
| 2635 | 
             
                   " 'file_name': ''}"
         | 
| 2636 | 
             
                  ]
         | 
| 2637 | 
             
                 },
         | 
| 2638 | 
            +
                 "execution_count": 23,
         | 
| 2639 | 
             
                 "metadata": {},
         | 
| 2640 | 
             
                 "output_type": "execute_result"
         | 
| 2641 | 
             
                }
         | 
| 2642 | 
             
               ],
         | 
| 2643 | 
             
               "source": [
         | 
| 2644 | 
            +
                "all_questions[2]"
         | 
| 2645 | 
             
               ]
         | 
| 2646 | 
             
              },
         | 
| 2647 | 
             
              {
         | 
|  | |
| 2683 | 
             
             ],
         | 
| 2684 | 
             
             "metadata": {
         | 
| 2685 | 
             
              "kernelspec": {
         | 
| 2686 | 
            +
               "display_name": "Python 3 (ipykernel)",
         | 
| 2687 | 
             
               "language": "python",
         | 
| 2688 | 
             
               "name": "python3"
         | 
| 2689 | 
             
              },
         | 
    	
        pyproject.toml
    CHANGED
    
    | @@ -6,6 +6,8 @@ dependencies = [ | |
| 6 | 
             
                "accelerate>=1.7.0",
         | 
| 7 | 
             
                "av>=14.4.0",
         | 
| 8 | 
             
                "fastexcel>=0.14.0",
         | 
|  | |
|  | |
| 9 | 
             
                "huggingface-hub[hf-xet]>=0.32.6",
         | 
| 10 | 
             
                "langchain-community>=0.3.25",
         | 
| 11 | 
             
                "langchain-yt-dlp>=0.0.8",
         | 
|  | |
| 6 | 
             
                "accelerate>=1.7.0",
         | 
| 7 | 
             
                "av>=14.4.0",
         | 
| 8 | 
             
                "fastexcel>=0.14.0",
         | 
| 9 | 
            +
                "gradio>=5.34.0",
         | 
| 10 | 
            +
                "httpx>=0.28.1",
         | 
| 11 | 
             
                "huggingface-hub[hf-xet]>=0.32.6",
         | 
| 12 | 
             
                "langchain-community>=0.3.25",
         | 
| 13 | 
             
                "langchain-yt-dlp>=0.0.8",
         | 
    	
        results_gpt_mini.pkl
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4d24e0fcef15657f57ea2d5b1bc8de286cf1a0a0b1919fa19494e8c1359e66f9
         | 
| 3 | 
            +
            size 486587
         | 
    	
        universal_agent.py
    CHANGED
    
    | @@ -27,7 +27,7 @@ nest_asyncio.apply() | |
| 27 | 
             
            with open("all_questions.pkl", "rb") as f:
         | 
| 28 | 
             
                all_questions = pickle.load(f)
         | 
| 29 | 
             
            lang_model = init_chat_model(
         | 
| 30 | 
            -
                model="gpt-4.1- | 
| 31 | 
             
            )
         | 
| 32 |  | 
| 33 |  | 
| @@ -56,9 +56,9 @@ def read_excel_or_csv(filepath: str) -> str: | |
| 56 | 
             
                return content_str
         | 
| 57 |  | 
| 58 |  | 
| 59 | 
            -
            def  | 
| 60 | 
             
                """Returns the output of a python code."""
         | 
| 61 | 
            -
                with open(filepath) as f:
         | 
| 62 | 
             
                    code = f.readlines()
         | 
| 63 | 
             
                code_result = lang_model.generate(
         | 
| 64 | 
             
                    messages=[
         | 
| @@ -76,6 +76,12 @@ def python_executor(filepath: str) -> Any: | |
| 76 | 
             
                )
         | 
| 77 | 
             
                return code_result.generations[0][0].text
         | 
| 78 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 79 |  | 
| 80 | 
             
            stt_tool = Tool.from_space(
         | 
| 81 | 
             
                space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
         | 
| @@ -203,12 +209,15 @@ You are an expert agent - please keep going until the user’s query is complete | |
| 203 |  | 
| 204 |  | 
| 205 | 
             
            # Instructions
         | 
| 206 | 
            -
            - Carefully read and understand the task. 
         | 
| 207 | 
             
            - Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance "discograpy".
         | 
| 208 | 
             
            - If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
         | 
| 209 | 
             
            - You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.
         | 
| 210 | 
             
            - Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
         | 
| 211 | 
            -
            - You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
         | 
|  | |
|  | |
|  | |
| 212 | 
             
            - If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
         | 
| 213 | 
             
            - You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
         | 
| 214 | 
             
            - Always verify your answers.
         | 
| @@ -247,6 +256,8 @@ What's 2 +2 ? | |
| 247 | 
             
            4
         | 
| 248 |  | 
| 249 | 
             
            """
         | 
|  | |
|  | |
| 250 | 
             
            agent = create_react_agent(
         | 
| 251 | 
             
                model=lang_model,
         | 
| 252 | 
             
                tools=[
         | 
| @@ -254,6 +265,7 @@ agent = create_react_agent( | |
| 254 | 
             
                    visit_web_page,
         | 
| 255 | 
             
                    read_excel_or_csv,
         | 
| 256 | 
             
                    python_executor,
         | 
|  | |
| 257 | 
             
                    call_stt_tool,
         | 
| 258 | 
             
                    image_tool,
         | 
| 259 | 
             
                    youtube_video_tool,
         | 
| @@ -262,56 +274,35 @@ agent = create_react_agent( | |
| 262 | 
             
                prompt=SYSTEM_PROMPT,
         | 
| 263 | 
             
                response_format=AnswerFormat,
         | 
| 264 | 
             
            )
         | 
|  | |
|  | |
| 265 | 
             
            # recursion_limit = 10
         | 
| 266 | 
             
            # agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
         | 
| 267 | 
            -
            all_questions[0]
         | 
| 268 | 
            -
             | 
| 269 | 
            -
             | 
| 270 | 
            -
             | 
| 271 | 
            -
             | 
| 272 | 
            -
             | 
| 273 | 
            -
            # | 
| 274 | 
            -
            #             "messages": f"""Complete the following task: {all_questions[0]["question"]}. Relevant file: {
         | 
| 275 | 
            -
            #                 all_questions[0]["file_name"]
         | 
| 276 | 
            -
            #                 if all_questions[0]["file_name"]
         | 
| 277 | 
            -
            #                 else "There's no relevant file to use."
         | 
| 278 | 
            -
            #             }"""
         | 
| 279 | 
            -
            #         }
         | 
| 280 | 
            -
            #     )
         | 
| 281 | 
            -
            # except GraphRecursionError:
         | 
| 282 | 
            -
            #     print("❌ Agent stopped due to max iterations.")
         | 
| 283 | 
            -
            async def run_all_questions(agent, all_questions):
         | 
| 284 | 
            -
                """Run agent.ainvoke concurrently for all questions.
         | 
| 285 | 
            -
             | 
| 286 | 
            -
                Args:
         | 
| 287 | 
            -
                    agent: The LangChain agent with an ainvoke method.
         | 
| 288 | 
            -
                    all_questions (list): List of question dicts.
         | 
| 289 | 
            -
             | 
| 290 | 
            -
                Returns:
         | 
| 291 | 
            -
                    list: List of agent responses.
         | 
| 292 | 
            -
                """
         | 
| 293 | 
            -
                tasks = [
         | 
| 294 | 
            -
                    agent.ainvoke(
         | 
| 295 | 
             
                        input={
         | 
| 296 | 
            -
                            "messages": f"""Complete the following task: {
         | 
| 297 | 
            -
                                q["question"]
         | 
| 298 | 
            -
                            }. Relevant file: {
         | 
| 299 | 
             
                                q["file_name"]
         | 
| 300 | 
             
                                if q["file_name"]
         | 
| 301 | 
             
                                else "There's no relevant file to use."
         | 
| 302 | 
             
                            }"""
         | 
| 303 | 
             
                        }
         | 
| 304 | 
             
                    )
         | 
| 305 | 
            -
                     | 
| 306 | 
            -
                 | 
| 307 | 
            -
                try:
         | 
| 308 | 
            -
                    return await asyncio.gather(*tasks)
         | 
| 309 | 
            -
                except GraphRecursionError as err:
         | 
| 310 | 
             
                    print("❌ Agent stopped due to max iterations.")
         | 
| 311 | 
            -
                     | 
| 312 | 
            -
                    return "Recursion Limit Reached."
         | 
| 313 |  | 
|  | |
|  | |
| 314 |  | 
| 315 | 
            -
             | 
| 316 | 
            -
             | 
|  | |
|  | |
| 317 |  | 
|  | |
| 27 | 
             
            with open("all_questions.pkl", "rb") as f:
         | 
| 28 | 
             
                all_questions = pickle.load(f)
         | 
| 29 | 
             
            lang_model = init_chat_model(
         | 
| 30 | 
            +
                model="gpt-4.1-mini", model_provider="openai", temperature=0.2
         | 
| 31 | 
             
            )
         | 
| 32 |  | 
| 33 |  | 
|  | |
| 56 | 
             
                return content_str
         | 
| 57 |  | 
| 58 |  | 
| 59 | 
            +
            def python_code_interpreter(filepath: str) -> Any:
         | 
| 60 | 
             
                """Returns the output of a python code."""
         | 
| 61 | 
            +
                with open(filepath, "r") as f:
         | 
| 62 | 
             
                    code = f.readlines()
         | 
| 63 | 
             
                code_result = lang_model.generate(
         | 
| 64 | 
             
                    messages=[
         | 
|  | |
| 76 | 
             
                )
         | 
| 77 | 
             
                return code_result.generations[0][0].text
         | 
| 78 |  | 
| 79 | 
            +
            def python_executor(code_str:str) -> str:
         | 
| 80 | 
            +
                """This executes python code. The code must be a string.
         | 
| 81 | 
            +
                For any calculations always use numpy."""
         | 
| 82 | 
            +
                lpe = LocalPythonExecutor(additional_authorized_imports=['polars.*', 'numpy.*'])
         | 
| 83 | 
            +
                code_res = lpe(code_action=code_str)[0]
         | 
| 84 | 
            +
                return code_res
         | 
| 85 |  | 
| 86 | 
             
            stt_tool = Tool.from_space(
         | 
| 87 | 
             
                space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
         | 
|  | |
| 209 |  | 
| 210 |  | 
| 211 | 
             
            # Instructions
         | 
| 212 | 
            +
            - Carefully read and understand the task. Sometimes the task might be a sentence reversed, so un reverse it first and then complete the task.
         | 
| 213 | 
             
            - Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance "discograpy".
         | 
| 214 | 
             
            - If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
         | 
| 215 | 
             
            - You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.
         | 
| 216 | 
             
            - Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
         | 
| 217 | 
            +
            - You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
         | 
| 218 | 
            +
            - If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.
         | 
| 219 | 
            +
            - For any youtube related task use the `youtube_video_tool` and the input to the tool should be URL as a string along with the query.
         | 
| 220 | 
            +
            - For any dataframe related tasks, always use the `read_excel_or_csv` tool.
         | 
| 221 | 
             
            - If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
         | 
| 222 | 
             
            - You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
         | 
| 223 | 
             
            - Always verify your answers.
         | 
|  | |
| 256 | 
             
            4
         | 
| 257 |  | 
| 258 | 
             
            """
         | 
| 259 | 
            +
             | 
| 260 | 
            +
             | 
| 261 | 
             
            agent = create_react_agent(
         | 
| 262 | 
             
                model=lang_model,
         | 
| 263 | 
             
                tools=[
         | 
|  | |
| 265 | 
             
                    visit_web_page,
         | 
| 266 | 
             
                    read_excel_or_csv,
         | 
| 267 | 
             
                    python_executor,
         | 
| 268 | 
            +
                    python_code_interpreter,
         | 
| 269 | 
             
                    call_stt_tool,
         | 
| 270 | 
             
                    image_tool,
         | 
| 271 | 
             
                    youtube_video_tool,
         | 
|  | |
| 274 | 
             
                prompt=SYSTEM_PROMPT,
         | 
| 275 | 
             
                response_format=AnswerFormat,
         | 
| 276 | 
             
            )
         | 
| 277 | 
            +
             | 
| 278 | 
            +
             | 
| 279 | 
             
            # recursion_limit = 10
         | 
| 280 | 
             
            # agent_w_recursion_limit = agent.with_config(recursion_limit=recursion_limit)
         | 
| 281 | 
            +
            # all_questions[0]
         | 
| 282 | 
            +
             | 
| 283 | 
            +
            results = []
         | 
| 284 | 
            +
            for q in all_questions:
         | 
| 285 | 
            +
                try:
         | 
| 286 | 
            +
                    answer = await agent.ainvoke(
         | 
| 287 | 
            +
                    # answer = agent_w_recursion_limit.invoke(
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 288 | 
             
                        input={
         | 
| 289 | 
            +
                            "messages": f"""Complete the following task: {q["question"]}. Relevant file: {
         | 
|  | |
|  | |
| 290 | 
             
                                q["file_name"]
         | 
| 291 | 
             
                                if q["file_name"]
         | 
| 292 | 
             
                                else "There's no relevant file to use."
         | 
| 293 | 
             
                            }"""
         | 
| 294 | 
             
                        }
         | 
| 295 | 
             
                    )
         | 
| 296 | 
            +
                    results.append(answer)
         | 
| 297 | 
            +
                except GraphRecursionError:
         | 
|  | |
|  | |
|  | |
| 298 | 
             
                    print("❌ Agent stopped due to max iterations.")
         | 
| 299 | 
            +
                    results.append(q["task_id"])
         | 
|  | |
| 300 |  | 
| 301 | 
            +
            # with open("results_gpt_mini.pkl", "wb") as f:
         | 
| 302 | 
            +
            #     pickle.dump(obj=results, file=f, protocol=pickle.HIGHEST_PROTOCOL)
         | 
| 303 |  | 
| 304 | 
            +
            answers = [{"task_id":j['task_id'],
         | 
| 305 | 
            +
                        "submitted_answer": results[i]["structured_response"].answer
         | 
| 306 | 
            +
                        if isinstance(results[i], dict) else "No answer"} 
         | 
| 307 | 
            +
             for i,j in enumerate(all_questions)]
         | 
| 308 |  | 
    	
        uv.lock
    CHANGED
    
    | @@ -3212,6 +3212,8 @@ dependencies = [ | |
| 3212 | 
             
                { name = "accelerate" },
         | 
| 3213 | 
             
                { name = "av" },
         | 
| 3214 | 
             
                { name = "fastexcel" },
         | 
|  | |
|  | |
| 3215 | 
             
                { name = "huggingface-hub", extra = ["hf-xet"] },
         | 
| 3216 | 
             
                { name = "langchain", extra = ["openai"] },
         | 
| 3217 | 
             
                { name = "langchain-community" },
         | 
| @@ -3242,6 +3244,8 @@ requires-dist = [ | |
| 3242 | 
             
                { name = "accelerate", specifier = ">=1.7.0" },
         | 
| 3243 | 
             
                { name = "av", specifier = ">=14.4.0" },
         | 
| 3244 | 
             
                { name = "fastexcel", specifier = ">=0.14.0" },
         | 
|  | |
|  | |
| 3245 | 
             
                { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.32.6" },
         | 
| 3246 | 
             
                { name = "langchain", extras = ["openai"], specifier = ">=0.3.25" },
         | 
| 3247 | 
             
                { name = "langchain-community", specifier = ">=0.3.25" },
         | 
|  | |
| 3212 | 
             
                { name = "accelerate" },
         | 
| 3213 | 
             
                { name = "av" },
         | 
| 3214 | 
             
                { name = "fastexcel" },
         | 
| 3215 | 
            +
                { name = "gradio" },
         | 
| 3216 | 
            +
                { name = "httpx" },
         | 
| 3217 | 
             
                { name = "huggingface-hub", extra = ["hf-xet"] },
         | 
| 3218 | 
             
                { name = "langchain", extra = ["openai"] },
         | 
| 3219 | 
             
                { name = "langchain-community" },
         | 
|  | |
| 3244 | 
             
                { name = "accelerate", specifier = ">=1.7.0" },
         | 
| 3245 | 
             
                { name = "av", specifier = ">=14.4.0" },
         | 
| 3246 | 
             
                { name = "fastexcel", specifier = ">=0.14.0" },
         | 
| 3247 | 
            +
                { name = "gradio", specifier = ">=5.34.0" },
         | 
| 3248 | 
            +
                { name = "httpx", specifier = ">=0.28.1" },
         | 
| 3249 | 
             
                { name = "huggingface-hub", extras = ["hf-xet"], specifier = ">=0.32.6" },
         | 
| 3250 | 
             
                { name = "langchain", extras = ["openai"], specifier = ">=0.3.25" },
         | 
| 3251 | 
             
                { name = "langchain-community", specifier = ">=0.3.25" },
         | 
