Spaces:

John-Jiang
/

starfish_data_ai

Running

File size: 25,618 Bytes

5301c48

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from starfish import data_gen_template"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['starfish/generate_func_call_dataset', 'starfish/generate_by_topic']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_gen_template.list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "loaded = data_gen_template.get(\"starfish/generate_by_topic\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the template's `input_data` schema and an example input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mPlease run the template with this input schema\u001b[0m\n",
      "\u001b[32m2025-05-23 11:23:57\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m{\n",
      "    \"description\": \"Input schema for the generate_by_topic template.\\n\\nIMPORTANT: This Pydantic model is the single source of truth for default values.\\nThe validation and default values are controlled by this model, not the function signature.\",\n",
      "    \"properties\": {\n",
      "        \"user_instruction\": {\n",
      "            \"anyOf\": [\n",
      "                {\n",
      "                    \"type\": \"string\"\n",
      "                },\n",
      "                {\n",
      "                    \"type\": \"null\"\n",
      "                }\n",
      "            ],\n",
      "            \"default\": null,\n",
      "            \"title\": \"User Instruction\"\n",
      "        },\n",
      "        \"num_records\": {\n",
      "            \"anyOf\": [\n",
      "                {\n",
      "                    \"type\": \"integer\"\n",
      "                },\n",
      "                {\n",
      "                    \"type\": \"null\"\n",
      "                }\n",
      "            ],\n",
      "            \"default\": 10,\n",
      "            \"title\": \"Num Records\"\n",
      "        },\n",
      "        \"records_per_topic\": {\n",
      "            \"default\": 10,\n",
      "            \"title\": \"Records Per Topic\",\n",
      "            \"type\": \"integer\"\n",
      "        },\n",
      "        \"topics\": {\n",
      "            \"anyOf\": [\n",
      "                {\n",
      "                    \"items\": {\n",
      "                        \"anyOf\": [\n",
      "                            {\n",
      "                                \"type\": \"string\"\n",
      "                            },\n",
      "                            {\n",
      "                                \"additionalProperties\": {\n",
      "                                    \"type\": \"integer\"\n",
      "                                },\n",
      "                                \"type\": \"object\"\n",
      "                            }\n",
      "                        ]\n",
      "                    },\n",
      "                    \"type\": \"array\"\n",
      "                },\n",
      "                {\n",
      "                    \"type\": \"null\"\n",
      "                }\n",
      "            ],\n",
      "            \"default\": null,\n",
      "            \"title\": \"Topics\"\n",
      "        },\n",
      "        \"topic_model_name\": {\n",
      "            \"default\": \"openai/gpt-4o-mini\",\n",
      "            \"title\": \"Topic Model Name\",\n",
      "            \"type\": \"string\"\n",
      "        },\n",
      "        \"topic_model_kwargs\": {\n",
      "            \"anyOf\": [\n",
      "                {\n",
      "                    \"additionalProperties\": true,\n",
      "                    \"type\": \"object\"\n",
      "                },\n",
      "                {\n",
      "                    \"type\": \"null\"\n",
      "                }\n",
      "            ],\n",
      "            \"default\": null,\n",
      "            \"title\": \"Topic Model Kwargs\"\n",
      "        },\n",
      "        \"generation_model_name\": {\n",
      "            \"default\": \"openai/gpt-4o-mini\",\n",
      "            \"title\": \"Generation Model Name\",\n",
      "            \"type\": \"string\"\n",
      "        },\n",
      "        \"generation_model_kwargs\": {\n",
      "            \"anyOf\": [\n",
      "                {\n",
      "                    \"additionalProperties\": true,\n",
      "                    \"type\": \"object\"\n",
      "                },\n",
      "                {\n",
      "                    \"type\": \"null\"\n",
      "                }\n",
      "            ],\n",
      "            \"default\": null,\n",
      "            \"title\": \"Generation Model Kwargs\"\n",
      "        },\n",
      "        \"output_schema\": {\n",
      "            \"anyOf\": [\n",
      "                {\n",
      "                    \"items\": {\n",
      "                        \"additionalProperties\": true,\n",
      "                        \"type\": \"object\"\n",
      "                    },\n",
      "                    \"type\": \"array\"\n",
      "                },\n",
      "                {\n",
      "                    \"additionalProperties\": true,\n",
      "                    \"type\": \"object\"\n",
      "                },\n",
      "                {\n",
      "                    \"type\": \"null\"\n",
      "                }\n",
      "            ],\n",
      "            \"default\": [\n",
      "                {\n",
      "                    \"name\": \"question\",\n",
      "                    \"type\": \"str\"\n",
      "                },\n",
      "                {\n",
      "                    \"name\": \"answer\",\n",
      "                    \"type\": \"str\"\n",
      "                }\n",
      "            ],\n",
      "            \"title\": \"Output Schema\"\n",
      "        },\n",
      "        \"data_factory_config\": {\n",
      "            \"anyOf\": [\n",
      "                {\n",
      "                    \"additionalProperties\": true,\n",
      "                    \"type\": \"object\"\n",
      "                },\n",
      "                {\n",
      "                    \"type\": \"null\"\n",
      "                }\n",
      "            ],\n",
      "            \"default\": {},\n",
      "            \"title\": \"Data Factory Config\"\n",
      "        }\n",
      "    },\n",
      "    \"title\": \"GenerateByTopicInput\",\n",
      "    \"type\": \"object\"\n",
      "}\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "loaded.print_schema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mHere is an example with api_contract.name as weather_api.get_current_weather\u001b[0m\n",
      "\u001b[32m2025-05-23 11:24:01\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m{\n",
      "        \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n",
      "        \"num_records\": 100,\n",
      "        \"records_per_topic\": 5,\n",
      "        \"topics\": [\n",
      "            \"supervised learning\",\n",
      "            \"unsupervised learning\",\n",
      "            {\"reinforcement learning\": 3},  # This means generate 3 records for this topic\n",
      "            \"neural networks\",\n",
      "        ],\n",
      "        \"topic_model_name\": \"openai/gpt-4\",\n",
      "        \"topic_model_kwargs\": {\"temperature\": 0.7},\n",
      "        \"generation_model_name\": \"openai/gpt-4\",\n",
      "        \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n",
      "        \"output_schema\": [\n",
      "            {\"name\": \"question\", \"type\": \"str\"},\n",
      "            {\"name\": \"answer\", \"type\": \"str\"},\n",
      "            {\"name\": \"difficulty\", \"type\": \"str\"},  # Added an additional field\n",
      "        ],\n",
      "        \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n",
      "    }\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "loaded.print_example()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🌟 Function Calling Dataset Generation Pipeline\n",
      "============================================================\n",
      "📋 Process Overview:\n",
      "   1. Calculate optimal data distribution\n",
      "   2. Generate diverse topics\n",
      "   3. Create subtopics for each topic\n",
      "   4. Generate query-answer pairs\n",
      "   5. Verify and validate generated data\n",
      "   6. Regenerate failed cases\n",
      "============================================================\n",
      "📊 Data Distribution Plan:\n",
      "   • Requested: 10 records\n",
      "   • Distribution: 1 topics × 1 subtopics × 10 records\n",
      "   • Total generation: 10 records\n",
      "   • API calls needed: 3\n",
      "\n",
      "🎯 Step 1: Generating diverse topics...\n",
      "   ✅ Generated 1 topics\n",
      "\n",
      "🌿 Step 2: Creating subtopics for each topic...\n",
      "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: e6763e50-6438-4df5-81a9-5a68ce3f8468\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:04\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
      "   ✅ Generated 1 subtopics total\n",
      "\n",
      "💬 Step 3: Generating query-answer pairs...\n",
      "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 1931c5c8-c1f3-4268-98b7-1a5295b8abf2\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:06\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:09\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:12\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:15\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:18\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:21\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:24\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:27\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
      "   ✅ Generated 10 initial query-answer pairs\n",
      "\n",
      "🔍 Step 4: Verifying data quality...\n",
      "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: f036c07c-1cd2-4690-be92-bac359e45544\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:28\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:31\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/10\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:34\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 9/10\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 9\u001b[0m    (\u001b[32mCompleted: 9\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 10/10\u001b[0m | \u001b[33mAttempted: 10\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
      "   ✅ Quality check complete: 9 passed, 1 failed\n",
      "\n",
      "🔄 Step 5: Regenerating failed cases...\n",
      "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 3d6183a2-e465-4807-9e18-cbb84dc0d28f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:35\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8754bec6-25e3-40bd-9743-f2763fc1091f\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:37\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:40\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m, \u001b[1;31mInDeadQueue: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/1\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0, InDeadQueue: 0)\u001b[0m\n",
      "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mCannot serialize function for resume due to unsupported type: cannot pickle '_hashlib.HMAC' object\u001b[0m\n",
      "   ✅ Regenerated 1 pairs, 1 still failing\n",
      "\u001b[32m2025-05-23 00:27:41\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[33m\u001b[1mSome data still failing after regeneration - prompts may need improvement\u001b[0m\n",
      "🎯 Perfect! Generated exactly 10 records as requested\n",
      "\n",
      "🎉 Generation Complete!\n",
      "============================================================\n",
      "📈 Final Results:\n",
      "   • Records generated: 10\n",
      "   • Success rate: 10/10 (100.0%)\n",
      "   • Distribution used: 1T × 1S × 10R\n",
      "\n",
      "⭐ If you found this helpful, please consider starring our repo!\n",
      "   Your support means the world to us! 🌟\n",
      "============================================================\n"
     ]
    }
   ],
   "source": [
    "input_data = {\n",
    "        \"user_instruction\": \"Generate Q&A pairs about machine learning concepts\",\n",
    "        \"num_records\": 100,\n",
    "        \"records_per_topic\": 5,\n",
    "        \"topics\": [\n",
    "            \"supervised learning\",\n",
    "            \"unsupervised learning\",\n",
    "            {\"reinforcement learning\": 3},  # This means generate 3 records for this topic\n",
    "            \"neural networks\",\n",
    "        ],\n",
    "        \"topic_model_name\": \"openai/gpt-4\",\n",
    "        \"topic_model_kwargs\": {\"temperature\": 0.7},\n",
    "        \"generation_model_name\": \"openai/gpt-4\",\n",
    "        \"generation_model_kwargs\": {\"temperature\": 0.8, \"max_tokens\": 200},\n",
    "        \"output_schema\": [\n",
    "            {\"name\": \"question\", \"type\": \"str\"},\n",
    "            {\"name\": \"answer\", \"type\": \"str\"},\n",
    "            {\"name\": \"difficulty\", \"type\": \"str\"},  # Added an additional field\n",
    "        ],\n",
    "        \"data_factory_config\": {\"max_concurrency\": 4, \"task_runner_timeout\": 60 * 2},\n",
    "    }\n",
    "data = await loaded.run(input_data=input_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'query': 'Can you check the current weather in Toronto and Rome? Use Fahrenheit for both locations.',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Toronto', 'units': 'Fahrenheit'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Rome', 'units': 'Fahrenheit'}}]},\n",
       " {'query': 'Get me the current weather in Mumbai and also in Johannesburg, please use Fahrenheit for both.',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Mumbai', 'units': 'Fahrenheit'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Johannesburg', 'units': 'Fahrenheit'}}]},\n",
       " {'query': 'I need the current weather for Sydney and London. What are the temperatures in Celsius?',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Sydney', 'units': 'Celsius'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'London', 'units': 'Celsius'}}]},\n",
       " {'query': 'Please find the current weather in Buenos Aires and Cape Town, using Celsius for Buenos Aires.',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Buenos Aires', 'units': 'Celsius'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Cape Town'}}]},\n",
       " {'query': 'What’s the weather like in Moscow? Also, can you get the current conditions in Beijing?',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Moscow'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Beijing'}}]},\n",
       " {'query': 'Can you tell me the current weather in Tokyo and in Los Angeles? Please provide both in Fahrenheit.',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Tokyo', 'units': 'Fahrenheit'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Los Angeles', 'units': 'Fahrenheit'}}]},\n",
       " {'query': 'Please provide the current weather for Berlin and Cairo, using Celsius for Berlin and no specific unit for Cairo.',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Berlin', 'units': 'Celsius'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Cairo'}}]},\n",
       " {'query': 'I need the current weather in Seattle and in Santiago. Use Fahrenheit for Seattle and Celsius for Santiago.',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Seattle', 'units': 'Fahrenheit'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Santiago', 'units': 'Celsius'}}]},\n",
       " {'query': \"What's the current temperature in San Francisco? Can you also check the weather in Paris?\",\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'San Francisco'}},\n",
       "   {'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'Paris'}}]},\n",
       " {'query': 'What is the current weather in New York City? And can you also provide the temperature in Celsius?',\n",
       "  'answer': [{'name': 'weather_api.get_current_weather',\n",
       "    'arguments': {'location': 'New York City', 'units': 'Celsius'}}]}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}