{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Google Colab Version: [Open this notebook in Google Colab](https://colab.research.google.com/github/starfishdata/starfish/blob/main/examples/data_factory.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Dependencies "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: starfish-core in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (0.1.0)\n",
      "Requirement already satisfied: aiofiles<25.0.0,>=24.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (24.1.0)\n",
      "Requirement already satisfied: aiosqlite<0.22.0,>=0.21.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.21.0)\n",
      "Requirement already satisfied: cachetools<6.0.0,>=5.5.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (5.5.2)\n",
      "Requirement already satisfied: litellm<2.0.0,>=1.65.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.65.1)\n",
      "Requirement already satisfied: loguru<0.8.0,>=0.7.3 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.7.3)\n",
      "Requirement already satisfied: ollama<0.5.0,>=0.4.7 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (0.4.7)\n",
      "Requirement already satisfied: platformdirs<5.0.0,>=4.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.3.7)\n",
      "Requirement already satisfied: psutil<8.0.0,>=7.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (7.0.0)\n",
      "Requirement already satisfied: python-dotenv<2.0.0,>=1.1.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (1.1.0)\n",
      "Requirement already satisfied: typing-extensions<5.0.0,>=4.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from starfish-core) (4.13.0)\n",
      "Requirement already satisfied: aiohttp in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.11.16)\n",
      "Requirement already satisfied: click in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.1.8)\n",
      "Requirement already satisfied: httpx>=0.23.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.28.1)\n",
      "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (8.6.1)\n",
      "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (3.1.6)\n",
      "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (4.23.0)\n",
      "Requirement already satisfied: openai>=1.68.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (1.70.0)\n",
      "Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (2.11.1)\n",
      "Requirement already satisfied: tiktoken>=0.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
      "Requirement already satisfied: tokenizers in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from litellm<2.0.0,>=1.65.1->starfish-core) (0.21.1)\n",
      "Requirement already satisfied: anyio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (4.9.0)\n",
      "Requirement already satisfied: certifi in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (2025.1.31)\n",
      "Requirement already satisfied: httpcore==1.* in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (1.0.7)\n",
      "Requirement already satisfied: idna in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.10)\n",
      "Requirement already satisfied: h11<0.15,>=0.13 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.23.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.14.0)\n",
      "Requirement already satisfied: zipp>=3.20 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.21.0)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm<2.0.0,>=1.65.1->starfish-core) (3.0.2)\n",
      "Requirement already satisfied: attrs>=22.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (25.3.0)\n",
      "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.10.1)\n",
      "Requirement already satisfied: referencing>=0.28.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.36.2)\n",
      "Requirement already satisfied: rpds-py>=0.7.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.24.0)\n",
      "Requirement already satisfied: distro<2,>=1.7.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.9.0)\n",
      "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (0.9.0)\n",
      "Requirement already satisfied: sniffio in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.1)\n",
      "Requirement already satisfied: tqdm>4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from openai>=1.68.2->litellm<2.0.0,>=1.65.1->starfish-core) (4.67.1)\n",
      "Requirement already satisfied: annotated-types>=0.6.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.7.0)\n",
      "Requirement already satisfied: pydantic-core==2.33.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.33.0)\n",
      "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm<2.0.0,>=1.65.1->starfish-core) (0.4.0)\n",
      "Requirement already satisfied: regex>=2022.1.18 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2024.11.6)\n",
      "Requirement already satisfied: requests>=2.26.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.32.3)\n",
      "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (2.6.1)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.3.2)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.5.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (6.3.1)\n",
      "Requirement already satisfied: propcache>=0.2.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (0.3.1)\n",
      "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from aiohttp->litellm<2.0.0,>=1.65.1->starfish-core) (1.18.3)\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (0.30.1)\n",
      "Requirement already satisfied: filelock in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (3.18.0)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (2025.3.2)\n",
      "Requirement already satisfied: packaging>=20.9 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (24.2)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm<2.0.0,>=1.65.1->starfish-core) (6.0.2)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (3.4.1)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/zhengisamazing/Library/Caches/pypoetry/virtualenvs/starfish-T7IInzTH-py3.11/lib/python3.11/site-packages (from requests>=2.26.0->tiktoken>=0.7.0->litellm<2.0.0,>=1.65.1->starfish-core) (2.3.0)\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install starfish-core"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Fix for Jupyter Notebook only β€” do NOT use in production\n",
    "## Enables async code execution in notebooks, but may cause issues with sync/async issues\n",
    "## For production, please run in standard .py files without this workaround\n",
    "## See: https://github.com/erdewit/nest_asyncio for more details\n",
    "import nest_asyncio\n",
    "nest_asyncio.apply()\n",
    "\n",
    "from starfish import StructuredLLM, data_factory\n",
    "from starfish.llm.utils import merge_structured_outputs\n",
    "\n",
    "from starfish.common.env_loader import load_env_file ## Load environment variables from .env file\n",
    "load_env_file()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# setup your openai api key if not already set\n",
    "# import os\n",
    "# os.environ[\"OPENAI_API_KEY\"] = \"your_key_here\"\n",
    "\n",
    "# If you dont have any API key, please navigate to local model section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Helper function mock llm call\n",
    "# When developing data pipelines with LLMs, making thousands of real API calls\n",
    "# can be expensive. Using mock LLM calls lets you test your pipeline's reliability,\n",
    "# failure handling, and recovery without spending money on API calls.\n",
    "from starfish.data_factory.utils.mock import mock_llm_call"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1. Your First Data Factory: Simple Scaling\n",
    "\n",
    "The @data_factory decorator transforms any async function into a scalable data processing pipeline.\n",
    "It handles:\n",
    "- Parallel execution \n",
    "- Automatic batching\n",
    "- Error handling & retries\n",
    "- Progress tracking\n",
    "\n",
    "Let's start with a single LLM call and then show how easy it is to scale it.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'fact': 'New Yorkers consume around 1,000,000 slices of pizza every day, which means if you laid them all in a line, they would stretch from the Statue of Liberty to the Eiffel Tower... and back!'}]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First, create a StructuredLLM instance for generating facts about cities\n",
    "json_llm = StructuredLLM(\n",
    "    model_name = \"openai/gpt-4o-mini\",\n",
    "    prompt = \"Funny facts about city {{city_name}}.\",\n",
    "    output_schema = [{'name': 'fact', 'type': 'str'}],\n",
    "    model_kwargs = {\"temperature\": 0.7},\n",
    ")\n",
    "\n",
    "json_llm_response = await json_llm.run(city_name='New York')\n",
    "json_llm_response.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 8c926411-63e7-4dc6-98c9-861c3489fb8b\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:32\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "Processing New York at 2025-04-25 10:16:32.524033\n",
      "Processing London at 2025-04-25 10:16:32.524286\n",
      "Processing Tokyo at 2025-04-25 10:16:32.524979\n",
      "Processing Paris at 2025-04-25 10:16:32.525535\n",
      "Processing Sydney at 2025-04-25 10:16:32.526729\n",
      "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'fact': \"In Tokyo, there's a train station called 'Shinjuku' that handles more passengers each day than the entire population of the United States!\"},\n",
       " {'fact': \"London has a 'secret' underground city known as the 'London Stone', which is said to have magical powers, making it one of the city's most famous and quirky legends!\"},\n",
       " {'fact': 'In Paris, you can legally marry a dead person! This quirky law allows for posthumous marriages, as long as you can prove that the deceased had intended to marry you before their untimely demise.'},\n",
       " {'fact': 'In New York City, there are more than 25,000 licensed taxis, but only about 1,200 of them are actually yellow. The rest are a rainbow of colors, including pink, blue, and even animal print!'},\n",
       " {'fact': 'Sydney has a beach where you can surf, swim, and even watch a film – all in one day! Just don’t forget your sunscreen and popcorn!'}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now, scale to multiple cities using data_factory\n",
    "# Just add the @data_factory decorator to process many cities in parallel\n",
    "\n",
    "from datetime import datetime\n",
    "@data_factory(max_concurrency=10)\n",
    "async def process_json_llm(city_name: str):\n",
    "    ## Adding a print statement to indicate the start of the processing\n",
    "    print(f\"Processing {city_name} at {datetime.now()}\")\n",
    "    json_llm_response = await json_llm.run(city_name=city_name)\n",
    "    return json_llm_response.data\n",
    "\n",
    "# This is all it takes to scale from one city to many cities!\n",
    "process_json_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2. Works with any aysnc function\n",
    "\n",
    "Data Factory works with any async function, not just LLM calls, you can build complex pipelines involving multiple LLMs, data processing, etc.\n",
    "\n",
    "Here is example of two chained structured llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 466fca03-85a2-46de-b135-629cd76738f7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:34\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:37\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:40\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/3\u001b[0m | \u001b[33mRunning: 3\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:43\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 2/3\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 2\u001b[0m    (\u001b[32mCompleted: 2\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:44\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 3/3\u001b[0m | \u001b[33mAttempted: 3\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Example of a more complex function that chains multiple LLM calls\n",
    "# This was grabbed from structured llm examples \n",
    "\n",
    "@data_factory(max_concurrency=10)\n",
    "async def complex_process_cities(topic: str):\n",
    "    ## topic β†’ generator_llm β†’ rating_llm β†’ merged results\n",
    "    # First LLM to generate question/answer pairs\n",
    "    generator_llm = StructuredLLM(\n",
    "        model_name=\"openai/gpt-4o-mini\",\n",
    "        prompt=\"Generate question/answer pairs about {{topic}}.\",\n",
    "        output_schema=[\n",
    "            {\"name\": \"question\", \"type\": \"str\"},\n",
    "            {\"name\": \"answer\", \"type\": \"str\"}\n",
    "        ],\n",
    "    )\n",
    "\n",
    "    # Second LLM to rate the generated pairs\n",
    "    rater_llm = StructuredLLM(\n",
    "        model_name=\"openai/gpt-4o-mini\",\n",
    "        prompt='''Rate the following Q&A pairs based on accuracy and clarity (1-10).\n",
    "        Pairs: {{generated_pairs}}''',\n",
    "        output_schema=[\n",
    "            {\"name\": \"accuracy_rating\", \"type\": \"int\"},\n",
    "            {\"name\": \"clarity_rating\", \"type\": \"int\"}\n",
    "        ],\n",
    "        model_kwargs={\"temperature\": 0.5}\n",
    ")\n",
    "\n",
    "    generation_response = await generator_llm.run(topic=topic, num_records=5)\n",
    "    rating_response = await rater_llm.run(generated_pairs=generation_response.data)\n",
    "    \n",
    "    # Merge the results\n",
    "    return merge_structured_outputs(generation_response.data, rating_response.data)\n",
    "\n",
    "\n",
    "### To save on token here we only use 3 topics as example\n",
    "complex_process_cities_data = complex_process_cities.run(topic=['Science', 'History', 'Technology'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15\n",
      "[{'question': 'What is the primary function of a CPU in a computer?', 'answer': 'The CPU, or Central Processing Unit, is responsible for executing instructions and processing data in a computer system.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What does IoT stand for and what is its significance?', 'answer': 'IoT stands for Internet of Things, which refers to the interconnection of everyday devices to the internet, allowing them to send and receive data, thereby enhancing efficiency and convenience.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the difference between RAM and ROM?', 'answer': 'RAM (Random Access Memory) is volatile memory that temporarily stores data and applications currently in use, while ROM (Read-Only Memory) is non-volatile memory that permanently stores firmware and system software.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is cloud computing?', 'answer': 'Cloud computing is the delivery of computing services over the internet, enabling users to access and store data and applications on remote servers rather than on local computers.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What are the benefits of using artificial intelligence in business?', 'answer': 'Artificial intelligence can enhance efficiency, improve decision-making, personalize customer experiences, automate repetitive tasks, and generate insights from data analytics in business operations.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the chemical formula for water?', 'answer': 'The chemical formula for water is H2O.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the process by which plants make their own food?', 'answer': 'The process by which plants make their own food is called photosynthesis.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the speed of light in a vacuum?', 'answer': 'The speed of light in a vacuum is approximately 299,792 kilometers per second (or about 186,282 miles per second).', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What is the smallest unit of life?', 'answer': 'The smallest unit of life is the cell.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What gas do living organisms need for respiration?', 'answer': 'Living organisms need oxygen for respiration.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What was the primary cause of World War I?', 'answer': 'The primary cause of World War I was the complex system of alliances, militarism, imperialism, and nationalism, which escalated tensions following the assassination of Archduke Franz Ferdinand of Austria in 1914.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Who was the first President of the United States?', 'answer': 'George Washington was the first President of the United States, serving from April 30, 1789, to March 4, 1797.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'What year did the Berlin Wall fall?', 'answer': 'The Berlin Wall fell on November 9, 1989, symbolizing the end of the Cold War and the division between East and West Germany.', 'accuracy_rating': 10, 'clarity_rating': 10}, {'question': 'Which ancient civilization is known for creating the first known writing system?', 'answer': 'The Sumerians, who inhabited ancient Mesopotamia around 3500 BCE, are known for creating the first known writing system called cuneiform.', 'accuracy_rating': 10, 'clarity_rating': 10}, 
{'question': 'What was the significance of the Magna Carta?', 'answer': 'The Magna Carta, signed in 1215, was significant because it limited the power of the monarchy and established the principle that everyone, including the king, was subject to the law.', 'accuracy_rating': 10, 'clarity_rating': 10}]\n"
     ]
    }
   ],
   "source": [
    "### Each topic has 5 question/answer pairs so 3 topics has 15 pairs!\n",
    "print(len(complex_process_cities_data))\n",
    "print(complex_process_cities_data)"
   ]
  },
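  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Nothing in `complex_process_cities` is LLM-specific - any async function can be decorated. Below is a minimal sketch (not part of the original examples) that runs data_factory over a plain async function; `square_after_delay` and its 0.1s sleep are illustrative stand-ins for real async I/O."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Hedged sketch: data_factory over a plain async function - no LLM involved.\n",
    "## Each task returns a list(dict), matching the pattern used throughout this notebook.\n",
    "import asyncio\n",
    "\n",
    "@data_factory(max_concurrency=50)\n",
    "async def square_after_delay(value: int):\n",
    "    await asyncio.sleep(0.1)  # stand-in for any async I/O (HTTP request, DB query, ...)\n",
    "    return [{\"value\": value, \"squared\": value ** 2}]\n",
    "\n",
    "squares = square_after_delay.run(value=list(range(10)))"
   ]
  },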
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3. Working with Different Input Formats\n",
    "\n",
    "\n",
    "Data Factory is flexible with how you provide inputs. Let's demonstrate different ways to pass parameters to data_factory functions.\n",
    "\n",
    "'data' is a reserved keyword expecting list(dict) or tuple(dict) - this design make it super easy to pass large data and support HuggingFace and Pandas dataframe very easily"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'answer': 'New York_5'}, {'answer': 'New York_2'}, {'answer': 'New York_3'}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## We will be using mock llm call for rest of example to save on token\n",
    "## Mock LLM call is a function that simulates an LLM API call with random delays (controlled by sleep_time) and occasional failures (controlled by fail_rate)\n",
    "await mock_llm_call(city_name=\"New York\", num_records_per_city=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "@data_factory(max_concurrency=100)\n",
    "async def input_format_mock_llm(city_name: str, num_records_per_city: int):\n",
    "    return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 05c84608-fec3-4010-8876-e59eed12bb6a\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:49\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Format 1: Multiple lists that get zipped together\n",
    "input_format_data1 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=[2, 1, 1, 1, 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: fedb98e5-c408-4bc8-9479-6087f4a298b7\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:50\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Format 2: List + single value (single value gets broadcasted)\n",
    "input_format_data2 = input_format_mock_llm.run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"], num_records_per_city=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 2f5cb7cc-83c9-4b7e-9ebb-386cd66bdd42\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:51\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/5\u001b[0m | \u001b[33mRunning: 5\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:52\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 5/5\u001b[0m | \u001b[33mAttempted: 5\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Format 3: Special 'data' parameter\n",
    "# 'data' is a reserved keyword expecting list(dict) or tuple(dict)\n",
    "# Makes integration with various data sources easier\n",
    "input_format_data3 = input_format_mock_llm.run(data=[{\"city_name\": \"New York\", \"num_records_per_city\": 2}, {\"city_name\": \"London\", \"num_records_per_city\": 1}, {\"city_name\": \"Tokyo\", \"num_records_per_city\": 1}, {\"city_name\": \"Paris\", \"num_records_per_city\": 1}, {\"city_name\": \"Sydney\", \"num_records_per_city\": 1}])"
   ]
  },
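  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because 'data' is just a list of dicts, tabular sources drop in with a one-line conversion. Below is a minimal sketch of the pandas route (pandas is assumed to be installed; it is not pulled in by starfish-core). A Hugging Face `Dataset` offers a similar conversion via its `to_list()` method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Hedged sketch: feeding a pandas DataFrame into the reserved 'data' parameter.\n",
    "## Assumes pandas is installed; reuses input_format_mock_llm from above.\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame({\n",
    "    \"city_name\": [\"New York\", \"London\", \"Tokyo\"],\n",
    "    \"num_records_per_city\": [2, 1, 1],\n",
    "})\n",
    "\n",
    "# to_dict(orient=\"records\") yields list(dict) - exactly what 'data' expects\n",
    "input_format_data4 = input_format_mock_llm.run(data=df.to_dict(orient=\"records\"))"
   ]
  },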
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4. Resilient error retry\n",
    "Data Factory automatically handles errors and retries, making your pipelines robust.\n",
    "\n",
    "Let's demonstrate with a high failure rate example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: 38c50ab6-f24b-4cba-a2c5-070130ab420e\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:56\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/25\u001b[0m | \u001b[33mRunning: 25\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 16/25\u001b[0m | \u001b[33mRunning: 9\u001b[0m | \u001b[36mAttempted: 16\u001b[0m    (\u001b[32mCompleted: 16\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Tokyo\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n",
      "\u001b[32m2025-04-25 10:16:59\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:02\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 23/25\u001b[0m | \u001b[33mRunning: 2\u001b[0m | \u001b[36mAttempted: 26\u001b[0m    (\u001b[32mCompleted: 23\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:03\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:05\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 25/25\u001b[0m | \u001b[33mAttempted: 29\u001b[0m (Failed: 4, Filtered: 0, Duplicate: 0)\u001b[0m\n",
      "\n",
      "Successfully completed 25 out of 25 tasks\n",
      "Data Factory automatically handled the failures and continued processing\n",
      "The results only include successful tasks\n"
     ]
    }
   ],
   "source": [
    "@data_factory(max_concurrency=100)\n",
    "async def high_error_rate_mock_llm(city_name: str, num_records_per_city: int):\n",
    "    return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3) # Hardcode to 30% chance of failure\n",
    "\n",
    "# Process all cities - some will fail, but data_factory keeps going\n",
    "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 5  # 25 cities\n",
    "high_error_rate_mock_lllm_data = high_error_rate_mock_llm.run(city_name=cities, num_records_per_city=1)\n",
    "\n",
    "print(f\"\\nSuccessfully completed {len(high_error_rate_mock_lllm_data)} out of {len(cities)} tasks\")\n",
    "print(\"Data Factory automatically handled the failures and continued processing\")\n",
    "print(\"The results only include successful tasks\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 5. Resume\n",
    "\n",
    "This is essential for long-running jobs with thousands of tasks.\n",
    "\n",
    "If a job is interrupted, you can pick up where you left off using one of two resume methods:\n",
    "\n",
    "\n",
    "1. **Same Session Resume**: If you're still in the same session where the job was interrupted, simply call - Same instance with .resume()\n",
    "\n",
    "2. **Cross-Session Resume**: If you've closed your notebook or lost your session, you can resume using the job ID:\n",
    "   ```python\n",
    "   from starfish import DataFactory\n",
    "   # Resume using the master job ID from a previous run\n",
    "   data_factory = DataFactory.resume_from_checkpoint(job_id=\"your_job_id\")\n",
    "   ```\n",
    "\n",
    "The key difference:\n",
    "- `resume()` uses the same DataFactory instance you defined\n",
    "- `resume_from_checkpoint()` reconstructs your DataFactory from persistent storage where tasks and progress are saved\n",
    "\n",
    "> **Note**: Google Colab users may experience issues with `resume_from_checkpoint()` due to how Colab works\n",
    "\n",
    "We're simulating an interruption here. In a real scenario, this might happen if your notebook errors out, is manually interrupted with a keyboard command, encounters API rate limits, or experiences any other issues that halt execution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: b2a400b3-32e7-45ee-b8e8-c2bc7afe9f11\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:12\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 17/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 17\u001b[0m    (\u001b[32mCompleted: 17\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError occurred: KeyboardInterrupt\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\u001b[0m\n",
      "\u001b[32m2025-04-25 10:17:15\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mAttempted: 20\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "@data_factory(max_concurrency=10)\n",
    "async def re_run_mock_llm(city_name: str, num_records_per_city: int):\n",
    "    return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
    "\n",
    "cities = [\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"] * 20  # 100 cities\n",
    "re_run_mock_llm_data_1 = re_run_mock_llm.run(city_name=cities, num_records_per_city=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "When a job is interrupted, you'll see a message like:\n",
      "[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\n",
      "\n",
      "To resume an interrupted job, simply call:\n",
      "interrupted_job_mock_llm.resume()\n",
      "\n",
      "For this example we have 20/100 data generated and not finished yet!\n"
     ]
    }
   ],
   "source": [
    "print(\"When a job is interrupted, you'll see a message like:\")\n",
    "print(\"[RESUME INFO] 🚨 Job stopped unexpectedly. You can resume the job by calling .resume()\")\n",
    "\n",
    "print(\"\\nTo resume an interrupted job, simply call:\")\n",
    "print(\"interrupted_job_mock_llm.resume()\")\n",
    "print('')\n",
    "print(f\"For this example we have {len(re_run_mock_llm_data_1)}/{len(cities)} data generated and not finished yet!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB RESUME START]\u001b[0m \u001b[33mPICKING UP FROM WHERE THE JOB WAS LEFT OFF...\u001b[0m\n",
      "\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[RESUME PROGRESS] STATUS AT THE TIME OF RESUME:\u001b[0m \u001b[32mCompleted: 20 / 100\u001b[0m | \u001b[31mFailed: 0\u001b[0m | \u001b[31mDuplicate: 0\u001b[0m | \u001b[33mFiltered: 0\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:00\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 20/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 20\u001b[0m    (\u001b[32mCompleted: 20\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 32/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 32\u001b[0m    (\u001b[32mCompleted: 32\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:03\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:06\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 56/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 59\u001b[0m    (\u001b[32mCompleted: 56\u001b[0m, \u001b[31mFailed: 3\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Sydney\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:08\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: London\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 69/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 74\u001b[0m    (\u001b[32mCompleted: 69\u001b[0m, \u001b[31mFailed: 5\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:09\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 89/100\u001b[0m | \u001b[33mRunning: 10\u001b[0m | \u001b[36mAttempted: 95\u001b[0m    (\u001b[32mCompleted: 89\u001b[0m, \u001b[31mFailed: 6\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:12\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: Paris\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:13\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[31m\u001b[1mError running task: Mock LLM failed to process city: New York\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 100/100\u001b[0m | \u001b[33mAttempted: 109\u001b[0m (Failed: 9, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "## Lets keep continue the rest of run by resume_from_checkpoint \n",
    "re_run_mock_llm_data_2 = re_run_mock_llm.resume()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Now we still able to finished with what is left!! 100 data generated!\n"
     ]
    }
   ],
   "source": [
    "print(f\"Now we still able to finished with what is left!! {len(re_run_mock_llm_data_2)} data generated!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 6. Dry run\n",
    "Before running a large job, you can do a \"dry run\" to test your pipeline. This only processes a single item and doesn't save state to the database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m\u001b[1m[JOB START]\u001b[0m \u001b[36mMaster Job ID: None\u001b[0m | \u001b[33mLogging progress every 3 seconds\u001b[0m\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:14\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB PROGRESS] \u001b[32mCompleted: 0/1\u001b[0m | \u001b[33mRunning: 1\u001b[0m | \u001b[36mAttempted: 0\u001b[0m    (\u001b[32mCompleted: 0\u001b[0m, \u001b[31mFailed: 0\u001b[0m, \u001b[35mFiltered: 0\u001b[0m, \u001b[34mDuplicate: 0\u001b[0m)\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1mSending telemetry event, TELEMETRY_ENABLED=true\u001b[0m\n",
      "\u001b[32m2025-04-25 10:18:15\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[1m[JOB FINISHED] \u001b[1mFinal Status:\u001b[0m \u001b[32mCompleted: 1/0\u001b[0m | \u001b[33mAttempted: 1\u001b[0m (Failed: 0, Filtered: 0, Duplicate: 0)\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "@data_factory(max_concurrency=10)\n",
    "async def dry_run_mock_llm(city_name: str, num_records_per_city: int):\n",
    "    return await mock_llm_call(city_name=city_name, num_records_per_city=num_records_per_city, fail_rate=0.3)\n",
    "\n",
    "dry_run_mock_llm_data = dry_run_mock_llm.dry_run(city_name=[\"New York\", \"London\", \"Tokyo\", \"Paris\", \"Sydney\"]*20, num_records_per_city=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 8. Advanced Usage\n",
    "Data Factory offers more advanced capabilities for complete pipeline customization, including hooks that execute at key stages and shareable state to coordinate between tasks. These powerful features enable complex workflows and fine-grained control. Our dedicated examples for advanced data_factory usage will be coming soon!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}