{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "d7223667431444a584ce297ac976621a": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_33fe94a457ea427497f9aff330406f52", "IPY_MODEL_3681661304c0497195d3ea8275a15f18", "IPY_MODEL_63d881ecb4eb43adbafec01c3992a6fe" ], "layout": "IPY_MODEL_fa700d890a034a49b3a57a30a451f974" } }, "33fe94a457ea427497f9aff330406f52": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_093e16dc717a4b02a5692e8e780eeacb", "placeholder": "​", "style": "IPY_MODEL_80fcf29776be4cc381599ae4dbf4a69b", "value": "Map: 100%" } }, "3681661304c0497195d3ea8275a15f18": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", 
"description": "", "description_tooltip": null, "layout": "IPY_MODEL_be530988e82f4080838b0cb6b576bade", "max": 13, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_175ad6b915e741fb9a1cd3f8014f96e3", "value": 13 } }, "63d881ecb4eb43adbafec01c3992a6fe": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_34c943c2735247469764572f931ce3b0", "placeholder": "​", "style": "IPY_MODEL_cfc0da6b23f1472d9e2f28d080b642af", "value": " 13/13 [00:00<00:00, 80.43 examples/s]" } }, "fa700d890a034a49b3a57a30a451f974": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": 
null } }, "093e16dc717a4b02a5692e8e780eeacb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "80fcf29776be4cc381599ae4dbf4a69b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "be530988e82f4080838b0cb6b576bade": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, 
"align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "175ad6b915e741fb9a1cd3f8014f96e3": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "34c943c2735247469764572f931ce3b0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cfc0da6b23f1472d9e2f28d080b642af": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e7f33442629342b2a1fc5e2db70b03bb": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_56aa2a323c23404397f2046760c9ccc1", "IPY_MODEL_005be63c3678474aa8ab153e8dd9df6d", "IPY_MODEL_e6d69d094d2f4136beb40e54323c0ff0" ], "layout": "IPY_MODEL_0af742ff483d4a21a874f3c9016b5094" } }, "56aa2a323c23404397f2046760c9ccc1": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b5efac91cdc8478cb44b5fe1ab6a91b9", "placeholder": "​", "style": "IPY_MODEL_0290a7fe07ef408ca947643cade94440", 
"value": "Map: 100%" } }, "005be63c3678474aa8ab153e8dd9df6d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_247b8c26e7af481baa2e7984884b00b8", "max": 13, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_03fa22c663b0428a8a32dfd18da39b02", "value": 13 } }, "e6d69d094d2f4136beb40e54323c0ff0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_23ba120643d549488da26464645f5a48", "placeholder": "​", "style": "IPY_MODEL_0d1f526d3714463f867035180e204dda", "value": " 13/13 [00:00<00:00, 89.96 examples/s]" } }, "0af742ff483d4a21a874f3c9016b5094": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": 
null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b5efac91cdc8478cb44b5fe1ab6a91b9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0290a7fe07ef408ca947643cade94440": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": 
"1.2.0", "_view_name": "StyleView", "description_width": "" } }, "247b8c26e7af481baa2e7984884b00b8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "03fa22c663b0428a8a32dfd18da39b02": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "23ba120643d549488da26464645f5a48": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": 
"1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0d1f526d3714463f867035180e204dda": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Nf8y_WJj25sB", "outputId": "0cbd4a16-0c6b-4a64-fc7c-1fbb3e1e38d9" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.0.0)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.4.0+cu121)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.0)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in 
/usr/local/lib/python3.10/dist-packages (from transformers) (0.24.6)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.5.15)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.5)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.1.4)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n", "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.5)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in 
/usr/local/lib/python3.10/dist-packages (from torch) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.13.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.11.1)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.8)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.5)\n", "Requirement already 
satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n" ] } ], "source": [ "%pip install transformers datasets torch\n" ] }, { "cell_type": "code", "source": [ "from transformers import GPT2Tokenizer\n", "from datasets import load_dataset\n", "\n", "def setup_tokenizer(tokenizer):\n", " # GPT-2 ships without a pad token; reuse EOS so padding works in batches.\n", "\n", " if tokenizer.pad_token is None:\n", "\n", " tokenizer.pad_token = tokenizer.eos_token\n", " tokenizer.pad_token_id = tokenizer.eos_token_id\n", "\n", "def load_text_dataset(file_path, tokenizer):\n", " setup_tokenizer(tokenizer)\n", "\n", " dataset = load_dataset('text', data_files={'train': file_path}, split='train')\n", "\n", " def tokenize_function(examples):\n", " return tokenizer(examples['text'], padding=\"max_length\", truncation=True, max_length=512)\n", "\n", " tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", " return tokenized_datasets\n" ], "metadata": { "id": "krFitKaL368W" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "import os\n", "import torch\n", "from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments\n", "\n", "def main():\n", "\n", " tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n", " model = GPT2LMHeadModel.from_pretrained('gpt2')\n", "\n", "\n", " setup_tokenizer(tokenizer)\n", "\n", "\n", " file_path = '/content/Stories.txt'\n", "\n", "\n", " train_dataset = load_text_dataset(file_path, tokenizer)\n", "\n", " # 
data collator\n", " data_collator = DataCollatorForLanguageModeling(\n", " tokenizer=tokenizer,\n", " mlm=False\n", " )\n", "\n", " # training arguments\n", " training_args = TrainingArguments(\n", " output_dir='./story-generator-model',\n", " overwrite_output_dir=True,\n", " num_train_epochs=3, # Adjust epochs based on your needs\n", " per_device_train_batch_size=4,\n", " save_steps=10_000,\n", " save_total_limit=2,\n", " prediction_loss_only=True,\n", " )\n", "\n", " # Init Trainer\n", " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " data_collator=data_collator,\n", " train_dataset=train_dataset,\n", " )\n", "\n", "\n", " trainer.train()\n", "\n", "\n", " model.save_pretrained('./story-generator-model')\n", " tokenizer.save_pretrained('./story-generator-model')\n", "\n", "if __name__ == \"__main__\":\n", " main()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 163, "referenced_widgets": [ "d7223667431444a584ce297ac976621a", "33fe94a457ea427497f9aff330406f52", "3681661304c0497195d3ea8275a15f18", "63d881ecb4eb43adbafec01c3992a6fe", "fa700d890a034a49b3a57a30a451f974", "093e16dc717a4b02a5692e8e780eeacb", "80fcf29776be4cc381599ae4dbf4a69b", "be530988e82f4080838b0cb6b576bade", "175ad6b915e741fb9a1cd3f8014f96e3", "34c943c2735247469764572f931ce3b0", "cfc0da6b23f1472d9e2f28d080b642af" ] }, "id": "rimcl3SH3-nY", "outputId": "0b2df87b-ce3e-4c10-b97f-359dea4ae8e4" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "Map: 0%| | 0/13 [00:00" ], "text/html": [ "\n", "
\n", " \n", " \n", " [12/12 06:42, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss

" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling\n", "\n", "def evaluate_model(model, tokenizer, test_dataset):\n", " # data collator\n", " data_collator = DataCollatorForLanguageModeling(\n", " tokenizer=tokenizer,\n", " mlm=False\n", " )\n", "\n", " # training arguments (used for evaluation only)\n", " training_args = TrainingArguments(\n", " output_dir='./results',\n", " per_device_eval_batch_size=4,\n", " logging_dir='./logs',\n", " )\n", "\n", " # Trainer instance for evaluation\n", " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " data_collator=data_collator,\n", " eval_dataset=test_dataset,\n", " )\n", "\n", "\n", " eval_results = trainer.evaluate()\n", " return eval_results\n", "\n", "def main():\n", "\n", " tokenizer = GPT2Tokenizer.from_pretrained('./story-generator-model')\n", " model = GPT2LMHeadModel.from_pretrained('./story-generator-model')\n", "\n", "\n", " file_path = '/content/Stories.txt'\n", " test_dataset = load_text_dataset(file_path, tokenizer)\n", "\n", "\n", " eval_results = evaluate_model(model, tokenizer, test_dataset)\n", " print(\"Evaluation Results:\")\n", " print(eval_results)\n", "\n", "if __name__ == \"__main__\":\n", " main()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 125, "referenced_widgets": [ "e7f33442629342b2a1fc5e2db70b03bb", "56aa2a323c23404397f2046760c9ccc1", "005be63c3678474aa8ab153e8dd9df6d", "e6d69d094d2f4136beb40e54323c0ff0", "0af742ff483d4a21a874f3c9016b5094", "b5efac91cdc8478cb44b5fe1ab6a91b9", "0290a7fe07ef408ca947643cade94440", "247b8c26e7af481baa2e7984884b00b8", "03fa22c663b0428a8a32dfd18da39b02", "23ba120643d549488da26464645f5a48", "0d1f526d3714463f867035180e204dda" ] }, "id": "RXu3cind2-P4", "outputId": "f4583504-af26-43af-eb3f-7ac730332b30" }, "execution_count": 9, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Map: 0%| | 0/13 
[00:00" ], "text/html": [ "\n", "

\n", " \n", " \n", " [4/4 00:26]\n", "
\n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Evaluation Results:\n", "{'eval_loss': 1.936555027961731, 'eval_model_preparation_time': 0.0042, 'eval_runtime': 39.3769, 'eval_samples_per_second': 0.33, 'eval_steps_per_second': 0.102}\n" ] } ] }, { "cell_type": "code", "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n", "import torch\n", "\n", "def generate_text(prompt, model, tokenizer, max_length=1000, num_return_sequences=1, temperature=0.7, repetition_penalty=1.2):\n", "\n", " inputs = tokenizer.encode(prompt, return_tensors='pt')\n", "\n", " # Generate text\n", " with torch.no_grad():\n", " outputs = model.generate(\n", " inputs,\n", " max_length=max_length,\n", " num_return_sequences=num_return_sequences,\n", " temperature=temperature,\n", " repetition_penalty=repetition_penalty,\n", " top_k=50, # Use top_k sampling to limit to top-k probabilities\n", " top_p=0.95, # Use nucleus sampling to limit to top-p cumulative probability\n", " do_sample=True, # Enable sampling for varied text generation\n", " pad_token_id=tokenizer.eos_token_id # Handle padding correctly\n", " )\n", "\n", " # Decode the generated text\n", " generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]\n", "\n", " return generated_texts\n", "\n", "def main():\n", " output_dir = './results' # Directory where the model and tokenizer are saved\n", "\n", "\n", " model, tokenizer = load_model_and_tokenizer(output_dir)\n", "\n", "\n", " prompt = \"Once upon a time\"\n", "\n", " generated_texts = generate_text(prompt, model, tokenizer, max_length=1000, num_return_sequences=1, temperature=0.7, repetition_penalty=1.2)\n", "\n", "\n", " for i, text in enumerate(generated_texts):\n", " print(f\"Generated Text {i + 1}:\\n{text}\\n\")\n", "\n", "if __name__ == \"__main__\":\n", " main()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GlKwC09_7Af_", "outputId": 
"943556a7-ce5d-4e9e-8847-814d65dd8a91" }, "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Generated Text 1:\n", "Once upon a time, the people of Noxus began to question their existence. This was when they discovered an ancient artifact that had been hidden in some ruins and sought out its creator for guidance on how he might fulfill his destiny as ruler of all mankind.[1]\n", " (TNG: \"Unification\") The artifacts were brought back from beyond space by Romulan forces with hopes it would provide them peace; however this hope turned into bloodshed after both sides saw each other's true potentials within themselves. Ultimately failing at finding any meaning or purpose behind these discoveries, there came together under one banner -- bringing about what became known as Dominion War - where humanity took control over two continents across which thousands fought against invaders who wanted nothing more than subjugation via conquest and colonization. As war raged throughout history along similar lines, tensions flared up between nations seeking shared goals while struggling through intense conflict during periods of crisis such even wars could only be resolved peacefully once allies emerged victorious alongside new alliances led solely towards greater unity among peoples around common values including honor, respect & love. In order not too far off worlds like Earth-616,[3][4] Prime Directive declared victory following revelations regarding New Mombasa' invasion aboard USS Voyager[5]. Following battle many years later before returning home again due to distress caused shortly thereafter...The remaining crewmen faced challenges ranging from petty disagreements amongst friends living side by party fighting factions plagued almost exclusively by distrust toward outsiders alike! 
Despite having no idea why humans existed besides being members of various races united beneath powerful ideals surrounding race relations,...After much thought exploring further revealed itself deep down inside another dimension whose inhabitants believed so strongly but did little heed those beliefs until finally discovering something surprising… A small group headed westward attempting yet others awaited arrival....In 2352 BCE VOY : OCEAN CULTURE, Admiral Jonathan Archer traveled southbound looking forward never to find signs indicating impending danger near Vulcan City. Once alerted several ships arrived en route bound directly opposite stations waiting patiently awaiting news concerning major developments approaching planetary boundaries if necessary. However things quickly escalated exponentially becoming deadly encounters erupted onto entire planets threatening countless lives without warning..While piloting starship Enterprise NX04HV2 encountered numerous threats ahead leading her astray she soon found herself embroiled amid growing hostilities involving hundreds of alien species battling hordes of savage beasts lurking just miles away.(TSO)During Starfleet Academy students created long lasting friendships spanning centuries--one student is now remembered fondly because of him guiding young girls named Krellia Sanguinius thru uncharted lands filled full speed with tales of lost civilizations driven mad despite decades of hard work inspired by world events.
Krelia has always held great promise amidst generations of hardships presented daily unlike anything seen since birth thanks largelyto knowledge gained alone rather then aided greatly every day.

As captain of Captain America/Captain Marvel #10 you'll meet old foes face to confront newfound dangers await your journey wherever heroes seek refuge.\",\"name\":\"kreniac\",\"link\":\"/en_US\\/about\\u003cp{protection}&action=protect%20yourself+from*thissetInteriorColor(-65%) + \"_blond\"};function protect() { var oo=[], mrc='\", rbcs={}, cts[]=(new Date().getTime(), dateFormat('UTC'), 0), ngs []({ 'date': 1000}); srs([])[\"type\"] += 1 << 9 ^ 2;} function updateInfo(){ hpragma(\"data:{i:\"title\"}\".formatString((null)[strlen(nags)))? \"\"\": \"\" }, null ); //Set background color check variable vpncolor=#000000 ; setBackground(); } else scriptBlockDefinition&& (!blockCreateNode()); blockCreepedProposition &&!scriptDeleteStateIdToPlay (); void main () {} /** * @param string $nodeName We can add text nodes using `@string` */ public static bool playOnPremiseEventArgs ({ start ) { preprocessArguments ('start'); getContentByTagline (\"\\\", \\\"playonpremises\\\":true,\"userid\"); return false }; /* Update info file node name localStorage := GetLocalSpaceManager::GetNewItemFromLocation ($localStorage).FindAllNodesForURL (&globalStrategyPath /optpath/$remoteCacheDir –replace '/tmp/*', \"$tokenToken\").OpenFileSync(&GlobalStrategieDirectoryObjectData\n", "\n" ] } ] }, { "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n", "\n", "def save_model(model, tokenizer, output_dir):\n", " model.save_pretrained(output_dir)\n", " tokenizer.save_pretrained(output_dir)\n", "\n", "def load_model_and_tokenizer(output_dir):\n", " model = GPT2LMHeadModel.from_pretrained(output_dir)\n", " tokenizer = GPT2Tokenizer.from_pretrained(output_dir)\n", " return model, tokenizer\n", "\n", "model = GPT2LMHeadModel.from_pretrained('./story-generator-model')\n", "tokenizer = GPT2Tokenizer.from_pretrained('./story-generator-model')\n", "save_model(model, tokenizer, './results')\n", "model, tokenizer = load_model_and_tokenizer('./results')" ], "cell_type": "code", "metadata": { 
"id": "UpZZnhGo6NnU" }, "execution_count": 13, "outputs": [] } ] }