{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "6a04ce68a3fd4e8a85eab7ef95b460bf": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8d334cffae1e43d3965926cf47915d87", "IPY_MODEL_9c4084b9c03c4b7d9fb6351d67e51181", "IPY_MODEL_1af047bf93ef458ab60dec0419e162c0" ], "layout": "IPY_MODEL_ad9c688d0f8b4a94819008130cedbada" } }, "8d334cffae1e43d3965926cf47915d87": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7fd0861c22a94f639cae337d38815752", "placeholder": "​", "style": "IPY_MODEL_13d05e0960ac4220a3874135a233b5b2", "value": "100%" } }, "9c4084b9c03c4b7d9fb6351d67e51181": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": 
"", "description_tooltip": null, "layout": "IPY_MODEL_46b40383a6ce4ca2ae35faa715c87f87", "max": 7, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_2e2ec919da9240e48f7f85464bd8376b", "value": 7 } }, "1af047bf93ef458ab60dec0419e162c0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_190b28c62a7548599de3cf1fe701c9e3", "placeholder": "​", "style": "IPY_MODEL_50c3e507389643ceaf02095a60d63045", "value": " 7/7 [00:32<00:00,  4.32s/it]" } }, "ad9c688d0f8b4a94819008130cedbada": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"7fd0861c22a94f639cae337d38815752": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "13d05e0960ac4220a3874135a233b5b2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "46b40383a6ce4ca2ae35faa715c87f87": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2e2ec919da9240e48f7f85464bd8376b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "190b28c62a7548599de3cf1fe701c9e3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "50c3e507389643ceaf02095a60d63045": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "d67f5fd0199a41bb949a974fbd3ffab2": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_2113f2a1b6ab40fd8232aeec27a18443", "IPY_MODEL_a9d7ad46be6747ffbab4b2c8614ddce8", "IPY_MODEL_a6ff8a0f04944a7ea4c310d21d6ffd0b" ], "layout": "IPY_MODEL_05325e3c286d49d1a2f4b07ca83f4932" } }, "2113f2a1b6ab40fd8232aeec27a18443": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fbd9cc97dc474c7d99a7e2a9c4c9e598", "placeholder": "​", "style": "IPY_MODEL_1ac94815802c4d60b605e5a6bf375349", "value": "Map: 100%" } 
}, "a9d7ad46be6747ffbab4b2c8614ddce8": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bf3fd415224e4c379013624bf4701a80", "max": 10, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_4d35d12cf5774b3ba9958bced27b2444", "value": 10 } }, "a6ff8a0f04944a7ea4c310d21d6ffd0b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c2fd01b3bf494abdbae47574a8cdb367", "placeholder": "​", "style": "IPY_MODEL_01248f0d5d33408b8038c1aefcf52888", "value": " 10/10 [00:00<00:00, 290.58 examples/s]" } }, "05325e3c286d49d1a2f4b07ca83f4932": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": 
null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fbd9cc97dc474c7d99a7e2a9c4c9e598": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1ac94815802c4d60b605e5a6bf375349": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "StyleView", "description_width": "" } }, "bf3fd415224e4c379013624bf4701a80": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4d35d12cf5774b3ba9958bced27b2444": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c2fd01b3bf494abdbae47574a8cdb367": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "01248f0d5d33408b8038c1aefcf52888": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "BiY1TYKVyZnF" }, "outputs": [], "source": [ "%pip install praw\n", "%pip install pinecone\n", "%pip install semantic-router\n", "%pip install datasets" ] }, { "cell_type": "code", "source": [ "%pip install numpy==1.26.0" ], "metadata": { "id": "k_b_EgJVTnwQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "%pip install \"semantic-router[local]\"" ], "metadata": { "id": "yb9c4QahkH6p" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Collecting Data" ], "metadata": { "id": "GkgPDkwzHwHX" } }, { "cell_type": "code", "source": [ "import praw\n", "from google.colab import userdata\n", "\n", "reddit = praw.Reddit(\n", " 
client_id=userdata.get('REDDIT_CLIENT_ID'),\n", " client_secret=userdata.get('REDDIT_CLIENT_SECRET'),\n", " user_agent=userdata.get('REDDIT_USER_AGENT'),\n", ")" ], "metadata": { "id": "bmzp0IKW04XI" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"def submissionToDict(submission):\n",
"    \"\"\"Convert a PRAW submission into a plain dict record.\n",
"\n",
"    Returns {'id': ..., 'metadata': {'title': ..., 'content': ...}} where\n",
"    'content' is the body of every Comment in the submission's comment\n",
"    list, joined with newlines.\n",
"    \"\"\"\n",
"    # comments.list() can contain non-Comment entries (e.g. MoreComments\n",
"    # placeholders) that have no body, so keep Comment objects only.\n",
"    comment_bodies = [\n",
"        comment.body\n",
"        for comment in submission.comments.list()\n",
"        if isinstance(comment, praw.models.Comment)\n",
"    ]\n",
"    return {\n",
"        'id': submission.id,\n",
"        'metadata': {\n",
"            'title': submission.title,\n",
"            'content': '\\n'.join(comment_bodies),\n",
"        },\n",
"    }"
], "metadata": { "id": "D-n4R8Gh7JQH" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"from IPython.display import clear_output\n",
"\n",
"SUBREDDIT_NAME = \"AskNYC\"\n",
"SUBMISSION_LIMIT = 10  # number of hot submissions to fetch\n",
"\n",
"# Rebuilt from scratch on every run, so re-executing this cell never\n",
"# appends duplicate records.\n",
"submission_records = []\n",
"subreddit = reddit.subreddit(SUBREDDIT_NAME)\n",
"for submission in subreddit.hot(limit=SUBMISSION_LIMIT):\n",
"    submission_records.append(submissionToDict(submission))\n",
"\n",
"clear_output()"
], "metadata": { "id": "PrZKgKJO7Ygh" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"from datasets import Dataset\n",
"\n",
"# Each record already has the {'id', 'metadata': {'title', 'content'}} layout,\n",
"# so the list converts directly; no extra map() or remove_columns() step is needed.\n",
"data = Dataset.from_list(submission_records)\n",
"\n",
"print(data)\n"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 120, 
"referenced_widgets": [ "d67f5fd0199a41bb949a974fbd3ffab2", "2113f2a1b6ab40fd8232aeec27a18443", "a9d7ad46be6747ffbab4b2c8614ddce8", "a6ff8a0f04944a7ea4c310d21d6ffd0b", "05325e3c286d49d1a2f4b07ca83f4932", "fbd9cc97dc474c7d99a7e2a9c4c9e598", "1ac94815802c4d60b605e5a6bf375349", "bf3fd415224e4c379013624bf4701a80", "4d35d12cf5774b3ba9958bced27b2444", "c2fd01b3bf494abdbae47574a8cdb367", "01248f0d5d33408b8038c1aefcf52888" ] }, "id": "Aw2zNQW0MRgv", "outputId": "655bf508-a1a2-4ef3-fc46-0db101321d0c" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Map: 0%| | 0/10 [00:00