{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RVNImn10HJcI", "outputId": "b01c1eb5-05dd-419b-faa8-c08ae8df50aa" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: tensorflow in /usr/local/lib/python3.11/dist-packages (2.18.0)\n", "Requirement already satisfied: keras in /usr/local/lib/python3.11/dist-packages (3.8.0)\n", "Requirement already satisfied: absl-py>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.4.0)\n", "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.6.3)\n", "Requirement already satisfied: flatbuffers>=24.3.25 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (25.2.10)\n", "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.6.0)\n", "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.2.0)\n", "Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (18.1.1)\n", "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.4.0)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from tensorflow) (24.2)\n", "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (5.29.4)\n", "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.32.3)\n", "Requirement already satisfied: 
setuptools in /usr/local/lib/python3.11/dist-packages (from tensorflow) (75.2.0)\n", "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.17.0)\n", "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.0.1)\n", "Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (4.13.2)\n", "Requirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.17.2)\n", "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.71.0)\n", "Requirement already satisfied: tensorboard<2.19,>=2.18 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.18.0)\n", "Requirement already satisfied: numpy<2.1.0,>=1.26.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.0.2)\n", "Requirement already satisfied: h5py>=3.11.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.13.0)\n", "Requirement already satisfied: ml-dtypes<0.5.0,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.4.1)\n", "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.37.1)\n", "Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from keras) (13.9.4)\n", "Requirement already satisfied: namex in /usr/local/lib/python3.11/dist-packages (from keras) (0.0.9)\n", "Requirement already satisfied: optree in /usr/local/lib/python3.11/dist-packages (from keras) (0.15.0)\n", "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from astunparse>=1.6.0->tensorflow) (0.45.1)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.4.1)\n", "Requirement already satisfied: 
idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (2.4.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorflow) (2025.4.26)\n", "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.19,>=2.18->tensorflow) (3.8)\n", "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.19,>=2.18->tensorflow) (0.7.2)\n", "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.19,>=2.18->tensorflow) (3.1.3)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->keras) (3.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->keras) (2.19.1)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->keras) (0.1.2)\n", "Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.11/dist-packages (from werkzeug>=1.0.1->tensorboard<2.19,>=2.18->tensorflow) (3.0.2)\n" ] } ], "source": [ "!pip install tensorflow keras\n", "import numpy as np\n", "import tensorflow as tf\n", "from tensorflow import keras\n", "\n", "# Data processing and visualization imports\n", "import string\n", "import pandas as pd\n", "import plotly.express as px\n", "import tensorflow.data as tfd\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Model building imports\n", "from sklearn.utils import class_weight\n", "from tensorflow.keras import callbacks\n", 
"from tensorflow.keras import Model, layers" ] }, { "cell_type": "code", "source": [
"# Model hyperparameters\n",
"num_heads = 4        # attention heads in the Transformer layer\n",
"embed_dim = 256      # width of the token / positional embeddings\n",
"ff_dim = 128         # hidden units of the feed-forward sub-network\n",
"vocab_size = 10000   # max_tokens of the TextVectorization layer\n",
"max_seq_len = 40     # tokens per message after vectorization\n",
"\n",
"# Training constants\n",
"# NOTE(review): the training cell hard-codes epochs=10 and optimizer='adam', so\n",
"# `learning_rate` and `epochs` are defined here but not consumed by compile/fit.\n",
"learning_rate = 1e-3\n",
"epochs = 100\n",
"batch_size = 32\n",
"\n",
"# Training callbacks\n",
"# NOTE(review): this rebinds `callbacks`, shadowing the `tensorflow.keras.callbacks`\n",
"# module imported in the first cell; re-running that import cell afterwards would\n",
"# make model.fit receive the module instead of this list.\n",
"callbacks = [\n",
"    # Stop after 3 epochs without improvement and roll back to the best weights\n",
"    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),\n",
"    # Only overwrite the checkpoint file when the monitored quantity improves\n",
"    keras.callbacks.ModelCheckpoint(\"SpamDetector.h5\", save_best_only=True)\n",
"]" ], "metadata": { "id": "KXZX-MiaIB9a" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "random_seed = 123\n", "np.random.seed(random_seed)\n", "tf.random.set_seed(random_seed)" ], "metadata": { "id": "AXke-pwyIZ_I" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [
"# Load the dataset with pandas, reading the file as 'latin-1'.\n",
"# Only the label (v1) and text (v2) columns are kept: the remaining 'Unnamed: N'\n",
"# columns are not used anywhere in this notebook.\n",
"# The rename is chained (no inplace=True) so re-running this cell is idempotent.\n",
"data_frame = (\n",
"    pd.read_csv('spam.csv', encoding='latin-1', usecols=['v1', 'v2'])\n",
"    .rename(columns={'v1': 'Category', 'v2': 'Messages'})\n",
")\n",
"\n",
"# Print the first five rows of the dataset\n",
"print(data_frame.head())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IWIFoa10Ien6", "outputId": "068ee878-6a88-4688-8830-81f6cddb26d0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Category Messages Unnamed: 2 \\\n", "0 ham Go until jurong point, crazy.. Available only ... NaN \n", "1 ham Ok lar... Joking wif u oni... NaN \n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n", "3 ham U dun say so early hor... U c already then say... NaN \n", "4 ham Nah I don't think he goes to usf, he lives aro... 
NaN \n", "\n", " Unnamed: 3 Unnamed: 4 \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n" ] } ] }, { "cell_type": "code", "source": [ "\n", "class_dis = data_frame.Category.value_counts()\n", "class_names = class_dis.index\n", "\n", "# Create the Pie Chart\n", "fig = px.pie(names=class_names,\n", " values=class_dis,\n", " color=class_names,\n", " hole=0.4,\n", " labels={'value': 'Count', 'names': 'Class'},\n", " title='Class Distribution of Spam Text Messages')\n", "\n", "# Customize the layout\n", "fig.update_layout(\n", " margin=dict(l=10, r=10, t=60, b=10),\n", " legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"right\", x=1),\n", ")\n", "\n", "# Show the plot\n", "fig.show()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 542 }, "id": "AytubNcyIiFE", "outputId": "fd32bc9d-0061-4778-ff22-52004bb71f5a" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", "\n", "" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "N_SAMPLES = len(data_frame)\n", "\n", "print(f\"Total Number of Samples : {N_SAMPLES}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yW73nhrwKo2U", "outputId": "fd711c6f-4bfb-4f8f-e839-5aecd01fd3ab" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Total Number of Samples : 5572\n" ] } ] }, { "cell_type": "code", "source": [ "max_len = max([len(text) for text in data_frame.Messages])\n", "print(f\"Maximum Length Of Input Sequence(Chars) : {max_len}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "J_qfT-V9MCJK", "outputId": "c1456930-6ece-4118-b018-6dc484507559" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Maximum Length Of Input Sequence(Chars) : 910\n" ] } ] }, { "cell_type": "code", "source": [ "X = data_frame['Messages'].tolist()\n", "y = data_frame['Category'].tolist()\n", "\n", "\n", "# Initialize label encoder\n", "label_encoder = LabelEncoder()\n", "y = label_encoder.fit_transform(y)\n", "\n", "# Print the first 5 elements of X and y\n", "print(f'X[:5]: \\n{X[:5]}\\n')\n", "print(f'y[:5]: {y[:5]}\\n')\n", "print(f\"Label Mapping : {label_encoder.inverse_transform(y[:5])}\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RdZ3CLu3MFz2", "outputId": "e2092b5e-f1e2-4a66-d45c-6398e2e8d47a" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "X[:5]: \n", "['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', \"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\", 'U dun say so early hor... 
U c already then say...', \"Nah I don't think he goes to usf, he lives around here though\"]\n", "\n", "y[:5]: [0 0 1 0 0]\n", "\n", "Label Mapping : ['ham' 'ham' 'spam' 'ham' 'ham']\n" ] } ] }, { "cell_type": "code", "source": [
"# Balanced class weights, keyed by the integer ids that LabelEncoder produced.\n",
"# Computing them on the encoded labels keeps keys and weights aligned by construction;\n",
"# enumerating `Category.unique()` would rely on order-of-first-appearance happening\n",
"# to match LabelEncoder's sorted class order.\n",
"encoded_classes = np.unique(y)\n",
"balanced_weights = class_weight.compute_class_weight(class_weight='balanced', classes=encoded_classes, y=y)\n",
"class_weights = {int(class_id): float(weight) for class_id, weight in zip(encoded_classes, balanced_weights)}\n",
"# Show\n",
"print(f\"Associated class weights: {class_weights}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7YbFnWsFNIHI", "outputId": "402b004f-3dee-4a7c-cfc2-1126ffd23006" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Associated class weights: {0: np.float64(0.5774093264248704), 1: np.float64(3.7295850066934406)}\n" ] } ] }, { "cell_type": "code", "source": [
"import re  # stdlib; imported here because this is the only cell that needs it\n",
"\n",
"# Character class matching any single ASCII punctuation character.\n",
"# re.escape() matters: without it the backslash inside string.punctuation escapes the\n",
"# following ']' instead of being matched itself, so backslashes survive preprocessing.\n",
"PUNCTUATION_PATTERN = f\"[{re.escape(string.punctuation)}]\"\n",
"\n",
"\n",
"def preprocess_text(text: str) -> str:\n",
"    \"\"\"\n",
"    Standardize raw text before tokenization.\n",
"\n",
"    Used as the `standardize` callable of the TextVectorization layer below.\n",
"    Every punctuation character is replaced by a space, the text is lowercased,\n",
"    and leading/trailing whitespace is stripped.\n",
"\n",
"    Args:\n",
"        text: the raw text to clean (processed with tf.strings ops).\n",
"\n",
"    Returns:\n",
"        The cleaned text, as returned by tf.strings.strip.\n",
"    \"\"\"\n",
"    # Replace punctuation with spaces\n",
"    text = tf.strings.regex_replace(text, PUNCTUATION_PATTERN, \" \")\n",
"\n",
"    # Lowercase the text\n",
"    text = tf.strings.lower(text)\n",
"\n",
"    # Strip leading/trailing whitespace\n",
"    text = tf.strings.strip(text)\n",
"\n",
"    return text\n",
"\n",
"\n",
"# Create a TextVectorization layer\n",
"# (pad_to_max_tokens is intentionally not passed: it only applies to the\n",
"# multi_hot / count / tf_idf output modes, not to output_mode='int'.)\n",
"text_vectorizer = layers.TextVectorization(\n",
"    max_tokens=vocab_size,               # Maximum vocabulary size\n",
"    output_sequence_length=max_seq_len,  # Pad/truncate every message to this many tokens\n",
"    standardize=preprocess_text,         # Custom text preprocessing function\n",
"    output_mode='int'                    # Output integer-encoded sequences\n",
")\n",
"\n",
"# Adapt the TextVectorization layer to the data\n",
"text_vectorizer.adapt(X)" ], "metadata": { "id": "F_XmRN2FNNZ5" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", 
"source": [ "for _ in range(5):\n", " # Send a text to randomly.\n", " text_temp = X[np.random.randint(N_SAMPLES)]\n", "\n", " # Apply text to vectorization.\n", " text_vec_temp = text_vectorizer(text_temp)\n", "\n", " # Show the results\n", " print(f\"Original Text: {text_temp}\")\n", " print(f\"Vectorized Text: {text_vec_temp}\\n\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4Epp7fX6NSa3", "outputId": "0afc39db-6ca2-4b70-ac5e-a7c98e69c4fe" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Original Text: I sent your maga that money yesterday oh.\n", "Vectorized Text: [ 2 197 15 2578 19 229 513 136 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0]\n", "\n", "Original Text: No need lar. Jus testing e phone card. Dunno network not gd i thk. Me waiting 4 my sis 2 finish bathing so i can bathe. Dun disturb u liao u cleaning ur room.\n", "Vectorized Text: [ 42 81 342 365 3108 152 116 702 421 441 29 659 2 267\n", " 11 257 45 12 682 22 316 2211 26 2 25 1075 252 1152\n", " 7 349 7 2181 40 371 0 0 0 0 0 0]\n", "\n", "Original Text: Is it ok if I stay the night here? Xavier has a sleeping bag and I'm getting tired\n", "Vectorized Text: [ 10 13 50 37 2 528 6 127 117 2298 126 5 600 1450\n", " 8 2 30 278 832 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0]\n", "\n", "Original Text: Aight I've been set free, think you could text me blake's address? 
It occurs to me I'm not quite as sure what I'm doing as I thought I was\n", "Vectorized Text: [ 386 2 154 114 601 53 112 4 230 75 11 2833 20 567\n", " 13 6030 3 11 2 30 29 362 76 192 51 2 30 163\n", " 76 2 280 2 64 0 0 0 0 0 0 0]\n", "\n", "Original Text: I accidentally brought em home in the box\n", "Vectorized Text: [ 2 2237 2192 1063 82 9 6 367 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "VOCAB = text_vectorizer.get_vocabulary()\n", "\n", "# Let's have a look at the tokens present in the vocabulary\n", "print(f\"Vocabulary size: {len(VOCAB)}\")\n", "print(f\"Vocabulary: {VOCAB[150:200]}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "efgjn27ANXkQ", "outputId": "89be8f9e-af19-4de4-8cc4-63d0e6263994" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Vocabulary size: 8862\n", "Vocabulary: [np.str_('should'), np.str_('message'), np.str_('e'), np.str_('won'), np.str_('ve'), np.str_('tomorrow'), np.str_('prize'), np.str_('say'), np.str_('right'), np.str_('already'), np.str_('after'), np.str_('ask'), np.str_('said'), np.str_('doing'), np.str_('cash'), np.str_('yeah'), np.str_('3'), np.str_('really'), np.str_('amp'), np.str_('why'), np.str_('meet'), np.str_('b'), np.str_('them'), np.str_('im'), np.str_('very'), np.str_('find'), np.str_('life'), np.str_('let'), np.str_('babe'), np.str_('last'), np.str_('thanks'), np.str_('morning'), np.str_('\\\\'), np.str_('would'), np.str_('cos'), np.str_('win'), np.str_('miss'), np.str_('uk'), np.str_('lol'), np.str_('anything'), np.str_('also'), np.str_('every'), np.str_('sure'), np.str_('pick'), np.str_('com'), np.str_('care'), np.str_('150p'), np.str_('sent'), np.str_('nokia'), np.str_('urgent')]\n" ] } ] }, { "cell_type": "code", "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)\n", "\n", "# 
Apply the Text Vectorization\n", "X_train = text_vectorizer(X_train)\n", "X_test = text_vectorizer(X_test)\n", "\n",
"# NOTE: the one-hot copies (Xoh_train / Xoh_test) were dropped. Nothing in the notebook\n",
"# consumed them, and a (samples, 40, 10000) float32 tensor costs several GB of RAM;\n",
"# the model consumes the integer token ids directly." ], "metadata": { "id": "8SPtbOxeNb8Q" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [
"class TokenAndPositionalEmbedding(layers.Layer):\n",
"    \"\"\"Sum of a scaled token embedding and a learned positional embedding.\n",
"\n",
"    Args:\n",
"        embedding_dims: size of every embedding vector.\n",
"        vocab_size: `input_dim` of the token embedding table.\n",
"        seq_len: `input_dim` of the positional embedding table.\n",
"        **kwargs: forwarded to `layers.Layer` (e.g. `name`).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, embedding_dims, vocab_size, seq_len, **kwargs):\n",
"        super().__init__(**kwargs)\n",
"\n",
"        # Constructor arguments, kept so get_config() can report them\n",
"        self.seq_len = seq_len\n",
"        self.vocab_size = vocab_size\n",
"        self.embedding_dims = embedding_dims\n",
"        # Token vectors are multiplied by sqrt(embedding_dims) in call()\n",
"        self.embed_scale = tf.math.sqrt(tf.cast(embedding_dims, tf.float32))\n",
"\n",
"        # Lookup table: token id -> vector\n",
"        self.token_embedding = layers.Embedding(\n",
"            input_dim=vocab_size,\n",
"            output_dim=embedding_dims,\n",
"            name=\"token_embedding\"\n",
"        )\n",
"\n",
"        # Lookup table: position index -> vector\n",
"        self.positional_embedding = layers.Embedding(\n",
"            input_dim=seq_len,\n",
"            output_dim=embedding_dims,\n",
"            name=\"positional_embedding\"\n",
"        )\n",
"\n",
"    def call(self, inputs):\n",
"        \"\"\"Embed a batch of token ids.\n",
"\n",
"        Args:\n",
"            inputs: tensor of token ids; axis 1 is read as the sequence axis.\n",
"\n",
"        Returns:\n",
"            Scaled token embeddings plus the positional embeddings for\n",
"            positions 0 .. (sequence length - 1).\n",
"        \"\"\"\n",
"        # Sequence length is taken from the input at call time\n",
"        seq_len = tf.shape(inputs)[1]\n",
"\n",
"        # Token Embedding\n",
"        token_embedding = self.token_embedding(inputs)\n",
"        token_embedding *= self.embed_scale\n",
"\n",
"        # Positional Embedding\n",
"        positions = tf.range(start=0, limit=seq_len, delta=1)\n",
"        positional_embedding = self.positional_embedding(positions)\n",
"\n",
"        # Add Token and Positional Embedding\n",
"        embeddings = token_embedding + positional_embedding\n",
"\n",
"        return embeddings\n",
"\n",
"    def get_config(self):\n",
"        \"\"\"Return the base layer config extended with this layer's constructor arguments.\"\"\"\n",
"        config = super().get_config()\n",
"        config.update({\n",
"            'embedding_dims': self.embedding_dims,\n",
"            'vocab_size': self.vocab_size,\n",
"            'seq_len': self.seq_len,\n",
"        })\n",
"        return config" ], "metadata": { "id": "pmkzVuqHNgpk" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ 
"temp_embeds = TokenAndPositionalEmbedding(embed_dim, vocab_size, max_seq_len)(X_train[:1])\n", "temp_embeds" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NhCPM7vYNrDS", "outputId": "e915f6ef-31c5-4603-f9a2-d9ac85d43b57" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 28 } ] }, { "cell_type": "code", "source": [
"class TransformerLayer(layers.Layer):\n",
"    \"\"\"One Transformer encoder block.\n",
"\n",
"    Self-attention and a two-layer feed-forward network, each wrapped in a\n",
"    residual connection followed by layer normalization.\n",
"\n",
"    Args:\n",
"        num_heads: number of attention heads.\n",
"        dropout_rate: dropout passed to the MultiHeadAttention layer.\n",
"        embedding_dims: used as the attention `key_dim` and as the output width\n",
"            of the feed-forward network.\n",
"        ff_dim: units of the first (ReLU) feed-forward Dense layer.\n",
"        **kwargs: forwarded to `layers.Layer` (e.g. `name`).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, num_heads: int, dropout_rate: float, embedding_dims: int, ff_dim: int, **kwargs):\n",
"        super().__init__(**kwargs)\n",
"\n",
"        # Hyperparameters, kept for get_config()\n",
"        self.num_heads = num_heads\n",
"        self.dropout_rate = dropout_rate\n",
"        self.embedding_dims = embedding_dims\n",
"        self.ff_dim = ff_dim\n",
"\n",
"        # Attention sub-block\n",
"        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dims, dropout=dropout_rate)\n",
"        self.ln1 = layers.LayerNormalization(epsilon=1e-6)\n",
"\n",
"        # Feed-forward sub-block\n",
"        self.ffn = keras.Sequential([\n",
"            layers.Dense(ff_dim, activation='relu', kernel_initializer='he_normal'),\n",
"            layers.Dense(embedding_dims)\n",
"        ])\n",
"        self.ln2 = layers.LayerNormalization(epsilon=1e-6)\n",
"\n",
"    def call(self, inputs):\n",
"        \"\"\"Run the encoder block on a batch of sequences.\n",
"\n",
"        Args:\n",
"            inputs: Tensor with shape `(batch_size, seq_len, embedding_dims)`.\n",
"\n",
"        Returns:\n",
"            Tensor with shape `(batch_size, seq_len, embedding_dims)`.\n",
"        \"\"\"\n",
"        # Self-attention: the sequence attends to itself\n",
"        attn_out = self.mha(inputs, inputs, inputs)\n",
"\n",
"        # Residual connection, then layer normalization\n",
"        attn_block = self.ln1(attn_out + inputs)\n",
"\n",
"        # Feed-forward network on the normalized attention output\n",
"        ffn_block = self.ffn(attn_block)\n",
"\n",
"        # Second residual connection + layer normalization\n",
"        return self.ln2(ffn_block + attn_block)\n",
"\n",
"    def get_config(self):\n",
"        \"\"\"Return the base layer config extended with this layer's constructor arguments.\"\"\"\n",
"        return {\n",
"            **super().get_config(),\n",
"            \"num_heads\": self.num_heads,\n",
"            \"dropout_rate\": self.dropout_rate,\n",
"            \"embedding_dims\": self.embedding_dims,\n",
"            \"ff_dim\": self.ff_dim,\n",
"        }" ], "metadata": { "id": "_Ou6n-atNvAr" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "TransformerLayer(num_heads=num_heads, embedding_dims=embed_dim, ff_dim=ff_dim, dropout_rate=0.1)(temp_embeds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7GM5yXkYN8Dz", "outputId": "331a8fae-abbb-439e-f4c4-9bc7d0de0814" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 30 } ] }, { "cell_type": "code", "source": [
"# Token ids in, one fixed-length sequence per message\n",
"InputLayer = layers.Input(shape=(max_seq_len,), name=\"InputLayer\")\n",
"\n",
"# Token + positional embeddings\n",
"embeddings = TokenAndPositionalEmbedding(embed_dim, vocab_size, max_seq_len, name=\"EmbeddingLayer\")(InputLayer)\n",
"\n",
"# Single Transformer encoder block\n",
"encodings = TransformerLayer(num_heads=num_heads, embedding_dims=embed_dim, ff_dim=ff_dim, dropout_rate=0.1, name=\"TransformerLayer\")(embeddings)\n",
"\n",
"# Classification head: average over the sequence axis, dropout, sigmoid unit\n",
"gap = layers.GlobalAveragePooling1D(name=\"GlobalAveragePooling\")(encodings)\n",
"drop = layers.Dropout(0.5, name=\"Dropout\")(gap)\n",
"OutputLayer = layers.Dense(1, activation='sigmoid', name=\"OutputLayer\")(drop)\n",
"\n",
"# Assemble the functional model\n",
"model = keras.Model(InputLayer, OutputLayer, name=\"TransformerNet\")\n",
"\n",
"# Model Architecture Summary\n",
"model.summary()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 391 }, "id": "8eYjV2h2OAJJ", "outputId": 
"510ecfdb-ab6e-4a45-e767-f22c24493e8b" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "\u001b[1mModel: \"TransformerNet\"\u001b[0m\n" ], "text/html": [ "
Model: \"TransformerNet\"\n",
              "
\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", "│ InputLayer (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m40\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ EmbeddingLayer │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m40\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m2,570,240\u001b[0m │\n", "│ (\u001b[38;5;33mTokenAndPositionalEmbedding\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ TransformerLayer │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m40\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,118,848\u001b[0m │\n", "│ (\u001b[38;5;33mTransformerLayer\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ GlobalAveragePooling │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n", "│ (\u001b[38;5;33mGlobalAveragePooling1D\u001b[0m) │ │ │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ Dropout (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ OutputLayer (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m257\u001b[0m │\n", "└─────────────────────────────────┴────────────────────────┴───────────────┘\n" ], 
"text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
              "┃ Layer (type)                     Output Shape                  Param # ┃\n",
              "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
              "│ InputLayer (InputLayer)         │ (None, 40)             │             0 │\n",
              "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
              "│ EmbeddingLayer                  │ (None, 40, 256)        │     2,570,240 │\n",
              "│ (TokenAndPositionalEmbedding)   │                        │               │\n",
              "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
              "│ TransformerLayer                │ (None, 40, 256)        │     1,118,848 │\n",
              "│ (TransformerLayer)              │                        │               │\n",
              "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
              "│ GlobalAveragePooling            │ (None, 256)            │             0 │\n",
              "│ (GlobalAveragePooling1D)        │                        │               │\n",
              "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
              "│ Dropout (Dropout)               │ (None, 256)            │             0 │\n",
              "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
              "│ OutputLayer (Dense)             │ (None, 1)              │           257 │\n",
              "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
              "
\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m3,689,345\u001b[0m (14.07 MB)\n" ], "text/html": [ "
 Total params: 3,689,345 (14.07 MB)\n",
              "
\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m3,689,345\u001b[0m (14.07 MB)\n" ], "text/html": [ "
 Trainable params: 3,689,345 (14.07 MB)\n",
              "
\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ], "text/html": [ "
 Non-trainable params: 0 (0.00 B)\n",
              "
\n" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "model.compile(\n", " loss='binary_crossentropy',\n", " optimizer='adam',\n", " metrics=[\n", " keras.metrics.BinaryAccuracy(name='accuracy'),\n", " keras.metrics.Precision(name='precision'),\n", " keras.metrics.Recall(name='recall'),\n", " keras.metrics.AUC(name='auc'),\n", " ]\n", ")\n", "\n", "# Train Model\n", "history = model.fit(\n", " X_train, y_train,\n", " validation_split=0.1,\n", " batch_size=batch_size,\n", " epochs=10,\n", " callbacks=callbacks,\n", " class_weight=class_weights\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oZYuIFJuOFuJ", "outputId": "4c17fa29-186a-4fba-bcb7-926b2293df03" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/10\n", "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m70s\u001b[0m 512ms/step - accuracy: 0.9943 - auc: 0.9993 - loss: 0.0244 - precision: 0.9618 - recall: 0.9944 - val_accuracy: 0.9888 - val_auc: 0.9832 - val_loss: 0.0627 - val_precision: 0.9848 - val_recall: 0.9420\n", "Epoch 2/10\n", "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m80s\u001b[0m 497ms/step - accuracy: 0.9985 - auc: 0.9997 - loss: 0.0080 - precision: 0.9882 - recall: 1.0000 - val_accuracy: 0.9865 - val_auc: 0.9779 - val_loss: 0.0860 - val_precision: 1.0000 - val_recall: 0.9130\n", "Epoch 3/10\n", "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m81s\u001b[0m 491ms/step - accuracy: 0.9995 - auc: 1.0000 - loss: 0.0026 - precision: 0.9961 - recall: 1.0000 - val_accuracy: 0.9933 - val_auc: 0.9777 - val_loss: 0.0711 - val_precision: 1.0000 - val_recall: 0.9565\n", "Epoch 4/10\n", "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m82s\u001b[0m 490ms/step - accuracy: 0.9997 - auc: 1.0000 - loss: 0.0015 - precision: 0.9980 - 
recall: 1.0000 - val_accuracy: 0.9865 - val_auc: 0.9777 - val_loss: 0.0906 - val_precision: 1.0000 - val_recall: 0.9130\n" ] } ] }, { "cell_type": "code", "source": [
"# One panel per metric: (key in history.history, title / y-axis label),\n",
"# listed in row-major order of the 2x2 grid.\n",
"metric_panels = [\n",
"    ('loss', 'Loss'),\n",
"    ('accuracy', 'Accuracy'),\n",
"    ('precision', 'Precision'),\n",
"    ('recall', 'Recall'),\n",
"]\n",
"\n",
"fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 8))\n",
"plt.subplots_adjust(hspace=0.5)\n",
"\n",
"# Draw the training curve and its 'val_'-prefixed validation counterpart on each panel\n",
"for ax, (metric_key, metric_title) in zip(axes.flat, metric_panels):\n",
"    ax.plot(history.history[metric_key], label=f'Training {metric_title}')\n",
"    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_title}')\n",
"    ax.set_title(metric_title, fontsize=14)\n",
"    ax.set_xlabel('Epoch', fontsize=12)\n",
"    ax.set_ylabel(metric_title, fontsize=12)\n",
"    ax.grid(True)\n",
"    ax.legend(fontsize=10)\n",
"\n",
"fig.suptitle('Model Performance Metrics', fontsize=16, y=1.05)\n",
"plt.show()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 630 }, "id": "QAiJS9mNOLIT", "outputId": 
"45dc8a9d-ef86-4363-8457-e52c8616b6a9" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "loss, acc, precision, recall, auc = model.evaluate(X_test, y_test, verbose=0)\n", "\n", "# Show the model performance\n", "print('Test loss :', loss)\n", "print('Test accuracy :', acc*100)\n", "print('Test precision :', precision*100)\n", "print('Test recall :', recall*100)\n", "print('Test AUC :', auc*100)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "L08GUPU3RQK0", "outputId": "e60f034e-6f00-499e-aaae-71c2c9d704bb" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Test loss : 0.07856446504592896\n", "Test accuracy : 97.57847785949707\n", "Test precision : 87.65432238578796\n", "Test recall : 95.30201554298401\n", "Test AUC : 98.94327521324158\n" ] } ] }, { "cell_type": "code", "source": [
"def decode_tokens(tokens, vocab=None):\n",
"    \"\"\"\n",
"    Turn a sequence of token ids back into text.\n",
"\n",
"    Args:\n",
"    - tokens: An iterable of integer token ids (e.g. one row of the vectorized data).\n",
"    - vocab: Optional list of words indexed by token id. Defaults to the global VOCAB\n",
"      obtained from text_vectorizer.get_vocabulary().\n",
"\n",
"    Returns:\n",
"    - text: The looked-up words joined with spaces, with leading/trailing whitespace stripped.\n",
"    \"\"\"\n",
"    if vocab is None:\n",
"        vocab = VOCAB\n",
"    # NOTE(review): padding ids presumably map to the empty string, so trailing padding\n",
"    # only adds whitespace that strip() removes - confirm against VOCAB[0].\n",
"    text = \" \".join(vocab[int(token)] for token in tokens).strip()\n",
"    return text" ], "metadata": { "id": "2S6Iaw7BRjui" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [
"for _ in range(10):\n",
"    # Randomly select a text from the testing data.\n",
"    index = np.random.randint(len(X_test))\n",
"    # Slice (rather than index) to keep the batch axis for model.predict, and read the\n",
"    # label at the SAME position so the message and its true class belong together.\n",
"    tokens = X_test[index:index + 1]\n",
"    label = y_test[index]\n",
"\n",
"    # Feed the tokens to the model\n",
"    print(f\"\\nModel Prediction\\n{'-'*100}\")\n",
"    # Threshold the sigmoid output at 0.5 to get the predicted class id\n",
"    predicted_id = 1 if model.predict(tokens, verbose=0)[0][0] > 0.5 else 0\n",
"    pred = label_encoder.inverse_transform([predicted_id])\n",
"    print(f\"Message: '{decode_tokens(tokens[0])}' | Prediction: {pred[0].title()} | True : 
{label_encoder.inverse_transform([label])[0].title()}\\n\")" ], "metadata": { "id": "Oc7GR9-GRvpI", "outputId": "0459a1fd-16e7-40d4-8f28-cadd4e502ba4", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'well thats nice too bad i cant eat it' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'so check your errors and if you had difficulties do correction' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'in which place do you want da' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'carlos is down but i have to pick it up from him so i ll swing by usf in a little bit' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'how much did ur hdd casing cost' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'its like that hotel dusk game i think you solve puzzles in a area thing' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'dare i ask any luck with sorting out the car' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model 
Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'i m home doc gave me pain meds says everything is fine' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'you could have seen me i did t recognise you face' | Prediction: Ham | True : Ham\n", "\n", "\n", "Model Prediction\n", "----------------------------------------------------------------------------------------------------\n", "Message: 'they said Ì dun haf passport or smth like dat or Ì juz send to my email account' | Prediction: Ham | True : Ham\n", "\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "73y-E0d_Rz-G" }, "execution_count": null, "outputs": [] } ] }