{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text task notebook template\n",
"## Loading the necessary libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-01-29 12:18:59.954133: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'quote': 'Interesting to note that Oklahoma minimum temperatures in 2011 were in the bottom ten, including the coldest Oklahoma temperature ever recorded, -31F on February 10, 2011.', 'label': '0_not_relevant', 'source': 'FLICC', 'url': 'https://huggingface.co/datasets/fzanartu/FLICCdataset', 'language': 'en', 'subsource': 'CARDS', 'id': None, '__index_level_0__': 1109}\n"
]
},
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['quote', 'label', 'source', 'url', 'language', 'subsource', 'id', '__index_level_0__'],\n",
" num_rows: 4872\n",
" })\n",
" test: Dataset({\n",
" features: ['quote', 'label', 'source', 'url', 'language', 'subsource', 'id', '__index_level_0__'],\n",
" num_rows: 1219\n",
" })\n",
"})"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from codecarbon import EmissionsTracker\n",
"import huggingface_hub\n",
"from fastapi import APIRouter\n",
"from datetime import datetime\n",
"from datasets import load_dataset\n",
"from sklearn.metrics import accuracy_score\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"from sklearn.model_selection import train_test_split\n",
"import tensorflow as tf\n",
"from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline\n",
"from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D\n",
"\n",
"\n",
"import sys\n",
"sys.path.append('../tasks')\n",
"\n",
"#from utils.evaluation import TextEvaluationRequest\n",
"#from utils.emissions import tracker, clean_emissions_data, get_space_info\n",
"\n",
"dataset = load_dataset(\"quotaclimat/frugalaichallenge-text-train\")\n",
"print(next(iter(dataset['train'])))\n",
" # Convert string labels to integers\n",
"LABEL_MAPPING = {\n",
" \"0_not_relevant\": 0,\n",
" \"1_not_happening\": 1,\n",
" \"2_not_human\": 2,\n",
" \"3_not_bad\": 3,\n",
" \"4_solutions_harmful_unnecessary\": 4,\n",
" \"5_science_unreliable\": 5,\n",
" \"6_proponents_biased\": 6,\n",
" \"7_fossil_fuels_needed\": 7\n",
" }\n",
"dataset = dataset.map(lambda x: {\"label\": LABEL_MAPPING[x[\"label\"]]})\n",
"dataset\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading the datasets and splitting them"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#request = TextEvaluationRequest()\n",
"\n",
"# Load and prepare the dataset\n",
"#dataset = load_dataset(request.dataset_name)\n",
"\n",
"# Convert string labels to integers\n",
"#dataset = dataset.map(lambda x: {\"label\": LABEL_MAPPING[x[\"label\"]]})\n",
"\n",
"# Split dataset\n",
"train_test = dataset[\"train\"].train_test_split(test_size=.2, #request.test_size, \n",
" seed=42 )#request.test_seed)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_dataset = train_test[\"train\"]\n",
"test_dataset = train_test[\"test\"]\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /Users/laureberti/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /Users/laureberti/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" quote | \n",
" clean_text | \n",
" length_clean_text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Americans for Tax Reform opposes a carbon tax ... | \n",
" american tax reform oppose carbon tax work tir... | \n",
" 79 | \n",
"
\n",
" \n",
" 1 | \n",
" More than 100 climate models over the past 30 ... | \n",
" 100 climate model past 30 year predict actuall... | \n",
" 152 | \n",
"
\n",
" \n",
" 2 | \n",
" As an oil and gas operator who has been in the... | \n",
" oil gas operator ha industry 30 year im fortun... | \n",
" 362 | \n",
"
\n",
" \n",
" 3 | \n",
" Climate has always changed, there've been many... | \n",
" climate ha always change thereve many extincti... | \n",
" 141 | \n",
"
\n",
" \n",
" 4 | \n",
" People have made a mistake. They’ve started to... | \n",
" people make mistake theyve start believe human... | \n",
" 118 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" quote \\\n",
"0 Americans for Tax Reform opposes a carbon tax ... \n",
"1 More than 100 climate models over the past 30 ... \n",
"2 As an oil and gas operator who has been in the... \n",
"3 Climate has always changed, there've been many... \n",
"4 People have made a mistake. They’ve started to... \n",
"\n",
" clean_text length_clean_text \n",
"0 american tax reform oppose carbon tax work tir... 79 \n",
"1 100 climate model past 30 year predict actuall... 152 \n",
"2 oil gas operator ha industry 30 year im fortun... 362 \n",
"3 climate ha always change thereve many extincti... 141 \n",
"4 people make mistake theyve start believe human... 118 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"import re\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.corpus import stopwords\n",
"\n",
"stop_words = set(stopwords.words(\"english\")) \n",
"lemmatizer = WordNetLemmatizer()\n",
"\n",
"\n",
"def clean_text(text):\n",
" text = re.sub(r'[^\\w\\s]','',text, re.UNICODE)\n",
" text = text.lower()\n",
" text = [lemmatizer.lemmatize(token) for token in text.split(\" \")]\n",
" text = [lemmatizer.lemmatize(token, \"v\") for token in text]\n",
" text = [word for word in text if not word in stop_words]\n",
" text = \" \".join(text)\n",
" return text\n",
"\n",
"train_df= pd.DataFrame(train_dataset[\"quote\"], columns=['quote']) \n",
"train_df['clean_text'] = train_df.map(clean_text) \n",
"train_df['length_clean_text'] = train_df['clean_text'].map(len)\n",
"\n",
"train_df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" quote | \n",
" clean_text | \n",
" length_clean_text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" The term climate change was hijacked by “progr... | \n",
" term climate change wa hijack progressive term... | \n",
" 76 | \n",
"
\n",
" \n",
" 1 | \n",
" Climate change is a scam.Banks and Home Owner'... | \n",
" climate change scambanks home owner insurance ... | \n",
" 82 | \n",
"
\n",
" \n",
" 2 | \n",
" Against the half-trillion in benefits you can ... | \n",
" halftrillion benefit weigh global warm impact ... | \n",
" 337 | \n",
"
\n",
" \n",
" 3 | \n",
" Do you agree with the vast majority of climate... | \n",
" agree vast majority climate scientist climate ... | \n",
" 59 | \n",
"
\n",
" \n",
" 4 | \n",
" Global warming and climate change, even if it ... | \n",
" global warm climate change even 100 cause huma... | \n",
" 165 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" quote \\\n",
"0 The term climate change was hijacked by “progr... \n",
"1 Climate change is a scam.Banks and Home Owner'... \n",
"2 Against the half-trillion in benefits you can ... \n",
"3 Do you agree with the vast majority of climate... \n",
"4 Global warming and climate change, even if it ... \n",
"\n",
" clean_text length_clean_text \n",
"0 term climate change wa hijack progressive term... 76 \n",
"1 climate change scambanks home owner insurance ... 82 \n",
"2 halftrillion benefit weigh global warm impact ... 337 \n",
"3 agree vast majority climate scientist climate ... 59 \n",
"4 global warm climate change even 100 cause huma... 165 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df= pd.DataFrame(test_dataset[\"quote\"], columns=['quote']) \n",
"test_df['clean_text'] = test_df.map(clean_text) \n",
"test_df['length_clean_text'] = test_df['clean_text'].map(len)\n",
"\n",
"test_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"27.92250449063382"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df['clean_text'].apply(lambda x: len(x.split(\" \"))).mean()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"27.25948717948718"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df['clean_text'].apply(lambda x: len(x.split(\" \"))).mean()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import tensorflow.keras as keras\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten\n",
"from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D\n",
"from tensorflow.keras.models import Model, Sequential\n",
"from tensorflow.keras.layers import Convolution1D\n",
"from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers\n",
"\n",
"\n",
"MAX_FEATURES = 6000\n",
"EMBED_SIZE = 28\n",
"tokenizer = Tokenizer(num_words=MAX_FEATURES)\n",
"tokenizer.fit_on_texts(train_df['clean_text'])\n",
"list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])\n",
"\n",
"RNN_CELL_SIZE = 32\n",
"\n",
"MAX_LEN = 30 \n",
"\n",
"X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"true_labels = test_dataset[\"label\"]\n",
"y_train = train_dataset[\"label\"]\n",
"y_test = test_dataset[\"label\"]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"class Attention(tf.keras.Model):\n",
" def __init__(self, units):\n",
" super(Attention, self).__init__()\n",
" self.W1 = tf.keras.layers.Dense(units)\n",
" self.W2 = tf.keras.layers.Dense(units)\n",
" self.V = tf.keras.layers.Dense(1)\n",
" \n",
" def call(self, features, hidden):\n",
" # hidden shape == (batch_size, hidden size)\n",
" # hidden_with_time_axis shape == (batch_size, 1, hidden size)\n",
" # we are doing this to perform addition to calculate the score\n",
" hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
"\n",
" # score shape == (batch_size, max_length, 1)\n",
" # we get 1 at the last axis because we are applying score to self.V\n",
" # the shape of the tensor before applying self.V is (batch_size, max_length, units)\n",
" score = tf.nn.tanh(\n",
" self.W1(features) + self.W2(hidden_with_time_axis))\n",
" \n",
" # attention_weights shape == (batch_size, max_length, 1)\n",
" attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
"\n",
" # context_vector shape after sum == (batch_size, hidden_size)\n",
" context_vector = attention_weights * features\n",
" context_vector = tf.reduce_sum(context_vector, axis=1)\n",
" \n",
" return context_vector, attention_weights"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"sequence_input = Input(shape=(MAX_LEN,), dtype=\"int32\")\n",
"embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name=\"bi_lstm_0\")(embedded_sequences)\n",
"\n",
"# Getting our LSTM outputs\n",
"(lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name=\"bi_lstm_1\")(lstm)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"state_h = Concatenate()([forward_h, backward_h])\n",
"state_c = Concatenate()([forward_c, backward_c])\n",
"\n",
"context_vector, attention_weights = Attention(10)(lstm, state_h)\n",
"\n",
"# Removal of the globalMaxPool1D could be trouble\n",
"#globmax = GlobalMaxPool1D()(context_vector)\n",
"dense1 = Dense(20, activation=\"relu\")(context_vector)\n",
"dropout = Dropout(0.05)(dense1)\n",
"output = Dense(8, activation=\"sigmoid\")(dropout)\n",
"\n",
"model = keras.Model(inputs=sequence_input, outputs=output)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Model: \"functional_1\"\n",
"
\n"
],
"text/plain": [
"\u001b[1mModel: \"functional_1\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ input_layer_1 │ (None, 30) │ 0 │ - │\n",
"│ (InputLayer) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_1 │ (None, 30, 28) │ 168,000 │ input_layer_1[0]… │\n",
"│ (Embedding) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bi_lstm_0 │ (None, 30, 64) │ 15,616 │ embedding_1[0][0] │\n",
"│ (Bidirectional) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bi_lstm_1 │ [(None, 30, 64), │ 24,832 │ bi_lstm_0[0][0] │\n",
"│ (Bidirectional) │ (None, 32), │ │ │\n",
"│ │ (None, 32), │ │ │\n",
"│ │ (None, 32), │ │ │\n",
"│ │ (None, 32)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate_2 │ (None, 64) │ 0 │ bi_lstm_1[0][1], │\n",
"│ (Concatenate) │ │ │ bi_lstm_1[0][3] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ attention_1 │ [(None, 64), │ 1,311 │ bi_lstm_1[0][0], │\n",
"│ (Attention) │ (None, 30, 1)] │ │ concatenate_2[0]… │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_8 (Dense) │ (None, 20) │ 1,300 │ attention_1[0][0] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dropout_1 (Dropout) │ (None, 20) │ 0 │ dense_8[0][0] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_9 (Dense) │ (None, 8) │ 168 │ dropout_1[0][0] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
"
\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ input_layer_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m30\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m30\u001b[0m, \u001b[38;5;34m28\u001b[0m) │ \u001b[38;5;34m168,000\u001b[0m │ input_layer_1[\u001b[38;5;34m0\u001b[0m]… │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bi_lstm_0 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m30\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m15,616\u001b[0m │ embedding_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bi_lstm_1 │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m30\u001b[0m, \u001b[38;5;34m64\u001b[0m), │ \u001b[38;5;34m24,832\u001b[0m │ bi_lstm_0[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mBidirectional\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m32\u001b[0m), │ │ │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m32\u001b[0m), │ │ │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m32\u001b[0m), │ │ │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m32\u001b[0m)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate_2 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ bi_lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m1\u001b[0m], │\n",
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ bi_lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m3\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ attention_1 │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m), │ \u001b[38;5;34m1,311\u001b[0m │ bi_lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ (\u001b[38;5;33mAttention\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m30\u001b[0m, \u001b[38;5;34m1\u001b[0m)] │ │ concatenate_2[\u001b[38;5;34m0\u001b[0m]… │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_8 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m20\u001b[0m) │ \u001b[38;5;34m1,300\u001b[0m │ attention_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dropout_1 (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m20\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ dense_8[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_9 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m168\u001b[0m │ dropout_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 211,227 (825.11 KB)\n",
"
\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m211,227\u001b[0m (825.11 KB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 211,227 (825.11 KB)\n",
"
\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m211,227\u001b[0m (825.11 KB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 0 (0.00 B)\n",
"
\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n"
]
}
],
"source": [
"# summarize layers\n",
"print(model.summary())"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"from keras.callbacks import EarlyStopping\n",
"from keras import backend \n",
"\n",
"es = EarlyStopping(monitor='accuracy', mode='min', verbose=1, patience=5)\n",
"model.compile(loss='SparseCategoricalCrossentropy', optimizer='adam', metrics=['accuracy'])\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import numpy as np\n",
"\n",
"X_train_np = np.array(X_train)\n",
"y_train_np = np.array(y_train)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 39ms/step - accuracy: 0.7935 - loss: 0.6349\n",
"Epoch 2/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 38ms/step - accuracy: 0.8229 - loss: 0.5661\n",
"Epoch 3/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 44ms/step - accuracy: 0.8691 - loss: 0.4346\n",
"Epoch 4/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 39ms/step - accuracy: 0.8974 - loss: 0.3836\n",
"Epoch 5/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 52ms/step - accuracy: 0.9059 - loss: 0.3363\n",
"Epoch 6/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 55ms/step - accuracy: 0.9146 - loss: 0.2993\n",
"Epoch 7/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 54ms/step - accuracy: 0.9364 - loss: 0.2439\n",
"Epoch 8/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 48ms/step - accuracy: 0.9365 - loss: 0.2423\n",
"Epoch 9/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 40ms/step - accuracy: 0.9464 - loss: 0.1978\n",
"Epoch 10/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 39ms/step - accuracy: 0.9516 - loss: 0.1880\n",
"Epoch 11/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 49ms/step - accuracy: 0.9478 - loss: 0.1854\n",
"Epoch 12/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 59ms/step - accuracy: 0.9545 - loss: 0.1586\n",
"Epoch 13/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 59ms/step - accuracy: 0.9563 - loss: 0.1485\n",
"Epoch 14/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 61ms/step - accuracy: 0.9598 - loss: 0.1378\n",
"Epoch 15/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 52ms/step - accuracy: 0.9575 - loss: 0.1429\n",
"Epoch 16/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 60ms/step - accuracy: 0.9576 - loss: 0.1285\n",
"Epoch 17/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 53ms/step - accuracy: 0.9585 - loss: 0.1384\n",
"Epoch 18/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 45ms/step - accuracy: 0.9597 - loss: 0.1333\n",
"Epoch 19/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 51ms/step - accuracy: 0.9671 - loss: 0.1189\n",
"Epoch 20/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 52ms/step - accuracy: 0.9709 - loss: 0.1102\n",
"Epoch 21/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 58ms/step - accuracy: 0.9691 - loss: 0.1136\n",
"Epoch 22/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 59ms/step - accuracy: 0.9774 - loss: 0.0918\n",
"Epoch 23/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 63ms/step - accuracy: 0.9777 - loss: 0.0876\n",
"Epoch 24/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 59ms/step - accuracy: 0.9841 - loss: 0.0615\n",
"Epoch 25/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 43ms/step - accuracy: 0.9781 - loss: 0.0804\n",
"Epoch 26/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 43ms/step - accuracy: 0.9724 - loss: 0.0936\n",
"Epoch 27/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 42ms/step - accuracy: 0.9711 - loss: 0.1026\n",
"Epoch 28/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 44ms/step - accuracy: 0.9728 - loss: 0.0933\n",
"Epoch 29/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 49ms/step - accuracy: 0.9771 - loss: 0.0772\n",
"Epoch 30/30\n",
"\u001b[1m39/39\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 55ms/step - accuracy: 0.9771 - loss: 0.0940\n"
]
}
],
"source": [
"BATCH_SIZE = 100\n",
"EPOCHS = 30\n",
"history = model.fit(X_train_np,y_train_np, shuffle=True,\n",
" batch_size=BATCH_SIZE, verbose=1,\n",
" epochs=EPOCHS)#, callbacks=[es])"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"def classifier(input_text,candidate_labels):\n",
" #PREPROCESS THE INPUT TEXT\n",
" input_text_cleaned = clean_text(input_text)\n",
" input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])\n",
" input_padded = pad_sequences(input_sequence, maxlen = MAX_LEN, padding = 'post')\n",
" #PREDICTION\n",
" prediction = np.ravel(model.predict(input_padded))\n",
" return {'sequence': input_text,'labels': candidate_labels,'scores': list(prediction)}\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"candidate_labels = [\n",
" \"Not related to climate change disinformation\",\n",
" \"Climate change is not real and not happening\",\n",
" \"Climate change is not human-induced\",\n",
" \"Climate change impacts are not that bad\",\n",
" \"Climate change solutions are harmful and unnecessary\",\n",
" \"Climate change science is unreliable\",\n",
" \"Climate change proponents are biased\",\n",
" \"Fossil fuels are needed to address climate change\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[6, 6, 4, 0, 5, 5, 2, 4, 1, 0]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"true_labels[:10]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Start tracking emissions\n",
"tracker.start()\n",
"tracker.start_task(\"inference\")\n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"\n",
"from tqdm.auto import tqdm\n",
"predictions = []\n",
"\n",
"for i, text in tqdm(enumerate(test_dataset[\"quote\"])):\n",
"\n",
" result = classifier(text, candidate_labels)\n",
"\n",
" # Get index of highest scoring label\n",
"\n",
" pred_label = candidate_labels.index(result[\"labels\"][0])\n",
"\n",
" predictions.append(pred_label)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stop tracking emissions\n",
"emissions_data = tracker.stop_task()\n",
"emissions_data"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.27"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Calculate accuracy\n",
"accuracy = accuracy_score(true_labels[:100], predictions[:100])\n",
"accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare results dictionary\n",
"results = {\n",
" \"submission_timestamp\": datetime.now().isoformat(),\n",
" \"accuracy\": float(accuracy),\n",
" \"energy_consumed_wh\": emissions_data.energy_consumed * 1000,\n",
" \"emissions_gco2eq\": emissions_data.emissions * 1000,\n",
" \"emissions_data\": clean_emissions_data(emissions_data),\n",
" \"dataset_config\": {\n",
" \"dataset_name\": request.dataset_name,\n",
" \"test_size\": request.test_size,\n",
" \"test_seed\": request.test_seed\n",
" }\n",
"}\n",
"\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}