# Text task notebook template
## Loading the necessary libraries

In [1]:
from codecarbon import EmissionsTracker
import huggingface_hub
from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D


import sys
sys.path.append('../tasks')

#from utils.evaluation import TextEvaluationRequest
#from utils.emissions import tracker, clean_emissions_data, get_space_info

dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")
print(next(iter(dataset['train'])))
    # Convert string labels to integers
LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7
    }
dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
dataset


2025-01-29 12:18:59.954133: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'quote': 'Interesting to note that Oklahoma minimum temperatures in 2011 were in the bottom ten, including the coldest Oklahoma temperature ever recorded, -31F on February 10, 2011.', 'label': '0_not_relevant', 'source': 'FLICC', 'url': 'https://huggingface.co/datasets/fzanartu/FLICCdataset', 'language': 'en', 'subsource': 'CARDS', 'id': None, '__index_level_0__': 1109}


DatasetDict({
    train: Dataset({
        features: ['quote', 'label', 'source', 'url', 'language', 'subsource', 'id', '__index_level_0__'],
        num_rows: 4872
    })
    test: Dataset({
        features: ['quote', 'label', 'source', 'url', 'language', 'subsource', 'id', '__index_level_0__'],
        num_rows: 1219
    })
})

## Loading the datasets and splitting them

In [2]:
#request = TextEvaluationRequest()

# Load and prepare the dataset
#dataset = load_dataset(request.dataset_name)

# Convert string labels to integers
#dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

# Split dataset
train_test = dataset["train"].train_test_split(test_size=.2, #request.test_size, 
                                               seed=42 )#request.test_seed)


In [3]:
train_dataset = train_test["train"]
test_dataset = train_test["test"]


In [4]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

train_df= pd.DataFrame(train_dataset["quote"], columns=['quote'])    
train_df['clean_text'] = train_df.map(clean_text) 
train_df['length_clean_text'] = train_df['clean_text'].map(len)

train_df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/laureberti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/laureberti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,quote,clean_text,length_clean_text
0,Americans for Tax Reform opposes a carbon tax ...,american tax reform oppose carbon tax work tir...,79
1,More than 100 climate models over the past 30 ...,100 climate model past 30 year predict actuall...,152
2,As an oil and gas operator who has been in the...,oil gas operator ha industry 30 year im fortun...,362
3,"Climate has always changed, there've been many...",climate ha always change thereve many extincti...,141
4,People have made a mistake. They’ve started to...,people make mistake theyve start believe human...,118


In [5]:
test_df= pd.DataFrame(test_dataset["quote"], columns=['quote'])    
test_df['clean_text'] = test_df.map(clean_text) 
test_df['length_clean_text'] = test_df['clean_text'].map(len)

test_df.head()

Unnamed: 0,quote,clean_text,length_clean_text
0,The term climate change was hijacked by “progr...,term climate change wa hijack progressive term...,76
1,Climate change is a scam.Banks and Home Owner'...,climate change scambanks home owner insurance ...,82
2,Against the half-trillion in benefits you can ...,halftrillion benefit weigh global warm impact ...,337
3,Do you agree with the vast majority of climate...,agree vast majority climate scientist climate ...,59
4,"Global warming and climate change, even if it ...",global warm climate change even 100 cause huma...,165


In [6]:
train_df['clean_text'].apply(lambda x: len(x.split(" "))).mean()

27.92250449063382

In [7]:
test_df['clean_text'].apply(lambda x: len(x.split(" "))).mean()

27.25948717948718

In [32]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Convolution1D
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers


MAX_FEATURES = 6000
EMBED_SIZE = 28
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_df['clean_text'])
list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])

RNN_CELL_SIZE = 32

MAX_LEN = 30   

X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)


In [33]:
true_labels = test_dataset["label"]
y_train = train_dataset["label"]
y_test = test_dataset["label"]

In [34]:
class Attention(tf.keras.Model):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
 
    def call(self, features, hidden):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = tf.nn.tanh(
            self.W1(features) + self.W2(hidden_with_time_axis))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
 
        return context_vector, attention_weights

In [35]:
sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)

In [36]:
lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)

# Getting our LSTM outputs
(lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)

In [37]:
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

context_vector, attention_weights = Attention(10)(lstm, state_h)

# Removal of the globalMaxPool1D could be trouble
#globmax = GlobalMaxPool1D()(context_vector)
dense1 = Dense(20, activation="relu")(context_vector)
dropout = Dropout(0.05)(dense1)
output = Dense(8, activation="sigmoid")(dropout)

model = keras.Model(inputs=sequence_input, outputs=output)

In [38]:
# summarize layers
print(model.summary())

None


In [39]:
from keras.callbacks import EarlyStopping
from keras import backend 

es = EarlyStopping(monitor='accuracy', mode='min', verbose=1, patience=5)
model.compile(loss='SparseCategoricalCrossentropy', optimizer='adam', metrics=['accuracy'])


In [40]:

import numpy as np

X_train_np = np.array(X_train)
y_train_np = np.array(y_train)

In [42]:
BATCH_SIZE = 100
EPOCHS = 30
history = model.fit(X_train_np,y_train_np, shuffle=True,
                    batch_size=BATCH_SIZE, verbose=1,
                    epochs=EPOCHS)#, callbacks=[es])

Epoch 1/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.7935 - loss: 0.6349
Epoch 2/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.8229 - loss: 0.5661
Epoch 3/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.8691 - loss: 0.4346
Epoch 4/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.8974 - loss: 0.3836
Epoch 5/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9059 - loss: 0.3363
Epoch 6/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.9146 - loss: 0.2993
Epoch 7/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.9364 - loss: 0.2439
Epoch 8/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step - accuracy: 0.9365 - loss: 0.2423
Epoch 9/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━

In [43]:
def classifier(input_text,candidate_labels):
    #PREPROCESS THE INPUT TEXT
    input_text_cleaned = clean_text(input_text)
    input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
    input_padded = pad_sequences(input_sequence, maxlen = MAX_LEN, padding = 'post')
    #PREDICTION
    prediction = np.ravel(model.predict(input_padded))
    return {'sequence': input_text,'labels': candidate_labels,'scores': list(prediction)}


In [44]:
candidate_labels = [
    "Not related to climate change disinformation",
    "Climate change is not real and not happening",
    "Climate change is not human-induced",
    "Climate change impacts are not that bad",
    "Climate change solutions are harmful and unnecessary",
    "Climate change science is unreliable",
    "Climate change proponents are biased",
    "Fossil fuels are needed to address climate change"
]

In [48]:
true_labels[:10]

[6, 6, 4, 0, 5, 5, 2, 4, 1, 0]

In [49]:
predictions[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Start tracking emissions
tracker.start()
tracker.start_task("inference")


In [46]:
%%capture

from tqdm.auto import tqdm
predictions = []

for i, text in tqdm(enumerate(test_dataset["quote"])):

    result = classifier(text, candidate_labels)

    # Get index of highest scoring label

    pred_label = candidate_labels.index(result["labels"][0])

    predictions.append(pred_label)


In [None]:
# Stop tracking emissions
emissions_data = tracker.stop_task()
emissions_data

In [47]:
# Calculate accuracy
accuracy = accuracy_score(true_labels[:100], predictions[:100])
accuracy

0.27

In [None]:
# Prepare results dictionary
results = {
    "submission_timestamp": datetime.now().isoformat(),
    "accuracy": float(accuracy),
    "energy_consumed_wh": emissions_data.energy_consumed * 1000,
    "emissions_gco2eq": emissions_data.emissions * 1000,
    "emissions_data": clean_emissions_data(emissions_data),
    "dataset_config": {
        "dataset_name": request.dataset_name,
        "test_size": request.test_size,
        "test_seed": request.test_seed
    }
}

results