from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from tqdm import tqdm

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Bidirectional,
    Concatenate,
    Dense,
    Dropout,
    Embedding,
    Input,
    LSTM,
)

from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info

router = APIRouter()

DESCRIPTION = "Attention BiLSTM classification"
ROUTE = "/text"


@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Evaluate text classification for climate disinformation detection.

    Current model: a bidirectional LSTM with an additive attention layer,
    trained from scratch on the request's train split and predicting over
    the label space (0-7).
    """
    # Get space info
    username, space_url = get_space_info()

    # Define the label mapping
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7,
    }

    # Load and prepare the dataset
    dataset = load_dataset(request.dataset_name)

    # Convert string labels to integers
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})

    # Split dataset
    train_test = dataset["train"].train_test_split(
        test_size=request.test_size, seed=request.test_seed
    )
    train_dataset = train_test["train"]
    test_dataset = train_test["test"]

    # Text preprocessing: strip punctuation, lowercase, lemmatize, drop stopwords
    import nltk

    nltk.download("stopwords", quiet=True)
    nltk.download("wordnet", quiet=True)
    import re
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        # The fourth positional argument of re.sub is `count`, so flags must be
        # passed by keyword.
        text = re.sub(r"[^\w\s]", "", text, flags=re.UNICODE)
        text = text.lower()
        text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
        text = [lemmatizer.lemmatize(token, "v") for token in text]
        text = [word for word in text if word not in stop_words]
        return " ".join(text)

    train_df = pd.DataFrame(train_dataset["quote"], columns=["quote"])
    # Apply the cleaning function to the quote column (Series.map), not to the
    # whole DataFrame.
    train_df["clean_text"] = train_df["quote"].map(clean_text)
    train_df["length_clean_text"] = train_df["clean_text"].map(len)

    # Tokenization and model hyperparameters
    MAX_FEATURES = 6000
    EMBED_SIZE = 28
    RNN_CELL_SIZE = 32
    MAX_LEN = 30
    BATCH_SIZE = 100
    EPOCHS = 30

    tokenizer = Tokenizer(num_words=MAX_FEATURES)
    tokenizer.fit_on_texts(train_df["clean_text"])
    list_tokenized_train = tokenizer.texts_to_sequences(train_df["clean_text"])
    X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)

    true_labels = test_dataset["label"]
    y_train = train_dataset["label"]
    X_train_np = np.array(X_train)
    y_train_np = np.array(y_train)

    # Additive (Bahdanau-style) attention layer
    class Attention(tf.keras.Model):
        def __init__(self, units):
            super(Attention, self).__init__()
            self.W1 = tf.keras.layers.Dense(units)
            self.W2 = tf.keras.layers.Dense(units)
            self.V = tf.keras.layers.Dense(1)
        def call(self, features, hidden):
            # hidden shape == (batch_size, hidden size)
            # hidden_with_time_axis shape == (batch_size, 1, hidden size)
            # we are doing this to perform addition to calculate the score
            hidden_with_time_axis = tf.expand_dims(hidden, 1)

            # score shape == (batch_size, max_length, 1)
            # we get 1 at the last axis because we are applying score to self.V
            # the shape of the tensor before applying self.V is (batch_size, max_length, units)
            score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

            # attention_weights shape == (batch_size, max_length, 1)
            attention_weights = tf.nn.softmax(self.V(score), axis=1)

            # context_vector shape after sum == (batch_size, hidden_size)
            context_vector = attention_weights * features
            context_vector = tf.reduce_sum(context_vector, axis=1)

            return context_vector, attention_weights

    # Model: embedding -> stacked BiLSTMs -> attention -> dense head
    sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
    embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
    lstm = Bidirectional(
        LSTM(RNN_CELL_SIZE, return_sequences=True), name="bi_lstm_0"
    )(embedded_sequences)
    (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(
        LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True),
        name="bi_lstm_1",
    )(lstm)
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    context_vector, attention_weights = Attention(10)(lstm, state_h)
    dense1 = Dense(20, activation="relu")(context_vector)
    dropout = Dropout(0.05)(dense1)
    # Single-label, 8-class problem: softmax output paired with sparse
    # categorical cross-entropy (the original sigmoid output was inconsistent
    # with that loss).
    output = Dense(8, activation="softmax")(dropout)

    model = keras.Model(inputs=sequence_input, outputs=output)

    # Compile and train before emissions tracking starts, so that only the
    # inference phase is measured under the "inference" task.
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=optimizer,
        metrics=["accuracy"],
    )
    model.fit(
        X_train_np,
        y_train_np,
        shuffle=False,
        batch_size=BATCH_SIZE,
        verbose=1,
        epochs=EPOCHS,
    )

    # Start tracking emissions for inference
    tracker.start()
    tracker.start_task("inference")

    # Human-readable label names, in the same order as the integer label space
    candidate_labels = [
        "Not related to climate change disinformation",
        "Climate change is not real and not happening",
        "Climate change is not human-induced",
        "Climate change impacts are not that bad",
        "Climate change solutions are harmful and unnecessary",
        "Climate change science is unreliable",
        "Climate change proponents are biased",
        "Fossil fuels are needed to address climate change",
    ]

    def classifier(input_text, candidate_labels):
        # Preprocess the input text exactly like the training data
        input_text_cleaned = clean_text(input_text)
        input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
        input_padded = pad_sequences(input_sequence, maxlen=MAX_LEN, padding="post")

        # Predict class probabilities
        prediction = np.ravel(model.predict(input_padded, verbose=0))
        return {
            "sequence": input_text,
            "labels": candidate_labels,
            "scores": list(prediction),
        }

    # Run inference on the held-out split
    predictions = []
    for text in tqdm(test_dataset["quote"]):
        result = classifier(text, candidate_labels)
        # Take the index of the highest-scoring class as the predicted label
        pred_label = int(np.argmax(result["scores"]))
        predictions.append(pred_label)

    # Stop tracking emissions
    emissions_data = tracker.stop_task()

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Prepare results dictionary
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
"emissions_gco2eq": emissions_data.emissions * 1000, "emissions_data": clean_emissions_data(emissions_data), "api_route": ROUTE, "dataset_config": { "dataset_name": request.dataset_name, "test_size": request.test_size, "test_seed": request.test_seed } } return results