from fastapi import APIRouter
from datetime import datetime

import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Concatenate,
    Dense,
    Dropout,
    Embedding,
    Input,
)

from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info

router = APIRouter()

DESCRIPTION = "Bidirectional LSTM with attention classification"
ROUTE = "/text"


@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Evaluate text classification for climate disinformation detection.

    Current model: bidirectional LSTM with an attention layer.
    - Produces predictions from the label space (0-7)
    - Used as a baseline for comparison
    """
    username, space_url = get_space_info()
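
    # Map the dataset's string labels onto the integer class ids the model predicts.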
    LABEL_MAPPING = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7,
    }
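
    # Load the requested dataset and convert its string labels to integers.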
    dataset = load_dataset(request.dataset_name)
    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
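
    # Split the dataset's train split using the requested test size and seed.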
    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
    train_dataset = train_test["train"]
    test_dataset = train_test["test"]
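
    # NLTK resources used for stop-word removal and lemmatization.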
    import nltk

    nltk.download("stopwords")
    nltk.download("wordnet")

    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        """Lowercase, strip punctuation, lemmatize, and drop stop words."""
        # re.sub's fourth positional argument is `count`, so the flag must be
        # passed by keyword.
        text = re.sub(r"[^\w\s]", "", text, flags=re.UNICODE)
        text = text.lower()
        tokens = [lemmatizer.lemmatize(token) for token in text.split()]
        tokens = [lemmatizer.lemmatize(token, "v") for token in tokens]
        tokens = [word for word in tokens if word not in stop_words]
        return " ".join(tokens)

    train_df = pd.DataFrame(train_dataset["quote"], columns=["quote"])
    train_df["clean_text"] = train_df["quote"].map(clean_text)
    train_df["length_clean_text"] = train_df["clean_text"].map(len)
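
    # Tokenizer and model hyperparameters.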
    MAX_FEATURES = 6000  # vocabulary size kept by the tokenizer
    EMBED_SIZE = 28      # embedding dimension
    RNN_CELL_SIZE = 32   # LSTM units per direction
    MAX_LEN = 30         # maximum padded sequence length
    BATCH_SIZE = 100
    EPOCHS = 30

    # Fit the tokenizer on the cleaned training text and pad to MAX_LEN.
    tokenizer = Tokenizer(num_words=MAX_FEATURES)
    tokenizer.fit_on_texts(train_df["clean_text"])
    list_tokenized_train = tokenizer.texts_to_sequences(train_df["clean_text"])
    X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
    true_labels = test_dataset["label"]
    y_train = train_dataset["label"]

    X_train_np = np.array(X_train)
    y_train_np = np.array(y_train)

    class Attention(tf.keras.Model):
        """Bahdanau-style additive attention over the LSTM outputs."""

        def __init__(self, units):
            super(Attention, self).__init__()
            self.W1 = tf.keras.layers.Dense(units)
            self.W2 = tf.keras.layers.Dense(units)
            self.V = tf.keras.layers.Dense(1)

        def call(self, features, hidden):
            # features: (batch, time steps, hidden size)
            # hidden: (batch, hidden size) -> (batch, 1, hidden size)
            hidden_with_time_axis = tf.expand_dims(hidden, 1)

            # Additive score: (batch, time steps, units)
            score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

            # Attention weights over the time axis: (batch, time steps, 1)
            attention_weights = tf.nn.softmax(self.V(score), axis=1)

            # Weighted sum of the features: (batch, hidden size)
            context_vector = attention_weights * features
            context_vector = tf.reduce_sum(context_vector, axis=1)

            return context_vector, attention_weights
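
    # Two stacked bidirectional LSTMs; the second also returns its final
    # states, whose concatenation serves as the attention query.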
    sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
    embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)

    lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True), name="bi_lstm_0")(embedded_sequences)

    (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(
        LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1"
    )(lstm)

    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    context_vector, attention_weights = Attention(10)(lstm, state_h)

    dense1 = Dense(20, activation="relu")(context_vector)
    dropout = Dropout(0.05)(dense1)
    # softmax rather than sigmoid: the eight classes are mutually exclusive
    # and the loss below is sparse categorical cross-entropy.
    output = Dense(8, activation="softmax")(dropout)

    model = keras.Model(inputs=sequence_input, outputs=output)

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
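
    # Note: model training happens inside the tracked task, so the reported
    # "inference" energy and emissions include training as well.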
    tracker.start()
    tracker.start_task("inference")

    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    history = model.fit(X_train_np, y_train_np, shuffle=False, batch_size=BATCH_SIZE, verbose=1, epochs=EPOCHS)

    candidate_labels = [
        "Not related to climate change disinformation",
        "Climate change is not real and not happening",
        "Climate change is not human-induced",
        "Climate change impacts are not that bad",
        "Climate change solutions are harmful and unnecessary",
        "Climate change science is unreliable",
        "Climate change proponents are biased",
        "Fossil fuels are needed to address climate change",
    ]

    def classifier(input_text, candidate_labels):
        """Clean, tokenize, and pad one quote, then return per-class scores."""
        input_text_cleaned = clean_text(input_text)
        input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
        input_padded = pad_sequences(input_sequence, maxlen=MAX_LEN, padding="post")

        prediction = np.ravel(model.predict(input_padded))
        return {"sequence": input_text, "labels": candidate_labels, "scores": list(prediction)}
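
    # Run the model over the test split one quote at a time and keep the
    # highest-scoring class id for each prediction.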
    predictions = []

    for text in tqdm(test_dataset["quote"]):
        result = classifier(text, candidate_labels)
        # `labels` comes back in its original order, so the predicted class is
        # the argmax of the scores rather than the first label.
        pred_label = int(np.argmax(result["scores"]))
        predictions.append(pred_label)

    emissions_data = tracker.stop_task()

    accuracy = accuracy_score(true_labels, predictions)

    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed,
        },
    }

    return results