from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, Concatenate, Dense, Dropout, Embedding, Input, LSTM
from .utils.evaluation import TextEvaluationRequest
from .utils.emissions import tracker, clean_emissions_data, get_space_info
router = APIRouter()
DESCRIPTION = "Attention BiLSTM classification"
ROUTE = "/text"
@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
"""
Evaluate text classification for climate disinformation detection.
Current Model: DistilBert classification
- DistilBert classification predictions from the label space (0-7)
- Used as a baseline for comparison
"""
# Get space info
username, space_url = get_space_info()
# Define the label mapping
LABEL_MAPPING = {
"0_not_relevant": 0,
"1_not_happening": 1,
"2_not_human": 2,
"3_not_bad": 3,
"4_solutions_harmful_unnecessary": 4,
"5_science_unreliable": 5,
"6_proponents_biased": 6,
"7_fossil_fuels_needed": 7
}
# Load and prepare the dataset
dataset = load_dataset(request.dataset_name)
# Convert string labels to integers
dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
# Split dataset
train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
train_dataset = train_test["train"]
test_dataset = train_test["test"]
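    # This held-out split is what the accuracy and emissions figures below are
    # reported on; request.test_seed keeps the split reproducible.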
    # NLTK resources for text cleaning (downloaded at runtime if missing)
    import nltk
    nltk.download('stopwords')
    nltk.download('wordnet')
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    def clean_text(text):
        # Remove punctuation; re.UNICODE must be passed as `flags=` because
        # the fourth positional argument of re.sub is `count`, not `flags`.
        text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
        text = text.lower()
        # Lemmatize as nouns, then as verbs, and drop English stopwords.
        text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
        text = [lemmatizer.lemmatize(token, "v") for token in text]
        text = [word for word in text if word not in stop_words]
        return " ".join(text)
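    # Illustrative only (exact output depends on the installed NLTK data):
    # clean_text("Temperatures are rising rapidly!") should come out roughly
    # as "temperature rise rapidly".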
    train_df = pd.DataFrame(train_dataset["quote"], columns=['quote'])
    # Map clean_text over the 'quote' column (not over the whole DataFrame).
    train_df['clean_text'] = train_df['quote'].map(clean_text)
    train_df['length_clean_text'] = train_df['clean_text'].map(len)
    # Hyperparameters
    MAX_FEATURES = 6000  # vocabulary size kept by the tokenizer
    EMBED_SIZE = 28      # embedding dimension
    RNN_CELL_SIZE = 32   # LSTM units per direction
    MAX_LEN = 30         # padded/truncated sequence length
    BATCH_SIZE = 100
    EPOCHS = 30
    tokenizer = Tokenizer(num_words=MAX_FEATURES)
    tokenizer.fit_on_texts(train_df['clean_text'])
    list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])
    X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)

    true_labels = test_dataset["label"]
    y_train = train_dataset["label"]
    # Keras fit() expects arrays, not Python lists.
    X_train_np = np.array(X_train)
    y_train_np = np.array(y_train)
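    # X_train_np has shape (num_train_quotes, MAX_LEN); shorter sequences are
    # left-padded with zeros (pad_sequences defaults to padding='pre').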
    # Attention Layer
    class Attention(tf.keras.Model):
        def __init__(self, units):
            super(Attention, self).__init__()
            self.W1 = tf.keras.layers.Dense(units)
            self.W2 = tf.keras.layers.Dense(units)
            self.V = tf.keras.layers.Dense(1)

        def call(self, features, hidden):
            # hidden shape == (batch_size, hidden size)
            # hidden_with_time_axis shape == (batch_size, 1, hidden size)
            # we are doing this to perform addition to calculate the score
            hidden_with_time_axis = tf.expand_dims(hidden, 1)
            # score shape == (batch_size, max_length, 1)
            # we get 1 at the last axis because we are applying score to self.V
            # the shape of the tensor before applying self.V is (batch_size, max_length, units)
            score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
            # attention_weights shape == (batch_size, max_length, 1)
            attention_weights = tf.nn.softmax(self.V(score), axis=1)
            # context_vector shape after sum == (batch_size, hidden_size)
            context_vector = attention_weights * features
            context_vector = tf.reduce_sum(context_vector, axis=1)
            return context_vector, attention_weights
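    # This is Bahdanau-style additive attention: score = V(tanh(W1*features +
    # W2*hidden)) is softmaxed over the time axis and used to pool the BiLSTM
    # outputs into a single context vector per example.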
    # Model
    sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
    embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
    lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True), name="bi_lstm_0")(embedded_sequences)
    (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(
        LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1"
    )(lstm)
    # Concatenate the final forward/backward states for the attention query.
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    context_vector, attention_weights = Attention(10)(lstm, state_h)
    dense1 = Dense(20, activation="relu")(context_vector)
    dropout = Dropout(0.05)(dense1)
    # Softmax (not sigmoid): sparse categorical crossentropy expects a
    # probability distribution over the 8 classes.
    output = Dense(8, activation="softmax")(dropout)
    model = keras.Model(inputs=sequence_input, outputs=output)
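    # Sanity-check the wiring; with the constants above the summary should show
    # (None, 30) -> Embedding (None, 30, 28) -> two BiLSTMs (None, 30, 64)
    # -> attention context (None, 64) -> Dense(20) -> Dense(8).
    model.summary()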
    # Compile and train
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)

    # Start tracking emissions
    tracker.start()
    tracker.start_task("inference")

    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    history = model.fit(X_train_np, y_train_np, shuffle=False, batch_size=BATCH_SIZE, verbose=1, epochs=EPOCHS)
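    # Note: the tracked "inference" task starts before fit(), so the reported
    # energy figure includes training as well as test-set prediction.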
    # Run inference on the held-out test split
    candidate_labels = [
        "Not related to climate change disinformation",
        "Climate change is not real and not happening",
        "Climate change is not human-induced",
        "Climate change impacts are not that bad",
        "Climate change solutions are harmful and unnecessary",
        "Climate change science is unreliable",
        "Climate change proponents are biased",
        "Fossil fuels are needed to address climate change"
    ]
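    # candidate_labels[i] is the human-readable description of class i in
    # LABEL_MAPPING, so list position doubles as the numeric label.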
    def classifier(input_text, candidate_labels):
        # Preprocess the input text exactly as the training data was.
        input_text_cleaned = clean_text(input_text)
        input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
        input_padded = pad_sequences(input_sequence, maxlen=MAX_LEN, padding='post')
        # Predict class probabilities; scores stay parallel to candidate_labels.
        prediction = np.ravel(model.predict(input_padded, verbose=0))
        return {'sequence': input_text, 'labels': candidate_labels, 'scores': list(prediction)}
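    # Shape of the returned value (scores illustrative, not real output):
    # {'sequence': '<original text>', 'labels': [<8 candidate labels>],
    #  'scores': [0.02, 0.81, ...]}  # one probability per class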
    predictions = []
    for text in tqdm(test_dataset["quote"]):
        result = classifier(text, candidate_labels)
        # Take the index of the highest-scoring class; 'scores' is parallel to
        # candidate_labels, so the argmax is the numeric label itself.
        pred_label = int(np.argmax(result["scores"]))
        predictions.append(pred_label)
    # Stop tracking emissions
    emissions_data = tracker.stop_task()

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Prepare results dictionary
    results = {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,  # kWh -> Wh
        "emissions_gco2eq": emissions_data.emissions * 1000,  # kg -> g CO2eq
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": {
            "dataset_name": request.dataset_name,
            "test_size": request.test_size,
            "test_seed": request.test_seed
        }
    }
    return results