Update tasks/text.py
tasks/text.py  +154 -121  CHANGED
@@ -2,27 +2,30 @@ from fastapi import APIRouter
 from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
-from sklearn.linear_model import LogisticRegression
-from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
 import pandas as pd
-import
-from transformers import DistilBertTokenizer
-from transformers import TFDistilBertForSequenceClassification
-from transformers import logging
+import numpy as np
 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
 import os
 import re
-
-
-
-
+import pandas as pd
+from tqdm import tqdm
+from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
+from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
+import tensorflow as tf
+import tensorflow.keras as keras
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
+from tensorflow.keras.models import Model, Sequential
+from tensorflow.keras.layers import Convolution1D
+from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
 
 
 router = APIRouter()
 
-DESCRIPTION = "
+DESCRIPTION = "Attention GRU classification"
 ROUTE = "/text"
 
 @router.post(ROUTE, tags=["Text Task"],
@@ -59,130 +62,160 @@ async def evaluate_text(request: TextEvaluationRequest):
 
 
     # Split dataset
-    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
-    test_dataset = train_test["test"]
-    train_dataset = train_test["train"]
-    y_train=train_dataset['label']
 
+    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
+
     train_dataset = train_test["train"]
-    tn=pd.DataFrame([(i, j, k) for i,j,k in zip(train_dataset["quote"] , train_dataset["source"],
-                     train_dataset["subsource"])], columns=['quote','source', 'subsource'])
     test_dataset = train_test["test"]
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    import nltk
+    nltk.download('stopwords')
+    nltk.download('wordnet')
+
+    import re
+    from nltk.stem import WordNetLemmatizer
+    from nltk.corpus import stopwords
+
+    stop_words = set(stopwords.words("english"))
+    lemmatizer = WordNetLemmatizer()
+
+
+    def clean_text(text):
+        text = re.sub(r'[^\w\s]','',text, re.UNICODE)
+        text = text.lower()
+        text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
+        text = [lemmatizer.lemmatize(token, "v") for token in text]
+        text = [word for word in text if not word in stop_words]
+        text = " ".join(text)
+        return text
+
+    train_df= pd.DataFrame(train_dataset["quote"], columns=['quote'])
+    train_df['clean_text'] = train_df.map(clean_text)
+    train_df['length_clean_text'] = train_df['clean_text'].map(len)
+
+    MAX_FEATURES = 6000
+    EMBED_SIZE = 28
+    RNN_CELL_SIZE = 32
+    MAX_LEN = 30
+    BATCH_SIZE = 100
+    EPOCHS = 30
+
+    tokenizer = Tokenizer(num_words=MAX_FEATURES)
+    tokenizer.fit_on_texts(train_df['clean_text'])
+    list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])
+    X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
+    true_labels = test_dataset["label"]
+    y_train = train_dataset["label"]
+
+
+    X_train_np = np.array(X_train)
+    y_train_np = np.array(y_train)
+
+    # Attention Layer
+
+    class Attention(tf.keras.Model):
+        def __init__(self, units):
+            super(Attention, self).__init__()
+            self.W1 = tf.keras.layers.Dense(units)
+            self.W2 = tf.keras.layers.Dense(units)
+            self.V = tf.keras.layers.Dense(1)
+
+        def call(self, features, hidden):
+            # hidden shape == (batch_size, hidden size)
+            # hidden_with_time_axis shape == (batch_size, 1, hidden size)
+            # we are doing this to perform addition to calculate the score
+            hidden_with_time_axis = tf.expand_dims(hidden, 1)
+
+            # score shape == (batch_size, max_length, 1)
+            # we get 1 at the last axis because we are applying score to self.V
+            # the shape of the tensor before applying self.V is (batch_size, max_length, units)
+            score = tf.nn.tanh(
+                self.W1(features) + self.W2(hidden_with_time_axis))
+
+            # attention_weights shape == (batch_size, max_length, 1)
+            attention_weights = tf.nn.softmax(self.V(score), axis=1)
+
+            # context_vector shape after sum == (batch_size, hidden_size)
+            context_vector = attention_weights * features
+            context_vector = tf.reduce_sum(context_vector, axis=1)
+
+            return context_vector, attention_weights
+
+    # Model
+
+    sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
+    embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
+
+    lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)
 
+
+    (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)
+
+    state_h = Concatenate()([forward_h, backward_h])
+    state_c = Concatenate()([forward_c, backward_c])
+
+    context_vector, attention_weights = Attention(10)(lstm, state_h)
+
+    dense1 = Dense(20, activation="relu")(context_vector)
+    dropout = Dropout(0.05)(dense1)
+    output = Dense(8, activation="sigmoid")(dropout)
+
+    model = keras.Model(inputs=sequence_input, outputs=output)
+
+    # Compile
+
+    from keras.callbacks import EarlyStopping
+    from keras import backend
+
+    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
+
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
 
-
-
-
-
-    # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
-    #--------------------------------------------------------------------------------------------
-
+    model.compile(loss='SparseCategoricalCrossentropy', optimizer=optimizer, metrics=['accuracy'])
+
+    history = model.fit(X_train_np,y_train_np, shuffle=False,batch_size=BATCH_SIZE, verbose=1,epochs=EPOCHS)
+
     # Make predictions (placeholder for actual model inference)
-
-
-
-
-
-
-
+    candidate_labels = [
+        "Not related to climate change disinformation",
+        "Climate change is not real and not happening",
+        "Climate change is not human-induced",
+        "Climate change impacts are not that bad",
+        "Climate change solutions are harmful and unnecessary",
+        "Climate change science is unreliable",
+        "Climate change proponents are biased",
+        "Fossil fuels are needed to address climate change"
+    ]
+    def classifier(input_text,candidate_labels):
+        #PREPROCESS THE INPUT TEXT
+        input_text_cleaned = clean_text(input_text)
+        input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
+        input_padded = pad_sequences(input_sequence, maxlen = MAX_LEN, padding = 'post')
+        #PREDICTION
+        prediction = np.ravel(model.predict(input_padded))
+        return {'sequence': input_text,'labels': candidate_labels,'scores': list(prediction)}
 
+
+    predictions = []
+
+    for i, text in tqdm(enumerate(test_dataset["quote"])):
+
+        result = classifier(text, candidate_labels)
 
+        # Get index of highest scoring label
+
+        pred_label = candidate_labels.index(result["labels"][0])
+
+        predictions.append(pred_label)
+
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
 
-
-
-
-        predict_input =tokenizer.encode(text,
-                                        truncation=True,
-                                        padding=True,
-                                        return_tensors="tf")
-        output = model(predict_input)[0]
-        prediction_value = tf.argmax(output, axis=1).numpy()[0]
-        return prediction_value
-    # - - - - - - - - - - - - - - - - - - - - - - - - -
-    y_pred = []
-    for text_ in test_dataset_df.to_list():
-        y_pred.append(predict_category(text_))
-
-    accuracy_score(test_labels, y_pred)
+    # Calculate accuracy
+    accuracy = accuracy_score(true_labels, predictions)
+
 
     # Prepare results dictionary
     results = {
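
Below is a minimal sketch of how the added Attention layer could be exercised on its own, assuming the class definition from the hunk above is in scope. The tensor shapes mirror the model in this diff (MAX_LEN = 30 timesteps, 2 * RNN_CELL_SIZE = 64 features per timestep), but the tensors themselves are hypothetical dummy data.

import tensorflow as tf

# Hypothetical BiLSTM outputs: (batch, timesteps, features) == (2, 30, 64)
features = tf.random.normal((2, 30, 64))
# Hypothetical concatenated final hidden state: (batch, features) == (2, 64)
hidden = tf.random.normal((2, 64))

attention = Attention(10)  # the Attention class added in the hunk above
context_vector, attention_weights = attention(features, hidden)

print(context_vector.shape)     # (2, 64): weighted sum over the 30 timesteps
print(attention_weights.shape)  # (2, 30, 1): softmax weights over timesteps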
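For a single quote, the new inference path can be sketched as follows, assuming the model, tokenizer, clean_text, MAX_LEN and candidate_labels defined in the diff are in scope. The example quote is hypothetical, and np.argmax is used here to turn the eight output scores into a label index.

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

quote = "The climate has always changed, so current warming is nothing unusual."  # hypothetical input
sequence = tokenizer.texts_to_sequences([clean_text(quote)])      # reuse the fitted Keras Tokenizer
padded = pad_sequences(sequence, maxlen=MAX_LEN, padding="post")  # same padding as in classifier()

scores = np.ravel(model.predict(padded))   # one score per candidate label (8 outputs)
pred_index = int(np.argmax(scores))        # index of the highest-scoring class
print(candidate_labels[pred_index], float(scores[pred_index]))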