Update tasks/text.py
Browse files- tasks/text.py +31 -137
tasks/text.py
CHANGED
@@ -10,22 +10,14 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
|
|
10 |
import os
|
11 |
import re
|
12 |
import pandas as pd
|
13 |
-
from tqdm import tqdm
|
14 |
-
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
|
15 |
-
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
|
16 |
import tensorflow as tf
|
17 |
-
import
|
18 |
-
|
19 |
-
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
20 |
-
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
|
21 |
-
from tensorflow.keras.models import Model, Sequential
|
22 |
-
from tensorflow.keras.layers import Convolution1D
|
23 |
-
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
|
24 |
|
25 |
|
26 |
router = APIRouter()
|
27 |
|
28 |
-
DESCRIPTION = "
|
29 |
ROUTE = "/text"
|
30 |
|
31 |
@router.post(ROUTE, tags=["Text Task"],
|
@@ -68,147 +60,49 @@ async def evaluate_text(request: TextEvaluationRequest):
|
|
68 |
train_dataset = train_test["train"]
|
69 |
test_dataset = train_test["test"]
|
70 |
|
71 |
-
import
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
|
76 |
-
from nltk.stem import WordNetLemmatizer
|
77 |
-
from nltk.corpus import stopwords
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
def clean_text(text):
|
84 |
-
text = re.sub(r'[^\w\s]','',text, re.UNICODE)
|
85 |
-
text = text.lower()
|
86 |
-
text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
|
87 |
-
text = [lemmatizer.lemmatize(token, "v") for token in text]
|
88 |
-
text = [word for word in text if not word in stop_words]
|
89 |
-
text = " ".join(text)
|
90 |
-
return text
|
91 |
-
|
92 |
-
train_df= pd.DataFrame(train_dataset["quote"], columns=['quote'])
|
93 |
-
train_df['clean_text'] = train_df.map(clean_text)
|
94 |
-
train_df['length_clean_text'] = train_df['clean_text'].map(len)
|
95 |
-
|
96 |
-
MAX_FEATURES = 6000
|
97 |
-
EMBED_SIZE = 28
|
98 |
-
RNN_CELL_SIZE = 32
|
99 |
-
MAX_LEN = 30
|
100 |
-
BATCH_SIZE = 100
|
101 |
-
EPOCHS = 30
|
102 |
-
|
103 |
-
tokenizer = Tokenizer(num_words=MAX_FEATURES)
|
104 |
-
tokenizer.fit_on_texts(train_df['clean_text'])
|
105 |
-
list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])
|
106 |
-
X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
|
107 |
true_labels = test_dataset["label"]
|
108 |
y_train = train_dataset["label"]
|
|
|
109 |
|
110 |
-
|
111 |
-
X_train_np = np.array(X_train)
|
112 |
-
y_train_np = np.array(y_train)
|
113 |
|
114 |
-
# Attention Layer
|
115 |
-
|
116 |
-
class Attention(tf.keras.Model):
|
117 |
-
def __init__(self, units):
|
118 |
-
super(Attention, self).__init__()
|
119 |
-
self.W1 = tf.keras.layers.Dense(units)
|
120 |
-
self.W2 = tf.keras.layers.Dense(units)
|
121 |
-
self.V = tf.keras.layers.Dense(1)
|
122 |
-
|
123 |
-
def call(self, features, hidden):
|
124 |
-
# hidden shape == (batch_size, hidden size)
|
125 |
-
# hidden_with_time_axis shape == (batch_size, 1, hidden size)
|
126 |
-
# we are doing this to perform addition to calculate the score
|
127 |
-
hidden_with_time_axis = tf.expand_dims(hidden, 1)
|
128 |
-
|
129 |
-
# score shape == (batch_size, max_length, 1)
|
130 |
-
# we get 1 at the last axis because we are applying score to self.V
|
131 |
-
# the shape of the tensor before applying self.V is (batch_size, max_length, units)
|
132 |
-
score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
|
133 |
-
|
134 |
-
# attention_weights shape == (batch_size, max_length, 1)
|
135 |
-
attention_weights = tf.nn.softmax(self.V(score), axis=1)
|
136 |
-
|
137 |
-
# context_vector shape after sum == (batch_size, hidden_size)
|
138 |
-
context_vector = attention_weights * features
|
139 |
-
context_vector = tf.reduce_sum(context_vector, axis=1)
|
140 |
-
|
141 |
-
return context_vector, attention_weights
|
142 |
|
143 |
# Model
|
144 |
-
|
145 |
-
sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
|
146 |
-
embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
|
147 |
-
|
148 |
-
lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)
|
149 |
-
|
150 |
-
|
151 |
-
(lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)
|
152 |
|
153 |
-
|
154 |
-
|
|
|
155 |
|
156 |
-
|
|
|
|
|
|
|
|
|
157 |
|
158 |
-
dense1 = Dense(20, activation="relu")(context_vector)
|
159 |
-
dropout = Dropout(0.05)(dense1)
|
160 |
-
output = Dense(8, activation="sigmoid")(dropout)
|
161 |
-
|
162 |
-
model = keras.Model(inputs=sequence_input, outputs=output)
|
163 |
-
|
164 |
-
# Compile
|
165 |
-
|
166 |
-
from keras.callbacks import EarlyStopping
|
167 |
-
from keras import backend
|
168 |
-
|
169 |
-
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
|
170 |
-
|
171 |
# Start tracking emissions
|
172 |
tracker.start()
|
173 |
tracker.start_task("inference")
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
"Not related to climate change disinformation",
|
182 |
-
"Climate change is not real and not happening",
|
183 |
-
"Climate change is not human-induced",
|
184 |
-
"Climate change impacts are not that bad",
|
185 |
-
"Climate change solutions are harmful and unnecessary",
|
186 |
-
"Climate change science is unreliable",
|
187 |
-
"Climate change proponents are biased",
|
188 |
-
"Fossil fuels are needed to address climate change"
|
189 |
-
]
|
190 |
-
def classifier(input_text,candidate_labels):
|
191 |
-
#PREPROCESS THE INPUT TEXT
|
192 |
-
input_text_cleaned = clean_text(input_text)
|
193 |
-
input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
|
194 |
-
input_padded = pad_sequences(input_sequence, maxlen = MAX_LEN, padding = 'post')
|
195 |
-
#PREDICTION
|
196 |
-
prediction = np.ravel(model.predict(input_padded))
|
197 |
-
return {'sequence': input_text,'labels': candidate_labels,'scores': list(prediction)}
|
198 |
-
|
199 |
-
|
200 |
-
predictions = []
|
201 |
-
|
202 |
-
for i, text in tqdm(enumerate(test_dataset["quote"])):
|
203 |
-
|
204 |
-
result = classifier(text, candidate_labels)
|
205 |
-
|
206 |
-
# Get index of highest scoring label
|
207 |
-
|
208 |
-
pred_label = candidate_labels.index(result["labels"][0])
|
209 |
-
|
210 |
-
predictions.append(pred_label)
|
211 |
-
|
212 |
# Stop tracking emissions
|
213 |
emissions_data = tracker.stop_task()
|
214 |
|
|
|
10 |
import os
|
11 |
import re
|
12 |
import pandas as pd
|
|
|
|
|
|
|
13 |
import tensorflow as tf
|
14 |
+
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
|
15 |
+
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
router = APIRouter()
|
19 |
|
20 |
+
DESCRIPTION = " XGBOOST classification"
|
21 |
ROUTE = "/text"
|
22 |
|
23 |
@router.post(ROUTE, tags=["Text Task"],
|
|
|
60 |
train_dataset = train_test["train"]
|
61 |
test_dataset = train_test["test"]
|
62 |
|
63 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
64 |
+
from sklearn.model_selection import train_test_split
|
65 |
+
from sklearn import metrics
|
66 |
+
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
|
67 |
+
from sklearn.tree import DecisionTreeClassifier
|
68 |
+
from datetime import datetime
|
69 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
70 |
|
71 |
+
tfidf_vect = TfidfVectorizer(stop_words = 'english')
|
|
|
|
|
72 |
|
73 |
+
tfidf_train = tfidf_vect.fit_transform(train_dataset['quote'])
|
74 |
+
tfidf_train = tfidf_vect.transform(train_dataset['quote'])
|
75 |
+
tfidf_test = tfidf_vect.fit_transform(test_dataset['quote'])
|
76 |
+
tfidf_test = tfidf_vect.transform(test_dataset['quote'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
true_labels = test_dataset["label"]
|
78 |
y_train = train_dataset["label"]
|
79 |
+
y_test = test_dataset["label"]
|
80 |
|
|
|
|
|
|
|
81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
# Model
|
84 |
+
import xgboost as xgb
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
+
#Parameters: {'colsample_bytree': 0.7039283369765, 'gamma': 0.3317686860083553, 'learning_rate': 0.08341079006092542, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.6594650911012452}
|
87 |
+
#Parameters: {'colsample_bytree': 0.7039283369765, 'gamma': 0.3317686860083553, 'learning_rate': 0.08341079006092542, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.6594650911012452}
|
88 |
+
#Parameters: {'colsample_bytree': 0.7498850106268238, 'gamma': 0.3690168082131852, 'learning_rate': 0.054839600377537934, 'max_depth': 5, 'n_estimators': 125, 'subsample': 0.6272998821416366}
|
89 |
|
90 |
+
#xgb_model = xgb.XGBRegressor(max_depth=5, objective='multi:softprob',
|
91 |
+
n_estimators=125, num_class=8, colsample_bytree=0.7498850106268238,gamma=0.3690168082131852,
|
92 |
+
learning_rate=0.054839600377537934, subsample=0.6272998821416366)
|
93 |
+
#xgb_model.fit(tfidf_train, y_train)
|
94 |
+
#y_pred = xgb_model.predict(tfidf_train)
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
# Start tracking emissions
|
97 |
tracker.start()
|
98 |
tracker.start_task("inference")
|
99 |
|
100 |
+
xgb_model = xgb.XGBRegressor(max_depth=6, objective='multi:softprob',
|
101 |
+
n_estimators=500, num_class=8, colsample_bytree=0.75,gamma=0.35,
|
102 |
+
learning_rate=0.06, subsample=0.63)
|
103 |
+
xgb_model.fit(tfidf_test, y_test)
|
104 |
+
predictions = np.argmax(xgb_model.predict(tfidf_test), axis=1)
|
105 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
# Stop tracking emissions
|
107 |
emissions_data = tracker.stop_task()
|
108 |
|