laureBe committed on
Commit 2aa2667 · verified · 1 Parent(s): f8008ee

Update tasks/text.py

Files changed (1)
  1. tasks/text.py +31 -137
tasks/text.py CHANGED
@@ -10,22 +10,14 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
  import os
  import re
  import pandas as pd
- from tqdm import tqdm
- from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
- from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
  import tensorflow as tf
- import tensorflow.keras as keras
- from tensorflow.keras.preprocessing.text import Tokenizer
- from tensorflow.keras.preprocessing.sequence import pad_sequences
- from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
- from tensorflow.keras.models import Model, Sequential
- from tensorflow.keras.layers import Convolution1D
- from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
+ from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
+


  router = APIRouter()

- DESCRIPTION = "Attention GRU classification"
+ DESCRIPTION = "XGBoost classification"
  ROUTE = "/text"

  @router.post(ROUTE, tags=["Text Task"],
@@ -68,147 +60,49 @@ async def evaluate_text(request: TextEvaluationRequest):
  train_dataset = train_test["train"]
  test_dataset = train_test["test"]

- import nltk
- nltk.download('stopwords')
- nltk.download('wordnet')
-
- import re
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import stopwords
-
- stop_words = set(stopwords.words("english"))
- lemmatizer = WordNetLemmatizer()
-
-
- def clean_text(text):
-     text = re.sub(r'[^\w\s]', '', text, re.UNICODE)
-     text = text.lower()
-     text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
-     text = [lemmatizer.lemmatize(token, "v") for token in text]
-     text = [word for word in text if not word in stop_words]
-     text = " ".join(text)
-     return text
-
- train_df = pd.DataFrame(train_dataset["quote"], columns=['quote'])
- train_df['clean_text'] = train_df.map(clean_text)
- train_df['length_clean_text'] = train_df['clean_text'].map(len)
-
- MAX_FEATURES = 6000
- EMBED_SIZE = 28
- RNN_CELL_SIZE = 32
- MAX_LEN = 30
- BATCH_SIZE = 100
- EPOCHS = 30
-
- tokenizer = Tokenizer(num_words=MAX_FEATURES)
- tokenizer.fit_on_texts(train_df['clean_text'])
- list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])
- X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.model_selection import train_test_split
+ from sklearn import metrics
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+ from sklearn.tree import DecisionTreeClassifier
+ from datetime import datetime
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ tfidf_vect = TfidfVectorizer(stop_words='english')
+
+ # Fit the TF-IDF vectorizer on the training quotes only, then reuse it to transform the test quotes
+ tfidf_train = tfidf_vect.fit_transform(train_dataset['quote'])
+ tfidf_test = tfidf_vect.transform(test_dataset['quote'])
  true_labels = test_dataset["label"]
  y_train = train_dataset["label"]
+ y_test = test_dataset["label"]

- X_train_np = np.array(X_train)
- y_train_np = np.array(y_train)
-
- # Attention Layer
-
- class Attention(tf.keras.Model):
-     def __init__(self, units):
-         super(Attention, self).__init__()
-         self.W1 = tf.keras.layers.Dense(units)
-         self.W2 = tf.keras.layers.Dense(units)
-         self.V = tf.keras.layers.Dense(1)
-
-     def call(self, features, hidden):
-         # hidden shape == (batch_size, hidden size)
-         # hidden_with_time_axis shape == (batch_size, 1, hidden size)
-         # we are doing this to perform addition to calculate the score
-         hidden_with_time_axis = tf.expand_dims(hidden, 1)
-
-         # score shape == (batch_size, max_length, 1)
-         # we get 1 at the last axis because we are applying score to self.V
-         # the shape of the tensor before applying self.V is (batch_size, max_length, units)
-         score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
-
-         # attention_weights shape == (batch_size, max_length, 1)
-         attention_weights = tf.nn.softmax(self.V(score), axis=1)
-
-         # context_vector shape after sum == (batch_size, hidden_size)
-         context_vector = attention_weights * features
-         context_vector = tf.reduce_sum(context_vector, axis=1)
-
-         return context_vector, attention_weights

  # Model
-
- sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
- embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
-
- lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True), name="bi_lstm_0")(embedded_sequences)
-
- (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)
-
- state_h = Concatenate()([forward_h, backward_h])
- state_c = Concatenate()([forward_c, backward_c])
-
- context_vector, attention_weights = Attention(10)(lstm, state_h)
-
- dense1 = Dense(20, activation="relu")(context_vector)
- dropout = Dropout(0.05)(dense1)
- output = Dense(8, activation="sigmoid")(dropout)
-
- model = keras.Model(inputs=sequence_input, outputs=output)
-
- # Compile
-
- from keras.callbacks import EarlyStopping
- from keras import backend
-
- optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
-
+ import xgboost as xgb
+
+ # Best parameters from earlier tuning runs:
+ # Parameters: {'colsample_bytree': 0.7039283369765, 'gamma': 0.3317686860083553, 'learning_rate': 0.08341079006092542, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.6594650911012452}
+ # Parameters: {'colsample_bytree': 0.7498850106268238, 'gamma': 0.3690168082131852, 'learning_rate': 0.054839600377537934, 'max_depth': 5, 'n_estimators': 125, 'subsample': 0.6272998821416366}
+
+ # xgb_model = xgb.XGBRegressor(max_depth=5, objective='multi:softprob',
+ #                              n_estimators=125, num_class=8, colsample_bytree=0.7498850106268238, gamma=0.3690168082131852,
+ #                              learning_rate=0.054839600377537934, subsample=0.6272998821416366)
+ # xgb_model.fit(tfidf_train, y_train)
+ # y_pred = xgb_model.predict(tfidf_train)
+
  # Start tracking emissions
  tracker.start()
  tracker.start_task("inference")

- model.compile(loss='SparseCategoricalCrossentropy', optimizer=optimizer, metrics=['accuracy'])
-
- history = model.fit(X_train_np, y_train_np, shuffle=False, batch_size=BATCH_SIZE, verbose=1, epochs=EPOCHS)
-
- # Make predictions (placeholder for actual model inference)
- candidate_labels = [
-     "Not related to climate change disinformation",
-     "Climate change is not real and not happening",
-     "Climate change is not human-induced",
-     "Climate change impacts are not that bad",
-     "Climate change solutions are harmful and unnecessary",
-     "Climate change science is unreliable",
-     "Climate change proponents are biased",
-     "Fossil fuels are needed to address climate change"
- ]
-
- def classifier(input_text, candidate_labels):
-     #PREPROCESS THE INPUT TEXT
-     input_text_cleaned = clean_text(input_text)
-     input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
-     input_padded = pad_sequences(input_sequence, maxlen=MAX_LEN, padding='post')
-     #PREDICTION
-     prediction = np.ravel(model.predict(input_padded))
-     return {'sequence': input_text, 'labels': candidate_labels, 'scores': list(prediction)}
-
- predictions = []
-
- for i, text in tqdm(enumerate(test_dataset["quote"])):
-     result = classifier(text, candidate_labels)
-     # Get index of highest scoring label
-     pred_label = candidate_labels.index(result["labels"][0])
-     predictions.append(pred_label)
-
+ # Train the XGBoost classifier on the training split, then predict on the test split
+ xgb_model = xgb.XGBClassifier(max_depth=6, objective='multi:softprob',
+                               n_estimators=500, colsample_bytree=0.75, gamma=0.35,
+                               learning_rate=0.06, subsample=0.63)
+ xgb_model.fit(tfidf_train, y_train)
+ predictions = xgb_model.predict(tfidf_test)
+
  # Stop tracking emissions
  emissions_data = tracker.stop_task()
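For context, below is a minimal, self-contained sketch of the TF-IDF + XGBoost flow this commit switches the text task to. The toy quotes, labels, and variable names (train_quotes, test_quotes, and so on) are illustrative placeholders rather than identifiers from tasks/text.py; n_estimators is reduced so the example runs quickly, and the remaining hyperparameters mirror the values used in the diff.

import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Toy data standing in for the dataset's "quote" and "label" columns
train_quotes = ["the climate is not changing", "warming is caused by humans", "the models are unreliable"] * 10
train_labels = np.array([0, 1, 2] * 10)
test_quotes = ["the climate is not changing at all"]
test_labels = np.array([0])

# Fit the vectorizer on the training text only; reuse it to transform the test text
vectorizer = TfidfVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(train_quotes)
X_test = vectorizer.transform(test_quotes)

# Multi-class XGBoost classifier; the class count is inferred from the labels,
# and n_estimators is kept small here (the commit uses 500)
model = xgb.XGBClassifier(max_depth=6, n_estimators=50, learning_rate=0.06,
                          subsample=0.63, colsample_bytree=0.75, gamma=0.35)
model.fit(X_train, train_labels)
predictions = model.predict(X_test)
print(accuracy_score(test_labels, predictions))

Parameter dictionaries like the ones preserved in the comments above are typically the output of a hyperparameter search (for example sklearn's RandomizedSearchCV) run over these same XGBoost settings.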