laureBe committed
Commit 8963d03 · verified · 1 Parent(s): 24930f7

Update tasks/text.py

Files changed (1)
  1. tasks/text.py +154 -121
tasks/text.py CHANGED
@@ -2,27 +2,30 @@ from fastapi import APIRouter
2
  from datetime import datetime
3
  from datasets import load_dataset
4
  from sklearn.metrics import accuracy_score
5
- from sklearn.linear_model import LogisticRegression
6
- from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.model_selection import train_test_split
8
  import pandas as pd
9
- import tensorflow as tf
10
- from transformers import DistilBertTokenizer
11
- from transformers import TFDistilBertForSequenceClassification
12
- from transformers import logging
13
  from .utils.evaluation import TextEvaluationRequest
14
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
15
  import os
16
  import re
17
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
18
- os.environ['TF_ENABLE_ONEDNN_OPTS'] ='0'
19
- logging.set_verbosity_error()
20
- logging.set_verbosity_warning()
21
 
22
 
23
  router = APIRouter()
24
 
25
- DESCRIPTION = "DistilBert classification"
26
  ROUTE = "/text"
27
 
28
  @router.post(ROUTE, tags=["Text Task"],
@@ -59,130 +62,160 @@ async def evaluate_text(request: TextEvaluationRequest):
59
 
60
 
61
  # Split dataset
62
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
63
- test_dataset = train_test["test"]
64
- train_dataset = train_test["train"]
65
- y_train=train_dataset['label']
66
 
 
 
67
  train_dataset = train_test["train"]
68
- tn=pd.DataFrame([(i, j, k) for i,j,k in zip(train_dataset["quote"] , train_dataset["source"],
69
- train_dataset["subsource"])], columns=['quote','source', 'subsource'])
70
  test_dataset = train_test["test"]
71
- tt=pd.DataFrame([(i, j, k) for i,j,k in zip(test_dataset["quote"] , test_dataset["source"],
72
- test_dataset["subsource"])], columns=['quote','source', 'subsource'])
73
- tt.fillna("",inplace=True)
74
- tn.fillna("",inplace=True)
75
-
76
- tn['text'] = tn[['quote', 'source','subsource']].agg(' '.join, axis=1)
77
- tt['text'] = tn[['quote', 'source','subsource']].agg(' '.join, axis=1)
78
-
79
- def clean_text(x):
80
- pattern = r'[^a-zA-z0-9\s]'
81
- text = re.sub(pattern, '', x)
82
- return x
83
-
84
- def clean_numbers(x):
85
- if bool(re.search(r'\d', x)):
86
- x = re.sub('[0-9]{5,}', '#####', x)
87
- x = re.sub('[0-9]{4}', '####', x)
88
- x = re.sub('[0-9]{3}', '###', x)
89
- x = re.sub('[0-9]{2}', '##', x)
90
- return x
91
-
92
- contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
93
-
94
- def _get_contractions(contraction_dict):
95
- contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
96
- return contraction_dict, contraction_re
97
-
98
- contractions, contractions_re = _get_contractions(contraction_dict)
99
-
100
- def replace_contractions(text):
101
- def replace(match):
102
- return contractions[match.group(0)]
103
- return contractions_re.sub(replace, text)
104
-
105
- train_dataset_df = tn['quote'].apply(lambda x: x.lower())
106
- test_dataset_df = tt['quote'].apply(lambda x: x.lower())
107
-
108
- # Clean the text
109
- train_dataset_df = train_dataset_df.apply(lambda x: clean_text(x))
110
- test_dataset_df= test_dataset_df.apply(lambda x: clean_text(x))
111
-
112
- # Clean numbers
113
- train_dataset_df= train_dataset_df.apply(lambda x: clean_numbers(x))
114
- test_dataset_df = test_dataset_df.apply(lambda x: clean_numbers(x))
115
-
116
- # Clean Contractions
117
- train_dataset_df = train_dataset_df.apply(lambda x: replace_contractions(x))
118
- test_dataset_df = test_dataset_df.apply(lambda x: replace_contractions(x))
119
-
120
- # Encoding
121
- y_train_df=pd.DataFrame(train_dataset['label'], columns=['label'])
122
- y_test_df=pd.DataFrame(test_dataset['label'], columns=['label'])
123
- y_train_encoded = y_train_df['label'].astype('category').cat.codes
124
- y_test_encoded = y_test_df['label'].astype('category').cat.codes
125
- train_labels = y_train_encoded.to_list()
126
- test_labels=y_test_encoded.to_list()
127
-
128
- # Tokenize
129
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
130
- train_encodings = tokenizer(train_dataset_df.to_list(), truncation=True, padding=True)
131
- val_encodings = tokenizer(test_dataset_df.to_list(), truncation=True, padding=True)
132
-
133
- # Slicing
134
- train_dataset_bert = tf.data.Dataset.from_tensor_slices((
135
- dict(train_encodings),
136
- train_labels
137
- ))
138
- val_dataset_bert = tf.data.Dataset.from_tensor_slices((
139
- dict(val_encodings),
140
- test_labels
141
- ))
142
-
143
- model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)
144
-
145
 
146
  # Start tracking emissions
147
  tracker.start()
148
  tracker.start_task("inference")
149
 
150
- optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
151
- model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])
152
- #--------------------------------------------------------------------------------------------
153
- # YOUR MODEL INFERENCE CODE HERE
154
- # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
155
- #--------------------------------------------------------------------------------------------
156
-
157
  # Make predictions (placeholder for actual model inference)
158
-
159
- early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
160
-
161
- model.fit(train_dataset_bert.shuffle(1000).batch(16),epochs=2,batch_size=16,validation_data=val_dataset_bert.shuffle(1000).batch(16),callbacks=[early_stopping])
162
- #--------------------------------------------------------------------------------------------
163
- # YOUR MODEL INFERENCE STOPS HERE
164
- #--------------------------------------------------------------------------------------------
165
 
166
 
167
  # Stop tracking emissions
168
  emissions_data = tracker.stop_task()
169
 
170
-
171
- # Calculate accuracy
172
- def predict_category(text):
173
- predict_input =tokenizer.encode(text,
174
- truncation=True,
175
- padding=True,
176
- return_tensors="tf")
177
- output = model(predict_input)[0]
178
- prediction_value = tf.argmax(output, axis=1).numpy()[0]
179
- return prediction_value
180
- # - - - - - - - - - - - - - - - - - - - - - - - - - -
181
- y_pred = []
182
- for text_ in test_dataset_df.to_list():
183
- y_pred.append(predict_category(text_))
184
-
185
- accuracy_score(test_labels, y_pred)
186
 
187
  # Prepare results dictionary
188
  results = {
 
2
  from datetime import datetime
3
  from datasets import load_dataset
4
  from sklearn.metrics import accuracy_score
 
 
5
  from sklearn.model_selection import train_test_split
6
  import pandas as pd
7
+ import numpy as np
 
 
 
8
  from .utils.evaluation import TextEvaluationRequest
9
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
10
  import os
11
  import re
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+ from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
15
+ from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
16
+ import tensorflow as tf
17
+ import tensorflow.keras as keras
18
+ from tensorflow.keras.preprocessing.text import Tokenizer
19
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
20
+ from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
21
+ from tensorflow.keras.models import Model, Sequential
22
+ from tensorflow.keras.layers import Convolution1D
23
+ from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
24
 
25
 
26
  router = APIRouter()
27
 
28
+ DESCRIPTION = "Attention BiLSTM classification"
29
  ROUTE = "/text"
30
 
31
  @router.post(ROUTE, tags=["Text Task"],
 
62
 
63
 
64
  # Split dataset
 
 
 
 
65
 
66
+ train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
67
+
68
  train_dataset = train_test["train"]
 
 
69
  test_dataset = train_test["test"]
70
+
71
+ import nltk
72
+ nltk.download('stopwords')
73
+ nltk.download('wordnet')
74
+
75
+ import re
76
+ from nltk.stem import WordNetLemmatizer
77
+ from nltk.corpus import stopwords
78
+
79
+ stop_words = set(stopwords.words("english"))
80
+ lemmatizer = WordNetLemmatizer()
81
+
82
+
83
+ def clean_text(text):
84
+ text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
85
+ text = text.lower()
86
+ text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
87
+ text = [lemmatizer.lemmatize(token, "v") for token in text]
88
+ text = [word for word in text if not word in stop_words]
89
+ text = " ".join(text)
90
+ return text
91
+
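For reference, a minimal usage sketch of the cleaning step defined above (the quote is invented for illustration; it assumes the nltk downloads and the clean_text definition above have already run):

sample_quote = "The glaciers aren't melting, it's just a natural cycle!"  # invented example
print(clean_text(sample_quote))
# Prints a lowercased, punctuation-free, lemmatized string with English stopwords removed.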
92
+ train_df= pd.DataFrame(train_dataset["quote"], columns=['quote'])
93
+ train_df['clean_text'] = train_df['quote'].map(clean_text)
94
+ train_df['length_clean_text'] = train_df['clean_text'].map(len)
95
+
96
+ MAX_FEATURES = 6000
97
+ EMBED_SIZE = 28
98
+ RNN_CELL_SIZE = 32
99
+ MAX_LEN = 30
100
+ BATCH_SIZE = 100
101
+ EPOCHS = 30
102
+
103
+ tokenizer = Tokenizer(num_words=MAX_FEATURES)
104
+ tokenizer.fit_on_texts(train_df['clean_text'])
105
+ list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])
106
+ X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
107
+ true_labels = test_dataset["label"]
108
+ y_train = train_dataset["label"]
109
+
110
+
111
+ X_train_np = np.array(X_train)
112
+ y_train_np = np.array(y_train)
113
+
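For reference, a self-contained toy run of the Tokenizer / pad_sequences step above (sentences and sizes are invented; only the shapes matter):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

toy_texts = ["climate change is real", "climate science is unreliable"]  # invented examples
toy_tokenizer = Tokenizer(num_words=50)
toy_tokenizer.fit_on_texts(toy_texts)
toy_seqs = toy_tokenizer.texts_to_sequences(toy_texts)  # lists of word indices
toy_padded = pad_sequences(toy_seqs, maxlen=6)          # zero-padded on the left by default
print(toy_padded.shape)                                 # (2, 6)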
114
+ # Attention Layer
115
+
116
+ class Attention(tf.keras.Model):
117
+ def __init__(self, units):
118
+ super(Attention, self).__init__()
119
+ self.W1 = tf.keras.layers.Dense(units)
120
+ self.W2 = tf.keras.layers.Dense(units)
121
+ self.V = tf.keras.layers.Dense(1)
122
+
123
+ def call(self, features, hidden):
124
+ # hidden shape == (batch_size, hidden size)
125
+ # hidden_with_time_axis shape == (batch_size, 1, hidden size)
126
+ # we are doing this to perform addition to calculate the score
127
+ hidden_with_time_axis = tf.expand_dims(hidden, 1)
128
+
129
+ # score shape == (batch_size, max_length, 1)
130
+ # we get 1 at the last axis because we are applying score to self.V
131
+ # the shape of the tensor before applying self.V is (batch_size, max_length, units)
132
+ score = tf.nn.tanh(
133
+ self.W1(features) + self.W2(hidden_with_time_axis))
134
+
135
+ # attention_weights shape == (batch_size, max_length, 1)
136
+ attention_weights = tf.nn.softmax(self.V(score), axis=1)
137
+
138
+ # context_vector shape after sum == (batch_size, hidden_size)
139
+ context_vector = attention_weights * features
140
+ context_vector = tf.reduce_sum(context_vector, axis=1)
141
+
142
+ return context_vector, attention_weights
143
+
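A minimal shape check for the Attention layer above (assumes the class definition above is in scope; tensor sizes are arbitrary and only illustrate the expected shapes):

import tensorflow as tf

batch, timesteps, hidden = 4, 30, 64                     # arbitrary sizes
features = tf.random.normal((batch, timesteps, hidden))  # stands in for the BiLSTM outputs
state = tf.random.normal((batch, hidden))                # stands in for the concatenated final state
context_vector, attention_weights = Attention(10)(features, state)
print(context_vector.shape)     # (4, 64)
print(attention_weights.shape)  # (4, 30, 1)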
144
+ # Model
145
+
146
+ sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
147
+ embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
148
+
149
+ lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)
150
 
151
+
152
+ (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)
153
+
154
+ state_h = Concatenate()([forward_h, backward_h])
155
+ state_c = Concatenate()([forward_c, backward_c])
156
+
157
+ context_vector, attention_weights = Attention(10)(lstm, state_h)
158
+
159
+ dense1 = Dense(20, activation="relu")(context_vector)
160
+ dropout = Dropout(0.05)(dense1)
161
+ output = Dense(8, activation="sigmoid")(dropout)
162
+
163
+ model = keras.Model(inputs=sequence_input, outputs=output)
164
+
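A quick smoke test of the assembled model (sketch only; assumes MAX_LEN and model as defined above):

import numpy as np

dummy_batch = np.zeros((2, MAX_LEN), dtype="int32")  # two all-padding sequences
dummy_scores = model.predict(dummy_batch)
print(dummy_scores.shape)  # (2, 8): one score per output class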
165
+ # Compile
166
+
167
+ from keras.callbacks import EarlyStopping
168
+ from keras import backend
169
+
170
+ optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
171
+
172
  # Start tracking emissions
173
  tracker.start()
174
  tracker.start_task("inference")
175
 
176
+ model.compile(loss='SparseCategoricalCrossentropy', optimizer=optimizer, metrics=['accuracy'])
177
+
178
+ history = model.fit(X_train_np,y_train_np, shuffle=False,batch_size=BATCH_SIZE, verbose=1,epochs=EPOCHS)
179
+
 
 
 
180
  # Make predictions (placeholder for actual model inference)
181
+ candidate_labels = [
182
+ "Not related to climate change disinformation",
183
+ "Climate change is not real and not happening",
184
+ "Climate change is not human-induced",
185
+ "Climate change impacts are not that bad",
186
+ "Climate change solutions are harmful and unnecessary",
187
+ "Climate change science is unreliable",
188
+ "Climate change proponents are biased",
189
+ "Fossil fuels are needed to address climate change"
190
+ ]
191
+ def classifier(input_text,candidate_labels):
192
+ #PREPROCESS THE INPUT TEXT
193
+ input_text_cleaned = clean_text(input_text)
194
+ input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
195
+ input_padded = pad_sequences(input_sequence, maxlen=MAX_LEN)  # default 'pre' padding, matching training
196
+ #PREDICTION
197
+ prediction = np.ravel(model.predict(input_padded))
198
+ return {'sequence': input_text,'labels': candidate_labels,'scores': list(prediction)}
199
 
200
+
201
+ predictions = []
202
+
203
+ for i, text in tqdm(enumerate(test_dataset["quote"])):
204
+
205
+ result = classifier(text, candidate_labels)
206
 
207
+ # Get index of highest scoring label
208
+
209
+ pred_label = int(np.argmax(result["scores"]))
210
+
211
+ predictions.append(pred_label)
212
+
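For clarity, the label selection above reduces to an argmax over the eight per-class scores returned by classifier (scores below are invented):

import numpy as np

example_scores = [0.02, 0.10, 0.05, 0.61, 0.08, 0.04, 0.06, 0.09]  # invented scores
print(int(np.argmax(example_scores)))  # 3, i.e. the index into candidate_labels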
213
  # Stop tracking emissions
214
  emissions_data = tracker.stop_task()
215
 
216
+ # Calculate accuracy
217
+ accuracy = accuracy_score(true_labels, predictions)
218
+
219
 
220
  # Prepare results dictionary
221
  results = {