laureBe committed
Commit 8963d03 · verified · 1 Parent(s): 24930f7

Update tasks/text.py

Files changed (1)
  1. tasks/text.py +154 -121
tasks/text.py CHANGED
@@ -2,27 +2,30 @@ from fastapi import APIRouter
2
  from datetime import datetime
3
  from datasets import load_dataset
4
  from sklearn.metrics import accuracy_score
5
- from sklearn.linear_model import LogisticRegression
6
- from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.model_selection import train_test_split
8
  import pandas as pd
9
- import tensorflow as tf
10
- from transformers import DistilBertTokenizer
11
- from transformers import TFDistilBertForSequenceClassification
12
- from transformers import logging
13
  from .utils.evaluation import TextEvaluationRequest
14
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
15
  import os
16
  import re
17
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
18
- os.environ['TF_ENABLE_ONEDNN_OPTS'] ='0'
19
- logging.set_verbosity_error()
20
- logging.set_verbosity_warning()
21
 
22
 
23
  router = APIRouter()
24
 
25
- DESCRIPTION = "DistilBert classification"
26
  ROUTE = "/text"
27
 
28
  @router.post(ROUTE, tags=["Text Task"],
@@ -59,130 +62,160 @@ async def evaluate_text(request: TextEvaluationRequest):
59
 
60
 
61
  # Split dataset
62
- train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
63
- test_dataset = train_test["test"]
64
- train_dataset = train_test["train"]
65
- y_train=train_dataset['label']
66
 
 
 
67
  train_dataset = train_test["train"]
68
- tn=pd.DataFrame([(i, j, k) for i,j,k in zip(train_dataset["quote"] , train_dataset["source"],
69
- train_dataset["subsource"])], columns=['quote','source', 'subsource'])
70
  test_dataset = train_test["test"]
71
- tt=pd.DataFrame([(i, j, k) for i,j,k in zip(test_dataset["quote"] , test_dataset["source"],
72
- test_dataset["subsource"])], columns=['quote','source', 'subsource'])
73
- tt.fillna("",inplace=True)
74
- tn.fillna("",inplace=True)
75
-
76
- tn['text'] = tn[['quote', 'source','subsource']].agg(' '.join, axis=1)
77
- tt['text'] = tn[['quote', 'source','subsource']].agg(' '.join, axis=1)
78
-
79
- def clean_text(x):
80
- pattern = r'[^a-zA-z0-9\s]'
81
- text = re.sub(pattern, '', x)
82
- return x
83
-
84
- def clean_numbers(x):
85
- if bool(re.search(r'\d', x)):
86
- x = re.sub('[0-9]{5,}', '#####', x)
87
- x = re.sub('[0-9]{4}', '####', x)
88
- x = re.sub('[0-9]{3}', '###', x)
89
- x = re.sub('[0-9]{2}', '##', x)
90
- return x
91
-
92
- contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
93
-
94
- def _get_contractions(contraction_dict):
95
- contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
96
- return contraction_dict, contraction_re
97
-
98
- contractions, contractions_re = _get_contractions(contraction_dict)
99
-
100
- def replace_contractions(text):
101
- def replace(match):
102
- return contractions[match.group(0)]
103
- return contractions_re.sub(replace, text)
104
-
105
- train_dataset_df = tn['quote'].apply(lambda x: x.lower())
106
- test_dataset_df = tt['quote'].apply(lambda x: x.lower())
107
-
108
- # Clean the text
109
- train_dataset_df = train_dataset_df.apply(lambda x: clean_text(x))
110
- test_dataset_df= test_dataset_df.apply(lambda x: clean_text(x))
111
-
112
- # Clean numbers
113
- train_dataset_df= train_dataset_df.apply(lambda x: clean_numbers(x))
114
- test_dataset_df = test_dataset_df.apply(lambda x: clean_numbers(x))
115
-
116
- # Clean Contractions
117
- train_dataset_df = train_dataset_df.apply(lambda x: replace_contractions(x))
118
- test_dataset_df = test_dataset_df.apply(lambda x: replace_contractions(x))
119
-
120
- # Encoding
121
- y_train_df=pd.DataFrame(train_dataset['label'], columns=['label'])
122
- y_test_df=pd.DataFrame(test_dataset['label'], columns=['label'])
123
- y_train_encoded = y_train_df['label'].astype('category').cat.codes
124
- y_test_encoded = y_test_df['label'].astype('category').cat.codes
125
- train_labels = y_train_encoded.to_list()
126
- test_labels=y_test_encoded.to_list()
127
-
128
- # Tokenize
129
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
130
- train_encodings = tokenizer(train_dataset_df.to_list(), truncation=True, padding=True)
131
- val_encodings = tokenizer(test_dataset_df.to_list(), truncation=True, padding=True)
132
-
133
- # Slicing
134
- train_dataset_bert = tf.data.Dataset.from_tensor_slices((
135
- dict(train_encodings),
136
- train_labels
137
- ))
138
- val_dataset_bert = tf.data.Dataset.from_tensor_slices((
139
- dict(val_encodings),
140
- test_labels
141
- ))
142
-
143
- model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)
144
-
145
 
146
  # Start tracking emissions
147
  tracker.start()
148
  tracker.start_task("inference")
149
 
150
- optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
151
- model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])
152
- #--------------------------------------------------------------------------------------------
153
- # YOUR MODEL INFERENCE CODE HERE
154
- # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
155
- #--------------------------------------------------------------------------------------------
156
-
157
  # Make predictions (placeholder for actual model inference)
158
-
159
- early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
160
-
161
- model.fit(train_dataset_bert.shuffle(1000).batch(16),epochs=2,batch_size=16,validation_data=val_dataset_bert.shuffle(1000).batch(16),callbacks=[early_stopping])
162
- #--------------------------------------------------------------------------------------------
163
- # YOUR MODEL INFERENCE STOPS HERE
164
- #--------------------------------------------------------------------------------------------
165
 
166
 
167
  # Stop tracking emissions
168
  emissions_data = tracker.stop_task()
169
 
170
-
171
- # Calculate accuracy
172
- def predict_category(text):
173
- predict_input =tokenizer.encode(text,
174
- truncation=True,
175
- padding=True,
176
- return_tensors="tf")
177
- output = model(predict_input)[0]
178
- prediction_value = tf.argmax(output, axis=1).numpy()[0]
179
- return prediction_value
180
- # - - - - - - - - - - - - - - - - - - - - - - - - - -
181
- y_pred = []
182
- for text_ in test_dataset_df.to_list():
183
- y_pred.append(predict_category(text_))
184
-
185
- accuracy_score(test_labels, y_pred)
186
 
187
  # Prepare results dictionary
188
  results = {
 
2
  from datetime import datetime
3
  from datasets import load_dataset
4
  from sklearn.metrics import accuracy_score
 
 
5
  from sklearn.model_selection import train_test_split
6
  import pandas as pd
7
+ import numpy as np
 
 
 
8
  from .utils.evaluation import TextEvaluationRequest
9
  from .utils.emissions import tracker, clean_emissions_data, get_space_info
10
  import os
11
  import re
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+ from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
15
+ from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
16
+ import tensorflow as tf
17
+ import tensorflow.keras as keras
18
+ from tensorflow.keras.preprocessing.text import Tokenizer
19
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
20
+ from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
21
+ from tensorflow.keras.models import Model, Sequential
22
+ from tensorflow.keras.layers import Convolution1D
23
+ from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
24
 
25
 
26
  router = APIRouter()
27
 
28
+ DESCRIPTION = "Attention BiLSTM classification"
29
  ROUTE = "/text"
30
 
31
  @router.post(ROUTE, tags=["Text Task"],
 
62
 
63
 
64
  # Split dataset
 
 
 
 
65
 
66
+ train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
67
+
68
  train_dataset = train_test["train"]
 
 
69
  test_dataset = train_test["test"]
70
+
71
+ import nltk
72
+ nltk.download('stopwords')
73
+ nltk.download('wordnet')
74
+
75
+ import re
76
+ from nltk.stem import WordNetLemmatizer
77
+ from nltk.corpus import stopwords
78
+
79
+ stop_words = set(stopwords.words("english"))
80
+ lemmatizer = WordNetLemmatizer()
81
+
82
+
83
+ def clean_text(text):
84
+ text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
85
+ text = text.lower()
86
+ text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
87
+ text = [lemmatizer.lemmatize(token, "v") for token in text]
88
+ text = [word for word in text if not word in stop_words]
89
+ text = " ".join(text)
90
+ return text
91
+
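For reference, a minimal usage sketch of the cleaning step defined above (the quote is invented for illustration; it assumes the nltk downloads and the clean_text definition above have already run):

sample_quote = "The glaciers aren't melting, it's just a natural cycle!"  # invented example
print(clean_text(sample_quote))
# Prints a lowercased, punctuation-free, lemmatized string with English stopwords removed.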
92
+ train_df= pd.DataFrame(train_dataset["quote"], columns=['quote'])
93
+ train_df['clean_text'] = train_df['quote'].map(clean_text)
94
+ train_df['length_clean_text'] = train_df['clean_text'].map(len)
95
+
96
+ MAX_FEATURES = 6000
97
+ EMBED_SIZE = 28
98
+ RNN_CELL_SIZE = 32
99
+ MAX_LEN = 30
100
+ BATCH_SIZE = 100
101
+ EPOCHS = 30
102
+
103
+ tokenizer = Tokenizer(num_words=MAX_FEATURES)
104
+ tokenizer.fit_on_texts(train_df['clean_text'])
105
+ list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])
106
+ X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
107
+ true_labels = test_dataset["label"]
108
+ y_train = train_dataset["label"]
109
+
110
+
111
+ X_train_np = np.array(X_train)
112
+ y_train_np = np.array(y_train)
113
+
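For reference, a self-contained toy run of the Tokenizer / pad_sequences step above (sentences and sizes are invented; only the shapes matter):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

toy_texts = ["climate change is real", "climate science is unreliable"]  # invented examples
toy_tokenizer = Tokenizer(num_words=50)
toy_tokenizer.fit_on_texts(toy_texts)
toy_seqs = toy_tokenizer.texts_to_sequences(toy_texts)  # lists of word indices
toy_padded = pad_sequences(toy_seqs, maxlen=6)          # zero-padded on the left by default
print(toy_padded.shape)                                 # (2, 6)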
114
+ # Attention Layer
115
+
116
+ class Attention(tf.keras.Model):
117
+ def __init__(self, units):
118
+ super(Attention, self).__init__()
119
+ self.W1 = tf.keras.layers.Dense(units)
120
+ self.W2 = tf.keras.layers.Dense(units)
121
+ self.V = tf.keras.layers.Dense(1)
122
+
123
+ def call(self, features, hidden):
124
+ # hidden shape == (batch_size, hidden size)
125
+ # hidden_with_time_axis shape == (batch_size, 1, hidden size)
126
+ # we are doing this to perform addition to calculate the score
127
+ hidden_with_time_axis = tf.expand_dims(hidden, 1)
128
+
129
+ # score shape == (batch_size, max_length, 1)
130
+ # we get 1 at the last axis because we are applying score to self.V
131
+ # the shape of the tensor before applying self.V is (batch_size, max_length, units)
132
+ score = tf.nn.tanh(
133
+ self.W1(features) + self.W2(hidden_with_time_axis))
134
+
135
+ # attention_weights shape == (batch_size, max_length, 1)
136
+ attention_weights = tf.nn.softmax(self.V(score), axis=1)
137
+
138
+ # context_vector shape after sum == (batch_size, hidden_size)
139
+ context_vector = attention_weights * features
140
+ context_vector = tf.reduce_sum(context_vector, axis=1)
141
+
142
+ return context_vector, attention_weights
143
+
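A minimal shape check for the Attention layer above (assumes the class definition above is in scope; tensor sizes are arbitrary and only illustrate the expected shapes):

import tensorflow as tf

batch, timesteps, hidden = 4, 30, 64                     # arbitrary sizes
features = tf.random.normal((batch, timesteps, hidden))  # stands in for the BiLSTM outputs
state = tf.random.normal((batch, hidden))                # stands in for the concatenated final state
context_vector, attention_weights = Attention(10)(features, state)
print(context_vector.shape)     # (4, 64)
print(attention_weights.shape)  # (4, 30, 1)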
144
+ # Model
145
+
146
+ sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
147
+ embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
148
+
149
+ lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)
150
 
151
+
152
+ (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)
153
+
154
+ state_h = Concatenate()([forward_h, backward_h])
155
+ state_c = Concatenate()([forward_c, backward_c])
156
+
157
+ context_vector, attention_weights = Attention(10)(lstm, state_h)
158
+
159
+ dense1 = Dense(20, activation="relu")(context_vector)
160
+ dropout = Dropout(0.05)(dense1)
161
+ output = Dense(8, activation="sigmoid")(dropout)
162
+
163
+ model = keras.Model(inputs=sequence_input, outputs=output)
164
+
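A quick smoke test of the assembled model (sketch only; assumes MAX_LEN and model as defined above):

import numpy as np

dummy_batch = np.zeros((2, MAX_LEN), dtype="int32")  # two all-padding sequences
dummy_scores = model.predict(dummy_batch)
print(dummy_scores.shape)  # (2, 8): one score per output class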
165
+ # Compile
166
+
167
+ from keras.callbacks import EarlyStopping
168
+ from keras import backend
169
+
170
+ optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
171
+
172
  # Start tracking emissions
173
  tracker.start()
174
  tracker.start_task("inference")
175
 
176
+ model.compile(loss='SparseCategoricalCrossentropy', optimizer=optimizer, metrics=['accuracy'])
177
+
178
+ history = model.fit(X_train_np,y_train_np, shuffle=False,batch_size=BATCH_SIZE, verbose=1,epochs=EPOCHS)
179
+
 
 
 
180
  # Make predictions (placeholder for actual model inference)
181
+ candidate_labels = [
182
+ "Not related to climate change disinformation",
183
+ "Climate change is not real and not happening",
184
+ "Climate change is not human-induced",
185
+ "Climate change impacts are not that bad",
186
+ "Climate change solutions are harmful and unnecessary",
187
+ "Climate change science is unreliable",
188
+ "Climate change proponents are biased",
189
+ "Fossil fuels are needed to address climate change"
190
+ ]
191
+ def classifier(input_text,candidate_labels):
192
+ #PREPROCESS THE INPUT TEXT
193
+ input_text_cleaned = clean_text(input_text)
194
+ input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
195
+ input_padded = pad_sequences(input_sequence, maxlen=MAX_LEN)  # default 'pre' padding, matching training
196
+ #PREDICTION
197
+ prediction = np.ravel(model.predict(input_padded))
198
+ return {'sequence': input_text,'labels': candidate_labels,'scores': list(prediction)}
199
 
200
+
201
+ predictions = []
202
+
203
+ for i, text in tqdm(enumerate(test_dataset["quote"])):
204
+
205
+ result = classifier(text, candidate_labels)
206
 
207
+ # Get index of highest scoring label
208
+
209
+ pred_label = int(np.argmax(result["scores"]))
210
+
211
+ predictions.append(pred_label)
212
+
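For clarity, the label selection above reduces to an argmax over the eight per-class scores returned by classifier (scores below are invented):

import numpy as np

example_scores = [0.02, 0.10, 0.05, 0.61, 0.08, 0.04, 0.06, 0.09]  # invented scores
print(int(np.argmax(example_scores)))  # 3, i.e. the index into candidate_labels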
213
  # Stop tracking emissions
214
  emissions_data = tracker.stop_task()
215
 
216
+ # Calculate accuracy
217
+ accuracy = accuracy_score(true_labels, predictions)
218
+
219
 
220
  # Prepare results dictionary
221
  results = {