baseline

Sleeping

App Files Files Community

laureBe commited on Jan 29

Commit

2aa2667

verified ·

1 Parent(s): f8008ee

Update tasks/text.py

Browse files

Files changed (1) hide show

tasks/text.py +31 -137

tasks/text.py CHANGED Viewed

@@ -10,22 +10,14 @@ from .utils.emissions import tracker, clean_emissions_data, get_space_info
 import os
 import re
 import pandas as pd
-from tqdm import tqdm
-from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
-from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
 import tensorflow as tf
-import tensorflow.keras as keras
-from tensorflow.keras.preprocessing.text import Tokenizer
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
-from tensorflow.keras.models import Model, Sequential
-from tensorflow.keras.layers import Convolution1D
-from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
 router = APIRouter()
-DESCRIPTION = "Attention GRU classification"
 ROUTE = "/text"
 @router.post(ROUTE, tags=["Text Task"],
@@ -68,147 +60,49 @@ async def evaluate_text(request: TextEvaluationRequest):
     train_dataset = train_test["train"]
     test_dataset = train_test["test"]
-    import nltk
-    nltk.download('stopwords')
-    nltk.download('wordnet')
-    import re
-    from nltk.stem import WordNetLemmatizer
-    from nltk.corpus import stopwords
-    stop_words = set(stopwords.words("english"))
-    lemmatizer = WordNetLemmatizer()
-    def clean_text(text):
-        text = re.sub(r'[^\w\s]','',text, re.UNICODE)
-        text = text.lower()
-        text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
-        text = [lemmatizer.lemmatize(token, "v") for token in text]
-        text = [word for word in text if not word in stop_words]
-        text = " ".join(text)
-        return text
-    train_df= pd.DataFrame(train_dataset["quote"], columns=['quote'])
-    train_df['clean_text'] = train_df.map(clean_text)
-    train_df['length_clean_text'] = train_df['clean_text'].map(len)
-    MAX_FEATURES = 6000
-    EMBED_SIZE = 28
-    RNN_CELL_SIZE = 32
-    MAX_LEN = 30
-    BATCH_SIZE = 100
-    EPOCHS = 30
-    tokenizer = Tokenizer(num_words=MAX_FEATURES)
-    tokenizer.fit_on_texts(train_df['clean_text'])
-    list_tokenized_train = tokenizer.texts_to_sequences(train_df['clean_text'])
-    X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
     true_labels = test_dataset["label"]
     y_train = train_dataset["label"]
-    X_train_np = np.array(X_train)
-    y_train_np = np.array(y_train)
-    # Attention Layer
-    class Attention(tf.keras.Model):
-        def __init__(self, units):
-            super(Attention, self).__init__()
-            self.W1 = tf.keras.layers.Dense(units)
-            self.W2 = tf.keras.layers.Dense(units)
-            self.V = tf.keras.layers.Dense(1)
-        def call(self, features, hidden):
-            # hidden shape == (batch_size, hidden size)
-            # hidden_with_time_axis shape == (batch_size, 1, hidden size)
-            # we are doing this to perform addition to calculate the score
-            hidden_with_time_axis = tf.expand_dims(hidden, 1)
-            # score shape == (batch_size, max_length, 1)
-            # we get 1 at the last axis because we are applying score to self.V
-            # the shape of the tensor before applying self.V is (batch_size, max_length, units)
-            score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
-            # attention_weights shape == (batch_size, max_length, 1)
-            attention_weights = tf.nn.softmax(self.V(score), axis=1)
-            # context_vector shape after sum == (batch_size, hidden_size)
-            context_vector = attention_weights * features
-            context_vector = tf.reduce_sum(context_vector, axis=1)
-            return context_vector, attention_weights
     # Model
-    sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
-    embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)
-    lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)
-    (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)
-    state_h = Concatenate()([forward_h, backward_h])
-    state_c = Concatenate()([forward_c, backward_c])
-    context_vector, attention_weights = Attention(10)(lstm, state_h)
-    dense1 = Dense(20, activation="relu")(context_vector)
-    dropout = Dropout(0.05)(dense1)
-    output = Dense(8, activation="sigmoid")(dropout)
-    model = keras.Model(inputs=sequence_input, outputs=output)
-    # Compile
-    from keras.callbacks import EarlyStopping
-    from keras import backend
-    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
-    model.compile(loss='SparseCategoricalCrossentropy', optimizer=optimizer, metrics=['accuracy'])
-    history = model.fit(X_train_np,y_train_np, shuffle=False,batch_size=BATCH_SIZE, verbose=1,epochs=EPOCHS)
-    # Make  predictions (placeholder for actual model inference)
-    candidate_labels = [
-        "Not related to climate change disinformation",
-        "Climate change is not real and not happening",
-        "Climate change is not human-induced",
-        "Climate change impacts are not that bad",
-        "Climate change solutions are harmful and unnecessary",
-        "Climate change science is unreliable",
-        "Climate change proponents are biased",
-        "Fossil fuels are needed to address climate change"
-    ]
-    def classifier(input_text,candidate_labels):
-        #PREPROCESS THE INPUT TEXT
-        input_text_cleaned = clean_text(input_text)
-        input_sequence = tokenizer.texts_to_sequences([input_text_cleaned])
-        input_padded = pad_sequences(input_sequence, maxlen = MAX_LEN, padding = 'post')
-        #PREDICTION
-        prediction = np.ravel(model.predict(input_padded))
-        return {'sequence': input_text,'labels': candidate_labels,'scores': list(prediction)}
-    predictions = []
-    for i, text in tqdm(enumerate(test_dataset["quote"])):
-        result = classifier(text, candidate_labels)
-        # Get index of highest scoring label
-        pred_label = candidate_labels.index(result["labels"][0])
-        predictions.append(pred_label)
     # Stop tracking emissions
     emissions_data = tracker.stop_task()

 import os
 import re
 import pandas as pd
 import tensorflow as tf
+from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
 router = APIRouter()
+DESCRIPTION = " XGBOOST classification"
 ROUTE = "/text"
 @router.post(ROUTE, tags=["Text Task"],
     train_dataset = train_test["train"]
     test_dataset = train_test["test"]
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.model_selection import train_test_split
+    from sklearn import metrics
+    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+    from sklearn.tree import DecisionTreeClassifier
+    from datetime import datetime
+    from sklearn.feature_extraction.text import CountVectorizer
+    tfidf_vect = TfidfVectorizer(stop_words = 'english')
+    tfidf_train = tfidf_vect.fit_transform(train_dataset['quote'])
+    tfidf_train = tfidf_vect.transform(train_dataset['quote'])
+    tfidf_test = tfidf_vect.fit_transform(test_dataset['quote'])
+    tfidf_test = tfidf_vect.transform(test_dataset['quote'])
     true_labels = test_dataset["label"]
     y_train = train_dataset["label"]
+    y_test = test_dataset["label"]
     # Model
+    import xgboost as xgb
+    #Parameters: {'colsample_bytree': 0.7039283369765, 'gamma': 0.3317686860083553, 'learning_rate': 0.08341079006092542, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.6594650911012452}
+    #Parameters: {'colsample_bytree': 0.7039283369765, 'gamma': 0.3317686860083553, 'learning_rate': 0.08341079006092542, 'max_depth': 5, 'n_estimators': 140, 'subsample': 0.6594650911012452}
+    #Parameters: {'colsample_bytree': 0.7498850106268238, 'gamma': 0.3690168082131852, 'learning_rate': 0.054839600377537934, 'max_depth': 5, 'n_estimators': 125, 'subsample': 0.6272998821416366}
+    #xgb_model = xgb.XGBRegressor(max_depth=5, objective='multi:softprob',
+                                 n_estimators=125, num_class=8, colsample_bytree=0.7498850106268238,gamma=0.3690168082131852,
+                                 learning_rate=0.054839600377537934, subsample=0.6272998821416366)
+    #xgb_model.fit(tfidf_train, y_train)
+    #y_pred = xgb_model.predict(tfidf_train)
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
+    xgb_model = xgb.XGBRegressor(max_depth=6, objective='multi:softprob',
+                             n_estimators=500, num_class=8, colsample_bytree=0.75,gamma=0.35,
+                             learning_rate=0.06, subsample=0.63)
+    xgb_model.fit(tfidf_test, y_test)
+    predictions = np.argmax(xgb_model.predict(tfidf_test), axis=1)
     # Stop tracking emissions
     emissions_data = tracker.stop_task()