Spaces:

mo01018
/

Deployment-Trial

Sleeping

App Files Files Community

mo01018 committed 27 days ago

Commit

74c2449

verified ·

1 Parent(s): 9bb28fd

Upload 3 files

Browse files

Files changed (3) hide show

app.py +121 -0
customFunctions.py +470 -0
performance_test.py +64 -0

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+from flask import Flask, render_template, request, redirect, url_for
+from joblib import load
+import pandas as pd
+import re
+from customFunctions import *
+import json
+import datetime
+pd.set_option('display.max_colwidth', 1000)
+# Registry of the selectable models. Each entry holds a numeric id (compared
+# with the submitted `pipeline_select` form value), a display name, and the
+# pipeline object that joblib deserialises from disk when this module is imported.
+PIPELINES = [
+    {
+        'id': 1,
+        'name': 'Baseline',
+        'pipeline': load("pipeline_ex1_s1.joblib")
+    },
+    {
+        'id': 2,
+        'name': 'Trained on a FeedForward NN',
+        'pipeline': load("pipeline_ex1_s2.joblib")
+    },
+    {
+        'id': 3,
+        'name': 'Trained on a CRF',
+        'pipeline': load("pipeline_ex1_s3.joblib")
+    },
+    # The entries below (ids 4-7) are disabled: they are commented out, so
+    # their joblib files are not loaded and they are not offered to the user.
+    #{
+     #   'id': 4,
+      #  'name': 'Trained on a small dataset',
+       # 'pipeline': load("pipeline_ex2_s1.joblib")
+    #},
+    #{
+     #   'id': 5,
+      #  'name': 'Trained on a large dataset',
+       # 'pipeline': load("pipeline_ex2_s2.joblib")
+    #},
+    #{
+     #   'id': 6,
+      #  'name': 'Embedded using TFIDF',
+       # 'pipeline': load("pipeline_ex3_s1.joblib")
+    #},
+    #{
+     #   'id': 7,
+      #  'name': 'Embedded using ?',
+       # 'pipeline': load("pipeline_ex3_s2.joblib")
+    #},
+]
+# Id/name pairs only (no model objects); this is what gets passed to the template.
+pipeline_metadata = [{'id': p['id'], 'name': p['name']} for p in PIPELINES]
+def get_pipeline_by_id(pipelines, pipeline_id):
+    """Return the 'pipeline' value of the first entry whose 'id' equals pipeline_id, or None."""
+    for entry in pipelines:
+        if entry['id'] == pipeline_id:
+            return entry['pipeline']
+    return None
+def get_name_by_id(pipelines, pipeline_id):
+    """Return the 'name' value of the first entry whose 'id' equals pipeline_id, or None."""
+    for entry in pipelines:
+        if entry['id'] == pipeline_id:
+            return entry['name']
+    return None
+def requestResults(text, pipeline):
+    """Run `pipeline.predict` on `text` and decode the numeric tags to label strings.
+
+    Args:
+        text: the input handed unchanged to `pipeline.predict`.
+        pipeline: any object exposing `predict`.
+
+    Returns:
+        The array produced by `LabelEncoder.inverse_transform`, one label
+        ('B-AC', 'B-LF', 'I-LF' or 'O') per predicted tag.
+    """
+    labels = pipeline.predict(text)
+    # Some models return a flat 1-D array, others one tag sequence per
+    # sentence (possibly as plain lists, which have no `.ndim`). Flatten one
+    # level so both shapes end up as a single flat list of tags.
+    flat_labels = []
+    for item in labels:
+        if isinstance(item, (str, bytes)) or not hasattr(item, '__iter__'):
+            flat_labels.append(item)
+        else:
+            flat_labels.extend(item)
+    flat_labels = [int(label) for label in flat_labels]
+    # The encoder sorts its classes, so fitting on the fixed tag set gives a
+    # deterministic integer -> tag mapping.
+    tag_encoder = LabelEncoder()
+    tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
+    return tag_encoder.inverse_transform(flat_labels)
+LOG_FILE = "usage_log.jsonl"  # Each line is a JSON object
+def log_interaction(user_input, model_name, predictions):
+    """Append one usage record to LOG_FILE as a single JSON line.
+
+    Args:
+        user_input: the raw text the user submitted.
+        model_name: display name of the pipeline that was used.
+        predictions: JSON-serialisable mapping of the results to record.
+    """
+    log_entry = {
+        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
+        # returns a naive value that does not say which zone it is in.
+        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
+        "user_input": user_input,
+        "model": model_name,
+        "predictions": predictions
+    }
+    # Explicit encoding so the log does not depend on the platform default.
+    with open(LOG_FILE, "a", encoding="utf-8") as f:
+        f.write(json.dumps(log_entry) + "\n")
+# The Flask application object; the routes below are registered on it.
+app = Flask(__name__)
+@app.route('/')
+def index():
+    """Render index.html with the list of selectable pipelines (ids and names only)."""
+    return render_template('index.html', pipelines= pipeline_metadata)
+@app.route('/', methods=['POST'])
+def get_data():
+    """Tag the submitted text with the selected pipeline and render the result.
+
+    Reads the 'search' and 'pipeline_select' form fields, tokenises the text
+    into word and punctuation tokens, predicts one label per token, logs the
+    interaction and renders index.html with the results.
+
+    Returns:
+        The rendered page, or a plain-text 400 response when the pipeline
+        selection is not an integer or does not match a known pipeline.
+    """
+    text = request.form['search']
+    try:
+        pipeline_id = int(request.form['pipeline_select'])
+    except ValueError:
+        return "Invalid pipeline selection", 400
+    pipeline = get_pipeline_by_id(PIPELINES, pipeline_id)
+    if pipeline is None:
+        return "Unknown pipeline", 400
+    name = get_name_by_id(PIPELINES, pipeline_id)
+    # Words, or single non-space punctuation characters.
+    tokens = re.findall(r"\w+|[^\w\s]", text)
+    if not tokens:
+        # Nothing to tag (empty or whitespace-only input): skip the model.
+        return render_template('index.html', results={}, name=name, pipelines= pipeline_metadata)
+    # The pipelines take a Series of sentences, each itself a Series of tokens.
+    tokens_formatted = pd.Series([pd.Series(tokens)])
+    labels = requestResults(tokens_formatted, pipeline)
+    # NOTE(review): a dict keeps one entry per distinct token, so a token that
+    # occurs more than once only retains its last label. Left as a dict because
+    # that is what the template is given.
+    results = dict(zip(tokens, labels))
+    log_interaction(text, name, results)
+    return render_template('index.html', results=results, name=name, pipelines= pipeline_metadata)
+if __name__ == '__main__':
+    # When run directly as a script, serve on all interfaces on port 7860.
+    app.run(host="0.0.0.0", port=7860)

customFunctions.py ADDED Viewed

	@@ -0,0 +1,470 @@

+import pandas as pd
+import numpy as np
+import random
+import torch
+import torch.nn as nn
+import torch.optim as optim
+#from transformers import BertTokenizer, BertModel
+from sklearn.metrics import accuracy_score, f1_score, classification_report
+import sklearn_crfsuite
+from sklearn_crfsuite import metrics
+from sklearn.metrics.pairwise import cosine_similarity
+from gensim.models import Word2Vec
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from torch.utils.data import Dataset, DataLoader
+from torch.nn.utils.rnn import pad_sequence
+from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
+from sklearn.feature_extraction.text import TfidfVectorizer
+EMBEDDING_DIM = 100  # default width of a word vector (default `embedding_dim` of the Word2Vec transformers)
+PAD_VALUE = -1       # fill value used when padding sequences; also the label index the loss ignores
+MAX_LENGTH = 376
+BATCH_SIZE = 16      # batch size of the training DataLoaders
+class preprocess_sentences():
+    """Pipeline step that converts a Series of token Series into nested Python lists."""
+    def __init__(self):
+        pass
+    def fit(self, X, y=None):
+        # Nothing to learn; only announces the step.
+        print('PREPROCESSING')
+        return self
+    def transform(self, X):
+        # One plain list of tokens per sentence.
+        as_lists = [tokens.tolist() for tokens in X]
+        print('--> Preprocessing complete \n', flush=True)
+        return as_lists
+class Word2VecTransformer():
+    """Pipeline step that fits a gensim Word2Vec model and embeds tokenised sentences.
+
+    fit() trains Word2Vec on the given sentences; transform() returns one
+    float32 tensor per sentence with one row per token.
+    """
+    def __init__(self, vector_size = 100, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
+        self.model = None
+        self.vector_size = vector_size
+        self.window = window
+        self.min_count = min_count
+        self.workers = workers
+        self.embedding_dim = embedding_dim
+    def fit(self, X, y=None):
+        """Train the Word2Vec model on X (an iterable of token lists); y is ignored."""
+        # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
+        # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
+        print('WORD2VEC:', flush=True)
+        self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
+                              , min_count=self.min_count, workers=self.workers)
+        print('--> Word2Vec Fitted', flush=True)
+        return self
+    def transform(self, X):
+        """Return a list with one (n_tokens, dim) float32 tensor per sentence in X."""
+        embedded_sentences = []
+        for sentence in X:
+            sentence_vectors = [self._vector_for(word) for word in sentence]
+            if sentence_vectors:
+                # Stack into one ndarray first: building a tensor straight
+                # from a list of ndarrays is very slow.
+                matrix = np.stack(sentence_vectors).astype(np.float32)
+            else:
+                # Keep empty sentences 2-D so every output has the same rank.
+                matrix = np.zeros((0, self.embedding_dim), dtype=np.float32)
+            embedded_sentences.append(torch.from_numpy(matrix))
+        print('--> Embeddings Complete \n', flush=True)
+        return embedded_sentences
+    def _vector_for(self, word):
+        """Return the trained vector for word, or a random normal vector if it is unknown."""
+        if word in self.model.wv:
+            return self.model.wv[word]
+        return np.random.normal(scale=0.6, size=(self.embedding_dim,))
+class Word2VecTransformer_CRF():
+    """Pipeline step producing per-token feature dicts (surface features plus Word2Vec values).
+
+    fit() trains a gensim Word2Vec model; transform() returns, for each
+    sentence, a list with one feature dict per token.
+    """
+    def __init__(self, vector_size = 100, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
+        self.model = None
+        self.vector_size = vector_size
+        self.window = window
+        self.min_count = min_count
+        self.workers = workers
+        self.embedding_dim = embedding_dim
+    def fit(self, X, y=None):
+        """Train the Word2Vec model on X (an iterable of token lists); y is ignored."""
+        # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
+        # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
+        print('WORD2VEC:', flush=True)
+        self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
+                              , min_count=self.min_count, workers=self.workers)
+        print('--> Word2Vec Fitted', flush=True)
+        return self
+    def transform(self, X):
+        """Return, per sentence, a list of feature dicts (one dict per token)."""
+        embedded_sentences = []
+        for sentence in X:
+            sentence_features = []
+            for word in sentence:
+                features = {
+                    'bias': 1.0,
+                    'word.lower()': word.lower(),
+                    'word[-3:]': word[-3:],
+                    'word[-2:]': word[-2:],
+                    'word.isupper()': word.isupper(),
+                    'word.istitle()': word.istitle(),
+                    'word.isdigit()': word.isdigit(),
+                }
+                if word in self.model.wv:
+                    vec = self.model.wv[word]
+                else:
+                    # Unknown word: fall back to a random normal vector.
+                    vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
+                # https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
+                # One numeric feature per embedding component, stored as a
+                # plain Python float rather than a numpy scalar.
+                for index, value in enumerate(vec):
+                    features[f"embedding_{index}"] = float(value)
+                sentence_features.append(features)
+            embedded_sentences.append(sentence_features)
+        print('--> Embeddings Complete \n', flush=True)
+        return embedded_sentences
+class tfidf(BaseEstimator, TransformerMixin):
+    """Pipeline step that embeds each token as a one-hot style TF-IDF vector.
+
+    fit() learns the vocabulary and IDF weights with TfidfVectorizer;
+    transform() returns one (n_tokens, vocab_size) float32 tensor per
+    sentence, where a token's row is zero except at its vocabulary index.
+    """
+    def __init__(self):
+        self.model = None
+        self.embedding_dim = None
+        self.idf = None
+        self.vocab_size = None
+        self.vocab = None
+    def fit(self, X, y = None):
+        """Fit the vectorizer on X (an iterable of token lists); y is ignored."""
+        print('TFIDF:', flush=True)
+        joined_sentences = [' '.join(tokens) for tokens in X]
+        self.model = TfidfVectorizer()
+        self.model.fit(joined_sentences)
+        self.vocab = self.model.vocabulary_
+        self.idf = self.model.idf_
+        self.vocab_size = len(self.vocab)
+        self.embedding_dim = self.vocab_size
+        print('--> TFIDF Fitted', flush=True)
+        return self
+    def transform(self, X):
+        """Return a list with one (n_tokens, vocab_size) float32 tensor per sentence in X."""
+        embedded = []
+        for sentence in X:
+            token_counts = {}
+            for word in sentence:
+                token_counts[word] = token_counts.get(word, 0) + 1
+            sent_len = len(sentence)
+            # Allocate the whole sentence matrix once instead of stacking one
+            # dense vector per token; also keeps empty sentences 2-D.
+            sent_matrix = np.zeros((sent_len, self.vocab_size), dtype=np.float32)
+            for position, word in enumerate(sentence):
+                # NOTE(review): the lookup uses the token exactly as given;
+                # confirm this matches how the vectorizer normalised its vocabulary.
+                token_idx = self.vocab.get(word)
+                if token_idx is not None:
+                    tf = token_counts[word] / sent_len
+                    sent_matrix[position, token_idx] = tf * self.idf[token_idx]
+            embedded.append(torch.from_numpy(sent_matrix))
+        print('--> Embeddings Complete \n', flush=True)
+        return embedded
+class BiLSTM_NER(nn.Module):
+    """Bidirectional LSTM followed by a linear layer that scores every tag for every token."""
+    def __init__(self,input_dim, hidden_dim, tagset_size):
+        super().__init__()
+        # Inputs arrive already embedded, so there is no embedding layer here.
+        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
+        # Forward and backward hidden states are concatenated, hence 2 * hidden_dim.
+        self.fc = nn.Linear(2 * hidden_dim, tagset_size)
+    def forward(self, sentences):
+        # Only the per-step outputs are needed; the final (h, c) state is dropped.
+        per_token_states = self.lstm(sentences)[0]
+        return self.fc(per_token_states)
+# Token-wise feed-forward tagger
+class FeedForwardNN_NER(nn.Module):
+    """Two linear layers with a ReLU in between, applied independently to every token vector."""
+    def __init__(self, embedding_dim, hidden_dim, tagset_size):
+        super().__init__()
+        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(hidden_dim, tagset_size)
+    def forward(self, x):
+        # (batch_size, seq_length, embedding_dim) -> (batch_size, seq_length, hidden_dim)
+        hidden = self.relu(self.fc1(x))
+        # -> (batch_size, seq_length, tagset_size)
+        return self.fc2(hidden)
+def pad(batch):
+    """Collate a list of (X, y) pairs into two padded batch tensors.
+
+    Args:
+        batch: list of (features, tags) pairs, one per sentence.
+
+    Returns:
+        (X_padded, y_padded): float32 features and long tags, batch first,
+        with shorter sequences filled with PAD_VALUE.
+    """
+    X_batch, y_batch = zip(*batch)
+    # as_tensor reuses an existing tensor instead of copy-constructing it
+    # (torch.tensor on a tensor makes a copy and emits a warning).
+    X_batch = [torch.as_tensor(seq, dtype=torch.float32) for seq in X_batch]
+    y_batch = [torch.as_tensor(seq, dtype=torch.long) for seq in y_batch]
+    X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
+    y_padded = pad_sequence(y_batch, batch_first=True, padding_value=PAD_VALUE)
+    return X_padded, y_padded
+def pred_pad(batch):
+    """Collate a list of feature sequences into one padded float32 batch tensor (batch first)."""
+    # as_tensor reuses an existing tensor instead of copy-constructing it
+    # (torch.tensor on a tensor makes a copy and emits a warning).
+    X_batch = [torch.as_tensor(seq, dtype=torch.float32) for seq in batch]
+    return pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
+class Ner_Dataset(Dataset):
+    """Dataset pairing each item of X with the item of y at the same index."""
+    def __init__(self, X, y):
+        self.X = X
+        self.y = y
+    def __len__(self):
+        # The length is taken from X.
+        return len(self.X)
+    def __getitem__(self, idx):
+        features = self.X[idx]
+        target = self.y[idx]
+        return features, target
+class LSTM(BaseEstimator, ClassifierMixin):
+    """Estimator-style wrapper that trains and applies a BiLSTM_NER tagger.
+
+    Args:
+        embedding_dim: width of the input token vectors.
+        hidden_dim: LSTM hidden size.
+        epochs: number of training epochs.
+        learning_rate: Adam learning rate.
+        tag2idx: mapping of tags to indices; its length is the number of output tags.
+    """
+    def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
+        self.embedding_dim = embedding_dim
+        self.hidden_dim = hidden_dim
+        self.epochs = epochs
+        self.learning_rate = learning_rate
+        self.tag2idx = tag2idx
+    def fit(self, embedded, encoded_tags):
+        """Train on embedded sentences and their encoded tag sequences; returns self."""
+        print('LSTM:', flush=True)
+        data = Ner_Dataset(embedded, encoded_tags)
+        train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
+        self.model = self.train_LSTM(train_loader)
+        print('--> LSTM trained', flush=True)
+        return self
+    def predict(self, X):
+        """Return a flat 1-D array with the predicted tag index of every token in X."""
+        test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
+        self.model.eval()
+        # Send inputs to wherever the weights actually live, so a model
+        # loaded on the CPU still works on a machine that has a GPU.
+        device = next(self.model.parameters()).device
+        predictions = []
+        with torch.no_grad():
+            for X_batch in test_loader:
+                tag_scores = self.model(X_batch.to(device))
+                predicted_tags = tag_scores.argmax(dim=2)
+                predictions.append(predicted_tags.view(-1).cpu().numpy())
+        if not predictions:
+            # np.concatenate rejects an empty list.
+            return np.array([], dtype=np.int64)
+        return np.concatenate(predictions)
+    def train_LSTM(self, train_loader, input_dim=None, hidden_dim=None, epochs=None, learning_rate=None):
+        """Build and train a BiLSTM_NER on train_loader and return it.
+
+        Any argument left as None falls back to the value given to the
+        constructor, so the estimator's hyperparameters are honoured.
+        """
+        input_dim = self.embedding_dim if input_dim is None else input_dim
+        hidden_dim = self.hidden_dim if hidden_dim is None else hidden_dim
+        epochs = self.epochs if epochs is None else epochs
+        learning_rate = self.learning_rate if learning_rate is None else learning_rate
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        num_tags = len(self.tag2idx)
+        lstm_model = BiLSTM_NER(input_dim, hidden_dim=hidden_dim, tagset_size=num_tags)
+        lstm_model.to(device)
+        # Padded label positions carry PAD_VALUE and are excluded from the loss.
+        loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE)
+        optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)
+        print('--> Training LSTM')
+        for epoch in range(epochs):
+            total_loss = 0
+            total_correct = 0
+            total_words = 0
+            lstm_model.train()
+            for X_batch, y_batch in train_loader:
+                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
+                optimizer.zero_grad()
+                tag_scores = lstm_model(X_batch)
+                # Flatten to (tokens, tags) vs (tokens,) for the loss.
+                loss = loss_function(tag_scores.view(-1, num_tags), y_batch.view(-1))
+                loss.backward()
+                optimizer.step()
+                total_loss += loss.item()
+                # Token-level accuracy, ignoring padded positions.
+                flattened_pred = tag_scores.argmax(dim=2).view(-1)
+                flattened_true = y_batch.view(-1)
+                mask = flattened_true != PAD_VALUE
+                total_correct += (flattened_pred[mask] == flattened_true[mask]).sum().item()
+                total_words += mask.sum().item()
+            avg_loss = total_loss / max(len(train_loader), 1)
+            avg_accuracy = total_correct / total_words * 100 if total_words else 0.0  # Accuracy in percentage
+            print(f'    ==> Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
+        return lstm_model
+class FeedforwardNN(BaseEstimator, ClassifierMixin):
+    """Estimator-style wrapper that trains and applies a FeedForwardNN_NER tagger.
+
+    Args:
+        embedding_dim: width of the input token vectors.
+        hidden_dim: size of the hidden layer.
+        epochs: number of training epochs.
+        learning_rate: Adam learning rate.
+        tag2idx: mapping of tags to indices; its length is the number of output tags.
+    """
+    def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
+        self.embedding_dim = embedding_dim
+        self.hidden_dim = hidden_dim
+        self.epochs = epochs
+        self.learning_rate = learning_rate
+        self.tag2idx = tag2idx
+    def fit(self, embedded, encoded_tags):
+        """Train on embedded sentences and their encoded tag sequences; returns self."""
+        print('Feed Forward NN: ', flush=True)
+        data = Ner_Dataset(embedded, encoded_tags)
+        train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
+        self.model = self.train_FF(train_loader)
+        print('--> Feed Forward trained', flush=True)
+        return self
+    def predict(self, X):
+        """Return a flat 1-D array with the predicted tag index of every token in X."""
+        test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
+        self.model.eval()
+        # Send inputs to wherever the weights actually live, so a model
+        # loaded on the CPU still works on a machine that has a GPU.
+        device = next(self.model.parameters()).device
+        predictions = []
+        with torch.no_grad():
+            for X_batch in test_loader:
+                tag_scores = self.model(X_batch.to(device))
+                predicted_tags = tag_scores.argmax(dim=2)
+                predictions.append(predicted_tags.view(-1).cpu().numpy())
+        if not predictions:
+            # np.concatenate rejects an empty list.
+            return np.array([], dtype=np.int64)
+        return np.concatenate(predictions)
+    def train_FF(self, train_loader, input_dim=None, hidden_dim=None, epochs=None, learning_rate=None):
+        """Build and train a FeedForwardNN_NER on train_loader and return it.
+
+        Any argument left as None falls back to the value given to the
+        constructor, so the estimator's hyperparameters are honoured.
+        """
+        input_dim = self.embedding_dim if input_dim is None else input_dim
+        hidden_dim = self.hidden_dim if hidden_dim is None else hidden_dim
+        epochs = self.epochs if epochs is None else epochs
+        learning_rate = self.learning_rate if learning_rate is None else learning_rate
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        num_tags = len(self.tag2idx)
+        ff_model = FeedForwardNN_NER(input_dim, hidden_dim=hidden_dim, tagset_size=num_tags)
+        ff_model.to(device)
+        # Padded label positions carry PAD_VALUE and are excluded from the loss.
+        loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE)
+        optimizer = optim.Adam(ff_model.parameters(), lr=learning_rate)
+        print('--> Training FF')
+        for epoch in range(epochs):
+            total_loss = 0
+            total_correct = 0
+            total_words = 0
+            ff_model.train()
+            for X_batch, y_batch in train_loader:
+                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
+                optimizer.zero_grad()
+                tag_scores = ff_model(X_batch)
+                # Flatten to (tokens, tags) vs (tokens,) for the loss.
+                loss = loss_function(tag_scores.view(-1, num_tags), y_batch.view(-1))
+                loss.backward()
+                optimizer.step()
+                total_loss += loss.item()
+                # Token-level accuracy, ignoring padded positions.
+                flattened_pred = tag_scores.argmax(dim=2).view(-1)
+                flattened_true = y_batch.view(-1)
+                mask = flattened_true != PAD_VALUE
+                total_correct += (flattened_pred[mask] == flattened_true[mask]).sum().item()
+                total_words += mask.sum().item()
+            avg_loss = total_loss / max(len(train_loader), 1)
+            avg_accuracy = total_correct / total_words * 100 if total_words else 0.0  # Accuracy in percentage
+            print(f'    ==> Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
+        return ff_model
+# Module-level CRF estimator, created (unfitted) when this module is imported.
+# NOTE(review): c1/c2 are presumably the L1/L2 regularisation strengths —
+# confirm against the sklearn-crfsuite documentation.
+crf = sklearn_crfsuite.CRF(
+    algorithm='lbfgs',
+    c1=0.1,
+    c2=0.1,
+    max_iterations=100,
+    all_possible_transitions=True)

performance_test.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import requests
+import time
+from concurrent.futures import ThreadPoolExecutor
+import csv
+# Defaults; the __main__ section overrides both counts for each load level.
+NUM_REQUESTS = 5
+CONCURRENT_THREADS = 10
+# app.py serves on port 7860 when started directly, so target that port.
+URL = "http://localhost:7860/"
+def send_request(timeout=60):
+    """POST one fixed sample sentence to URL and time the round trip.
+
+    Args:
+        timeout: seconds to wait for the server before giving up, so a hung
+            server cannot block a worker thread forever.
+
+    Returns:
+        (status_code, elapsed_seconds). A request that raises is reported as
+        status 500 together with the time spent before it failed.
+    """
+    data = {
+        'search': "A MRI, magnetic resonance imaging, scan is a very useful diagnosis tool.",
+        'pipeline_select': '1'
+    }
+    start_time = time.perf_counter()
+    try:
+        response = requests.post(URL, data=data, timeout=timeout)
+    except requests.RequestException as e:
+        print("Request failed:", e)
+        # Keep the real elapsed time: recording 0 would distort min/avg.
+        return 500, time.perf_counter() - start_time  # Treat exceptions as failures
+    elapsed = time.perf_counter() - start_time
+    if response.status_code != 200:
+        print(f"Error {response.status_code}: {response.text[:100]}")
+    return response.status_code, elapsed
+def run_stress_test():
+    """Send NUM_REQUESTS requests on CONCURRENT_THREADS workers, print a summary and return it.
+
+    Returns:
+        [NUM_REQUESTS, CONCURRENT_THREADS, average time, maximum time].
+    """
+    with ThreadPoolExecutor(max_workers=CONCURRENT_THREADS) as pool:
+        pending = [pool.submit(send_request) for _ in range(NUM_REQUESTS)]
+        # Collect in submission order; result() blocks until each call finishes.
+        outcomes = [job.result() for job in pending]
+    timings = [elapsed for _, elapsed in outcomes]
+    successes = sum(1 for status, _ in outcomes if status == 200)
+    failures = NUM_REQUESTS - successes
+    avg_time = sum(timings) / NUM_REQUESTS
+    max_time = max(timings)
+    min_time = min(timings)
+    summary_lines = (
+        "\n=== Stress Test Results ===",
+        f"Total Requests: {NUM_REQUESTS}",
+        f"Concurrency Level: {CONCURRENT_THREADS}",
+        f"Successes: {successes}",
+        f"Failures: {failures}",
+        f"Avg Time: {avg_time:.3f}s",
+        f"Min Time: {min_time:.3f}s",
+        f"Max Time: {max_time:.3f}s",
+    )
+    for line in summary_lines:
+        print(line)
+    return [NUM_REQUESTS, CONCURRENT_THREADS, avg_time, max_time]
+if __name__ == "__main__":
+    # Open the CSV file for writing the summary results
+    with open('stress_test_results.csv', 'w', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['Total Requests', 'Concurrency Level', 'Avg Time', 'Max Time'])
+        # One run per load level: `users` worker threads, five requests per thread.
+        for users in [1, 5, 10, 20, 50, 100]:
+            # These rebind the module-level settings that run_stress_test reads.
+            CONCURRENT_THREADS = users
+            NUM_REQUESTS = users * 5
+            result = run_stress_test()
+            writer.writerow(result)