mo01018 committed on
Commit 74c2449 · verified · 1 Parent(s): 9bb28fd

Upload 3 files

Files changed (3)
  1. app.py +121 -0
  2. customFunctions.py +470 -0
  3. performance_test.py +64 -0
app.py ADDED
@@ -0,0 +1,121 @@
+ from flask import Flask, render_template, request, redirect, url_for
+ from joblib import load
+ import pandas as pd
+ import numpy as np
+ import re
+ from customFunctions import *  # the pickled pipelines need these classes importable at load time
+ from sklearn.preprocessing import LabelEncoder
+ import json
+ import datetime
+
+ pd.set_option('display.max_colwidth', 1000)
+
+ PIPELINES = [
+     {
+         'id': 1,
+         'name': 'Baseline',
+         'pipeline': load("pipeline_ex1_s1.joblib")
+     },
+     {
+         'id': 2,
+         'name': 'Trained on a FeedForward NN',
+         'pipeline': load("pipeline_ex1_s2.joblib")
+     },
+     {
+         'id': 3,
+         'name': 'Trained on a CRF',
+         'pipeline': load("pipeline_ex1_s3.joblib")
+     },
+     #{
+     #    'id': 4,
+     #    'name': 'Trained on a small dataset',
+     #    'pipeline': load("pipeline_ex2_s1.joblib")
+     #},
+     #{
+     #    'id': 5,
+     #    'name': 'Trained on a large dataset',
+     #    'pipeline': load("pipeline_ex2_s2.joblib")
+     #},
+     #{
+     #    'id': 6,
+     #    'name': 'Embedded using TFIDF',
+     #    'pipeline': load("pipeline_ex3_s1.joblib")
+     #},
+     #{
+     #    'id': 7,
+     #    'name': 'Embedded using ?',
+     #    'pipeline': load("pipeline_ex3_s2.joblib")
+     #},
+ ]
+
+ pipeline_metadata = [{'id': p['id'], 'name': p['name']} for p in PIPELINES]
+
+ def get_pipeline_by_id(pipelines, pipeline_id):
+     return next((p['pipeline'] for p in pipelines if p['id'] == pipeline_id), None)
+
+ def get_name_by_id(pipelines, pipeline_id):
+     return next((p['name'] for p in pipelines if p['id'] == pipeline_id), None)
+
+
+ def requestResults(text, pipeline):
+     labels = pipeline.predict(text)
+     # np.ndim works on both arrays and nested lists (e.g. CRF output),
+     # unlike the .ndim attribute, which only exists on numpy arrays
+     if np.ndim(labels) != 1:
+         # Flatten per-sentence predictions into a single sequence of tags
+         flattened_predictions = []
+         for sentence in labels:
+             for tag in sentence:
+                 flattened_predictions.append(tag)
+         labels = flattened_predictions
+     labels = [int(label) for label in labels]
+     # Decode the integer predictions back into BIO tag strings
+     tag_encoder = LabelEncoder()
+     tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
+     decoded_labels = tag_encoder.inverse_transform(labels)
+     return decoded_labels
+
+ LOG_FILE = "usage_log.jsonl"  # Each line is a JSON object
+
+ def log_interaction(user_input, model_name, predictions):
+     log_entry = {
+         "timestamp": datetime.datetime.utcnow().isoformat(),
+         "user_input": user_input,
+         "model": model_name,
+         "predictions": predictions
+     }
+     with open(LOG_FILE, "a") as f:
+         f.write(json.dumps(log_entry) + "\n")
+
+
+ app = Flask(__name__)
+
+
+ @app.route('/')
+ def index():
+     return render_template('index.html', pipelines=pipeline_metadata)
+
+
+ @app.route('/', methods=['POST'])
+ def get_data():
+     text = request.form['search']
+     # Split on word characters, keeping punctuation as separate tokens
+     tokens = re.findall(r"\w+|[^\w\s]", text)
+     tokens_formatted = pd.Series([pd.Series(tokens)])
+
+     pipeline_id = int(request.form['pipeline_select'])
+     pipeline = get_pipeline_by_id(PIPELINES, pipeline_id)
+     name = get_name_by_id(PIPELINES, pipeline_id)
+
+     labels = requestResults(tokens_formatted, pipeline)
+     results = dict(zip(tokens, labels))
+
+     log_interaction(text, name, results)
+
+     return render_template('index.html', results=results, name=name, pipelines=pipeline_metadata)
+
+
+ if __name__ == '__main__':
+     app.run(host="0.0.0.0", port=7860)
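For a quick smoke test of the prediction path outside Flask, something along these lines mirrors what the POST handler does (a minimal sketch, not part of this commit; it assumes pipeline_ex1_s1.joblib sits in the working directory and that this pipeline returns a flat array of encoded tags, as requestResults handles in the non-nested case):

# smoke_test.py -- hypothetical helper, not part of this commit
import re
import pandas as pd
from joblib import load
from sklearn.preprocessing import LabelEncoder
from customFunctions import *  # pickled pipelines reference these custom classes

pipeline = load("pipeline_ex1_s1.joblib")

text = "An MRI scan can rule out a TIA."
tokens = re.findall(r"\w+|[^\w\s]", text)   # same tokenizer as app.py
formatted = pd.Series([pd.Series(tokens)])  # a one-sentence batch

labels = pipeline.predict(formatted)        # flat array of encoded tags assumed here
tag_encoder = LabelEncoder()
tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])  # same tag set as requestResults
decoded = tag_encoder.inverse_transform([int(l) for l in labels])
print(dict(zip(tokens, decoded)))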
customFunctions.py ADDED
@@ -0,0 +1,470 @@
+ import pandas as pd
+ import numpy as np
+ import random
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ #from transformers import BertTokenizer, BertModel
+ from sklearn.metrics import accuracy_score, f1_score, classification_report
+ import sklearn_crfsuite
+ from sklearn_crfsuite import metrics
+ from sklearn.metrics.pairwise import cosine_similarity
+ from gensim.models import Word2Vec
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import LabelEncoder
+ from torch.utils.data import Dataset, DataLoader
+ from torch.nn.utils.rnn import pad_sequence
+ from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ EMBEDDING_DIM = 100
+ PAD_VALUE = -1
+ MAX_LENGTH = 376
+ BATCH_SIZE = 16
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+ class preprocess_sentences():
+     def __init__(self):
+         pass
+
+     def fit(self, X, y=None):
+         print('PREPROCESSING')
+         return self
+
+     def transform(self, X):
+         # X is a pandas Series of token Series (e.g. train['tokens'])
+         sentences = X.apply(lambda x: x.tolist()).tolist()
+         print('--> Preprocessing complete \n', flush=True)
+         return sentences
+
+
+ class Word2VecTransformer():
+     def __init__(self, vector_size=100, window=5, min_count=1, workers=1, embedding_dim=EMBEDDING_DIM):
+         self.model = None
+         self.vector_size = vector_size
+         self.window = window
+         self.min_count = min_count
+         self.workers = workers
+         self.embedding_dim = embedding_dim
+
+     def fit(self, X, y):
+         # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
+         # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
+         print('WORD2VEC:', flush=True)
+         # Fit the Word2Vec model on the tokenized sentences
+         self.model = Word2Vec(sentences=X, vector_size=self.vector_size, window=self.window,
+                               min_count=self.min_count, workers=self.workers)
+         print('--> Word2Vec Fitted', flush=True)
+         return self
+
+     def transform(self, X):
+         # Turn each sentence into a tensor of per-word embedding vectors
+         embedded_sentences = []
+         for sentence in X:
+             sentence_vectors = []
+             for word in sentence:
+                 if word in self.model.wv:
+                     vec = self.model.wv[word]
+                 else:
+                     # Random vector for out-of-vocabulary words
+                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
+                 sentence_vectors.append(vec)
+             embedded_sentences.append(torch.tensor(np.array(sentence_vectors), dtype=torch.float32))
+         print('--> Embeddings Complete \n', flush=True)
+         return embedded_sentences
+
+
+ class Word2VecTransformer_CRF():
+     def __init__(self, vector_size=100, window=5, min_count=1, workers=1, embedding_dim=EMBEDDING_DIM):
+         self.model = None
+         self.vector_size = vector_size
+         self.window = window
+         self.min_count = min_count
+         self.workers = workers
+         self.embedding_dim = embedding_dim
+
+     def fit(self, X, y):
+         print('WORD2VEC:', flush=True)
+         # Fit the Word2Vec model on the tokenized sentences
+         self.model = Word2Vec(sentences=X, vector_size=self.vector_size, window=self.window,
+                               min_count=self.min_count, workers=self.workers)
+         print('--> Word2Vec Fitted', flush=True)
+         return self
+
+     def transform(self, X):
+         # Build one feature dict per word: hand-crafted features plus embedding dimensions
+         embedded_sentences = []
+         for sentence in X:
+             sentence_vectors = []
+             for word in sentence:
+                 features = {
+                     'bias': 1.0,
+                     'word.lower()': word.lower(),
+                     'word[-3:]': word[-3:],
+                     'word[-2:]': word[-2:],
+                     'word.isupper()': word.isupper(),
+                     'word.istitle()': word.istitle(),
+                     'word.isdigit()': word.isdigit(),
+                 }
+                 if word in self.model.wv:
+                     vec = self.model.wv[word]
+                 else:
+                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
+                 # https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
+                 for index in range(len(vec)):
+                     features[f"embedding_{index}"] = vec[index]
+                 sentence_vectors.append(features)
+             embedded_sentences.append(sentence_vectors)
+         print('--> Embeddings Complete \n', flush=True)
+         return embedded_sentences
+
+
+ class tfidf(BaseEstimator, TransformerMixin):
+     def __init__(self):
+         self.model = None
+         self.embedding_dim = None
+         self.idf = None
+         self.vocab_size = None
+         self.vocab = None
+
+     def fit(self, X, y=None):
+         print('TFIDF:', flush=True)
+         joined_sentences = [' '.join(tokens) for tokens in X]
+         self.model = TfidfVectorizer()
+         self.model.fit(joined_sentences)
+         self.vocab = self.model.vocabulary_
+         self.idf = self.model.idf_
+         self.vocab_size = len(self.vocab)
+         self.embedding_dim = self.vocab_size
+         print('--> TFIDF Fitted', flush=True)
+         return self
+
+     def transform(self, X):
+         embedded = []
+         for sentence in X:
+             sent_vecs = []
+             # Term frequencies within this sentence
+             token_counts = {}
+             for word in sentence:
+                 token_counts[word] = token_counts.get(word, 0) + 1
+
+             sent_len = len(sentence)
+             for word in sentence:
+                 # One vocab-sized TF-IDF vector per word (nonzero only at the word's index)
+                 vec = np.zeros(self.vocab_size)
+                 if word in self.vocab:
+                     tf = token_counts[word] / sent_len
+                     token_idx = self.vocab[word]
+                     vec[token_idx] = tf * self.idf[token_idx]
+                 sent_vecs.append(vec)
+             embedded.append(torch.tensor(np.array(sent_vecs), dtype=torch.float32))
+         print('--> Embeddings Complete \n', flush=True)
+         return embedded
+
+
+ class BiLSTM_NER(nn.Module):
+     def __init__(self, input_dim, hidden_dim, tagset_size):
+         super(BiLSTM_NER, self).__init__()
+         # Embedding layer (freeze=False would let it fine-tune)
+         #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=-1)
+         self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
+         self.fc = nn.Linear(hidden_dim * 2, tagset_size)
+
+     def forward(self, sentences):
+         #embeds = self.embedding(sentences)
+         lstm_out, _ = self.lstm(sentences)
+         tag_scores = self.fc(lstm_out)
+         return tag_scores
+
+
+ # Define the FeedForward NN model
+ class FeedForwardNN_NER(nn.Module):
+     def __init__(self, embedding_dim, hidden_dim, tagset_size):
+         super(FeedForwardNN_NER, self).__init__()
+         self.fc1 = nn.Linear(embedding_dim, hidden_dim)
+         self.relu = nn.ReLU()
+         self.fc2 = nn.Linear(hidden_dim, tagset_size)
+
+     def forward(self, x):
+         # x: (batch_size, seq_length, embedding_dim)
+         x = self.fc1(x)       # (batch_size, seq_length, hidden_dim)
+         x = self.relu(x)
+         logits = self.fc2(x)  # (batch_size, seq_length, tagset_size)
+         return logits
+
+
+ def pad(batch):
+     # batch is a list of (X, y) pairs
+     X_batch, y_batch = zip(*batch)
+
+     # Convert to tensors
+     X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in X_batch]
+     y_batch = [torch.tensor(seq, dtype=torch.long) for seq in y_batch]
+
+     # Pad sequences
+     X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
+     y_padded = pad_sequence(y_batch, batch_first=True, padding_value=PAD_VALUE)
+
+     return X_padded, y_padded
+
+
+ def pred_pad(batch):
+     # Prediction-time collate: pad inputs only, no labels
+     X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in batch]
+     X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
+     return X_padded
+
+
+ class Ner_Dataset(Dataset):
+     def __init__(self, X, y):
+         self.X = X
+         self.y = y
+
+     def __len__(self):
+         return len(self.X)
+
+     def __getitem__(self, idx):
+         return self.X[idx], self.y[idx]
+
+
+ class LSTM(BaseEstimator, ClassifierMixin):
+     def __init__(self, embedding_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001, tag2idx=None):
+         self.embedding_dim = embedding_dim
+         self.hidden_dim = hidden_dim
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.tag2idx = tag2idx
+
+     def fit(self, embedded, encoded_tags):
+         print('LSTM:', flush=True)
+         data = Ner_Dataset(embedded, encoded_tags)
+         train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
+
+         self.model = self.train_LSTM(train_loader)
+         print('--> LSTM trained', flush=True)
+         return self
+
+     def predict(self, X):
+         test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
+
+         # Switch to evaluation mode
+         self.model.eval()
+         predictions = []
+
+         # Iterate through test data
+         with torch.no_grad():
+             for X_batch in test_loader:
+                 X_batch = X_batch.to(DEVICE)
+
+                 tag_scores = self.model(X_batch)
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+
+                 # Flatten the tensors to compare word-by-word
+                 flattened_pred = predicted_tags.view(-1)
+                 predictions.append(flattened_pred.cpu().numpy())
+
+         predictions = np.concatenate(predictions)
+         return predictions
+
+     def train_LSTM(self, train_loader):
+         # Instantiate the model; hyperparameters come from the constructor
+         lstm_model = BiLSTM_NER(self.embedding_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
+         lstm_model.to(DEVICE)
+
+         # Loss function and optimizer
+         loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE)  # Ignore padding
+         optimizer = optim.Adam(lstm_model.parameters(), lr=self.learning_rate)
+         print('--> Training LSTM')
+
+         # Training loop
+         for epoch in range(self.epochs):
+             total_loss = 0
+             total_correct = 0
+             total_words = 0
+             lstm_model.train()  # Set model to training mode
+
+             for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
+                 X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
+
+                 # Zero gradients
+                 optimizer.zero_grad()
+
+                 # Forward pass
+                 tag_scores = lstm_model(X_batch)
+
+                 # Reshape and compute loss (ignore padded values)
+                 loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
+
+                 # Backward pass and optimization
+                 loss.backward()
+                 optimizer.step()
+
+                 total_loss += loss.item()
+
+                 # Batch accuracy: predicted tag = index of max score
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+
+                 # Flatten the tensors to compare word-by-word
+                 flattened_pred = predicted_tags.view(-1)
+                 flattened_true = y_batch.view(-1)
+
+                 # Exclude padding tokens from the accuracy calculation
+                 mask = flattened_true != PAD_VALUE
+                 correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
+
+                 # Update total correct and total words (ignoring padding)
+                 total_correct += correct
+                 total_words += mask.sum().item()
+
+             avg_loss = total_loss / len(train_loader)
+             avg_accuracy = total_correct / total_words * 100  # Accuracy in percent
+
+             print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
+
+         return lstm_model
+
+
+ class FeedforwardNN(BaseEstimator, ClassifierMixin):
+     def __init__(self, embedding_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001, tag2idx=None):
+         self.embedding_dim = embedding_dim
+         self.hidden_dim = hidden_dim
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.tag2idx = tag2idx
+
+     def fit(self, embedded, encoded_tags):
+         print('Feed Forward NN:', flush=True)
+         data = Ner_Dataset(embedded, encoded_tags)
+         train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
+
+         self.model = self.train_FF(train_loader)
+         print('--> Feed Forward trained', flush=True)
+         return self
+
+     def predict(self, X):
+         test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
+
+         # Switch to evaluation mode
+         self.model.eval()
+         predictions = []
+
+         # Iterate through test data
+         with torch.no_grad():
+             for X_batch in test_loader:
+                 X_batch = X_batch.to(DEVICE)
+
+                 tag_scores = self.model(X_batch)
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+
+                 # Flatten the tensors to compare word-by-word
+                 flattened_pred = predicted_tags.view(-1)
+                 predictions.append(flattened_pred.cpu().numpy())
+
+         predictions = np.concatenate(predictions)
+         return predictions
+
+     def train_FF(self, train_loader):
+         # Instantiate the model; hyperparameters come from the constructor
+         ff_model = FeedForwardNN_NER(self.embedding_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
+         ff_model.to(DEVICE)
+
+         # Loss function and optimizer
+         loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE)  # Ignore padding
+         optimizer = optim.Adam(ff_model.parameters(), lr=self.learning_rate)
+         print('--> Training FF')
+
+         # Training loop
+         for epoch in range(self.epochs):
+             total_loss = 0
+             total_correct = 0
+             total_words = 0
+             ff_model.train()  # Set model to training mode
+
+             for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
+                 X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
+
+                 # Zero gradients
+                 optimizer.zero_grad()
+
+                 # Forward pass
+                 tag_scores = ff_model(X_batch)
+
+                 # Reshape and compute loss (ignore padded values)
+                 loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
+
+                 # Backward pass and optimization
+                 loss.backward()
+                 optimizer.step()
+
+                 total_loss += loss.item()
+
+                 # Batch accuracy, excluding padding tokens
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+                 flattened_pred = predicted_tags.view(-1)
+                 flattened_true = y_batch.view(-1)
+                 mask = flattened_true != PAD_VALUE
+                 total_correct += (flattened_pred[mask] == flattened_true[mask]).sum().item()
+                 total_words += mask.sum().item()
+
+             avg_loss = total_loss / len(train_loader)
+             avg_accuracy = total_correct / total_words * 100  # Accuracy in percent
+
+             print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
+
+         return ff_model
+
+
+ crf = sklearn_crfsuite.CRF(
+     algorithm='lbfgs',
+     c1=0.1,
+     c2=0.1,
+     max_iterations=100,
+     all_possible_transitions=True)
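The training script that produced the .joblib files is not part of this commit, but the components above are designed to compose into sklearn pipelines. A sketch of how one might be assembled, under the assumption that X_train is a pandas Series of token Series and y_train holds per-sentence lists of integer-encoded tags (both placeholder names), and that the FeedForward variant corresponds to pipeline_ex1_s2.joblib as named in app.py:

from joblib import dump

# X_train, y_train are hypothetical placeholders: a Series of token Series,
# and per-sentence lists of integer-encoded tags, respectively.
tag_encoder = LabelEncoder()
tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
tag2idx = {tag: idx for idx, tag in enumerate(tag_encoder.classes_)}

pipeline = Pipeline([
    ('preprocess', preprocess_sentences()),
    ('embed', Word2VecTransformer()),
    ('model', FeedforwardNN(embedding_dim=EMBEDDING_DIM, tag2idx=tag2idx)),
])
pipeline.fit(X_train, y_train)
dump(pipeline, "pipeline_ex1_s2.joblib")  # filename matches app.py's loader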
performance_test.py ADDED
@@ -0,0 +1,64 @@
+ import requests
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ import csv
+
+ NUM_REQUESTS = 5
+ CONCURRENT_THREADS = 10
+ # NOTE: app.py in this commit serves on port 7860; adjust the URL
+ # if testing that configuration rather than a default Flask dev server
+ URL = "http://localhost:5000/"
+
+ def send_request():
+     data = {
+         'search': "An MRI, magnetic resonance imaging, scan is a very useful diagnosis tool.",
+         'pipeline_select': '1'
+     }
+
+     start_time = time.time()
+     try:
+         response = requests.post(URL, data=data)
+         elapsed = time.time() - start_time
+         if response.status_code != 200:
+             print(f"Error {response.status_code}: {response.text[:100]}")
+         return response.status_code, elapsed
+     except Exception as e:
+         print("Request failed:", e)
+         # Treat exceptions as failures, but still record the elapsed time
+         return 500, time.time() - start_time
+
+ def run_stress_test():
+     results = []
+
+     with ThreadPoolExecutor(max_workers=CONCURRENT_THREADS) as executor:
+         futures = [executor.submit(send_request) for _ in range(NUM_REQUESTS)]
+         for future in futures:
+             results.append(future.result())
+
+     successes = sum(1 for r in results if r[0] == 200)
+     failures = NUM_REQUESTS - successes
+     avg_time = sum(r[1] for r in results) / NUM_REQUESTS
+     max_time = max(r[1] for r in results)
+     min_time = min(r[1] for r in results)
+
+     print("\n=== Stress Test Results ===")
+     print(f"Total Requests: {NUM_REQUESTS}")
+     print(f"Concurrency Level: {CONCURRENT_THREADS}")
+     print(f"Successes: {successes}")
+     print(f"Failures: {failures}")
+     print(f"Avg Time: {avg_time:.3f}s")
+     print(f"Min Time: {min_time:.3f}s")
+     print(f"Max Time: {max_time:.3f}s")
+
+     return [NUM_REQUESTS, CONCURRENT_THREADS, avg_time, max_time]
+
+ if __name__ == "__main__":
+     # Open the CSV file for writing the summary results
+     with open('stress_test_results.csv', 'w', newline='') as csvfile:
+         writer = csv.writer(csvfile)
+         writer.writerow(['Total Requests', 'Concurrency Level', 'Avg Time', 'Max Time'])
+
+         # Ramp up concurrency; these module-level assignments rebind the
+         # globals that run_stress_test() and send_request() read
+         for users in [1, 5, 10, 20, 50, 100]:
+             CONCURRENT_THREADS = users
+             NUM_REQUESTS = users * 5
+             result = run_stress_test()
+
+             writer.writerow(result)
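Each ramp-up run appends one summary row to stress_test_results.csv, so the latency curve can be inspected afterwards, e.g. (a sketch; pandas and matplotlib are assumed available, neither is a dependency of the test script itself):

import pandas as pd
import matplotlib.pyplot as plt

# Column names match the header row written by performance_test.py
df = pd.read_csv("stress_test_results.csv")
plt.plot(df["Concurrency Level"], df["Avg Time"], marker="o", label="Avg Time")
plt.plot(df["Concurrency Level"], df["Max Time"], marker="s", label="Max Time")
plt.xlabel("Concurrent users")
plt.ylabel("Response time (s)")
plt.legend()
plt.savefig("stress_test_results.png")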