hw01558 committed on
Commit 6e39b37 · verified · 1 Parent(s): 5d7b3b3

Update customFunctions.py for new pipelines


Added the updated functions. I have not added the Bio2Vec file, as I'm not sure it's needed: the pipeline is already huge, so it should already be in there?
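
For reference (not part of this commit): Bio2VecTransformer.fit reads a global Bio2VecModel, so if the Bio2Vec file does turn out to be needed, something like the sketch below would have to exist in, or be imported into, customFunctions.py. The file name is a placeholder, and the binary word2vec loading follows the Stack Overflow answer linked in the code.

from gensim.models import KeyedVectors

# Placeholder path: the actual Bio2Vec vectors file is not included in this commit.
Bio2VecModel = KeyedVectors.load_word2vec_format('bio2vec_vectors.bin', binary=True)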

Files changed (1)
  1. customFunctions.py +547 -470
customFunctions.py CHANGED
@@ -1,470 +1,547 @@
1
- import pandas as pd
2
- import numpy as np
3
- import random
4
- import torch
5
- import torch.nn as nn
6
- import torch.optim as optim
7
- #from transformers import BertTokenizer, BertModel
8
- from sklearn.metrics import accuracy_score, f1_score, classification_report
9
- import sklearn_crfsuite
10
- from sklearn_crfsuite import metrics
11
- from sklearn.metrics.pairwise import cosine_similarity
12
- from gensim.models import Word2Vec
13
- from sklearn.pipeline import Pipeline
14
- from sklearn.preprocessing import LabelEncoder
15
- from torch.utils.data import Dataset, DataLoader
16
- from torch.nn.utils.rnn import pad_sequence
17
- from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
18
- from sklearn.feature_extraction.text import TfidfVectorizer
19
-
20
-
21
-
22
- EMBEDDING_DIM = 100
23
- PAD_VALUE= -1
24
- MAX_LENGTH = 376
25
- EMBEDDING_DIM = 100
26
- BATCH_SIZE = 16
27
-
28
- class preprocess_sentences():
29
- def __init__(self):
30
- pass
31
-
32
- def fit(self, X, y=None):
33
- print('PREPROCESSING')
34
- return self
35
-
36
- def transform(self, X):
37
- # X = train['tokens'], y =
38
- sentences = X.apply(lambda x: x.tolist()).tolist()
39
- print('--> Preprocessing complete \n', flush=True)
40
- return sentences
41
-
42
-
43
-
44
- class Word2VecTransformer():
45
- def __init__(self, vector_size = 100, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
46
- self.model = None
47
- self.vector_size = vector_size
48
- self.window = window
49
- self.min_count = min_count
50
- self.workers = workers
51
- self.embedding_dim = embedding_dim
52
-
53
- def fit(self, X, y):
54
- # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
55
- # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
56
- print('WORD2VEC:', flush=True)
57
- # This fits the word2vec model
58
- self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
59
- , min_count=self.min_count, workers=self.workers)
60
- print('--> Word2Vec Fitted', flush=True)
61
- return self
62
-
63
- def transform(self, X):
64
- # This bit should transform the sentences
65
- embedded_sentences = []
66
-
67
- for sentence in X:
68
- sentence_vectors = []
69
-
70
- for word in sentence:
71
- if word in self.model.wv:
72
- vec = self.model.wv[word]
73
- else:
74
- vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
75
-
76
- sentence_vectors.append(vec)
77
-
78
- embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
79
- print('--> Embeddings Complete \n', flush=True)
80
-
81
- return embedded_sentences
82
-
83
- class Word2VecTransformer_CRF():
84
- def __init__(self, vector_size = 100, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
85
- self.model = None
86
- self.vector_size = vector_size
87
- self.window = window
88
- self.min_count = min_count
89
- self.workers = workers
90
- self.embedding_dim = embedding_dim
91
-
92
- def fit(self, X, y):
93
- # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
94
- # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
95
- print('WORD2VEC:', flush=True)
96
- # This fits the word2vec model
97
- self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
98
- , min_count=self.min_count, workers=self.workers)
99
- print('--> Word2Vec Fitted', flush=True)
100
- return self
101
-
102
- def transform(self, X):
103
- # This bit should transform the sentences
104
- embedded_sentences = []
105
-
106
- for sentence in X:
107
- sentence_vectors = []
108
-
109
- for word in sentence:
110
- features = {
111
- 'bias': 1.0,
112
- 'word.lower()': word.lower(),
113
- 'word[-3:]': word[-3:],
114
- 'word[-2:]': word[-2:],
115
- 'word.isupper()': word.isupper(),
116
- 'word.istitle()': word.istitle(),
117
- 'word.isdigit()': word.isdigit(),
118
- }
119
- if word in self.model.wv:
120
- vec = self.model.wv[word]
121
- else:
122
- vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
123
-
124
- # https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
125
- for index in range(len(vec)):
126
- features[f"embedding_{index}"] = vec[index]
127
-
128
- sentence_vectors.append(features)
129
-
130
- embedded_sentences.append(sentence_vectors)
131
- print('--> Embeddings Complete \n', flush=True)
132
-
133
- return embedded_sentences
134
-
135
-
136
- class tfidf(BaseEstimator, TransformerMixin):
137
- def __init__(self):
138
- self.model = None
139
- self.embedding_dim = None
140
- self.idf = None
141
- self.vocab_size = None
142
- self.vocab = None
143
- pass
144
-
145
- def fit(self, X, y = None):
146
- print('TFIDF:', flush=True)
147
- joined_sentences = [' '.join(tokens) for tokens in X]
148
- self.model = TfidfVectorizer()
149
- self.model.fit(joined_sentences)
150
- self.vocab = self.model.vocabulary_
151
- self.idf = self.model.idf_
152
- self.vocab_size = len(self.vocab)
153
- self.embedding_dim = self.vocab_size
154
- print('--> TFIDF Fitted', flush=True)
155
- return self
156
-
157
- def transform(self, X):
158
-
159
- embedded = []
160
- for sentence in X:
161
- sent_vecs = []
162
- token_counts = {}
163
- for word in sentence:
164
- token_counts[word] = token_counts.get(word, 0) + 1
165
-
166
- sent_len = len(sentence)
167
- for word in sentence:
168
- vec = np.zeros(self.vocab_size)
169
- if word in self.vocab:
170
- tf = token_counts[word] / sent_len
171
- token_idx = self.vocab[word]
172
- vec[token_idx] = tf * self.idf[token_idx]
173
- sent_vecs.append(vec)
174
- embedded.append(torch.tensor(sent_vecs, dtype=torch.float32))
175
- print('--> Embeddings Complete \n', flush=True)
176
- print(embedded[0][0], flush=True)
177
- print('Those were the embeddings', flush=True)
178
-
179
-
180
- return embedded
181
-
182
-
183
- class BiLSTM_NER(nn.Module):
184
- def __init__(self,input_dim, hidden_dim, tagset_size):
185
- super(BiLSTM_NER, self).__init__()
186
-
187
- # Embedding layer
188
- #Freeze= false means that it will fine tune
189
- #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze = False, padding_idx=-1)
190
-
191
- self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
192
- self.fc = nn.Linear(hidden_dim*2, tagset_size)
193
-
194
- def forward(self, sentences):
195
- #embeds = self.embedding(sentences)
196
- lstm_out, _ = self.lstm(sentences)
197
- tag_scores = self.fc(lstm_out)
198
-
199
- return tag_scores
200
-
201
- # Define the FeedForward NN Model
202
- class FeedForwardNN_NER(nn.Module):
203
- def __init__(self, embedding_dim, hidden_dim, tagset_size):
204
- super(FeedForwardNN_NER, self).__init__()
205
- self.fc1 = nn.Linear(embedding_dim, hidden_dim)
206
- self.relu = nn.ReLU()
207
- self.fc2 = nn.Linear(hidden_dim, tagset_size)
208
-
209
- def forward(self, x):
210
- # x: (batch_size, seq_length, embedding_dim)
211
- x = self.fc1(x) # (batch_size, seq_length, hidden_dim)
212
- x = self.relu(x)
213
- logits = self.fc2(x) # (batch_size, seq_length, tagset_size)
214
- return logits
215
-
216
-
217
- def pad(batch):
218
- # batch is a list of (X, y) pairs
219
- X_batch, y_batch = zip(*batch)
220
-
221
- # Convert to tensors
222
- X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in X_batch]
223
- y_batch = [torch.tensor(seq, dtype=torch.long) for seq in y_batch]
224
-
225
- # Pad sequences
226
- X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
227
- y_padded = pad_sequence(y_batch, batch_first=True, padding_value=PAD_VALUE)
228
-
229
- return X_padded, y_padded
230
-
231
- def pred_pad(batch):
232
- X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in batch]
233
- X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
234
- return X_padded
235
-
236
-
237
- class Ner_Dataset(Dataset):
238
- def __init__(self, X, y):
239
- self.X = X
240
- self.y = y
241
-
242
- def __len__(self):
243
- return len(self.X)
244
-
245
- def __getitem__(self, idx):
246
- return self.X[idx], self.y[idx]
247
-
248
-
249
-
250
-
251
- class LSTM(BaseEstimator, ClassifierMixin):
252
- def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
253
- self.embedding_dim = embedding_dim
254
- self.hidden_dim = hidden_dim
255
- self.epochs = epochs
256
- self.learning_rate = learning_rate
257
- self.tag2idx = tag2idx
258
-
259
-
260
-
261
- def fit(self, embedded, encoded_tags):
262
- print('LSTM:', flush=True)
263
- data = Ner_Dataset(embedded, encoded_tags)
264
- train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
265
-
266
- self.model = self.train_LSTM(train_loader)
267
- print('--> LSTM trained', flush=True)
268
- return self
269
-
270
- def predict(self, X):
271
- # Switch to evaluation mode
272
-
273
- test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
274
-
275
- self.model.eval()
276
- predictions = []
277
-
278
- # Iterate through test data
279
- with torch.no_grad():
280
- for X_batch in test_loader:
281
- X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
282
-
283
- tag_scores = self.model(X_batch)
284
- _, predicted_tags = torch.max(tag_scores, dim=2)
285
-
286
- # Flatten the tensors to compare word-by-word
287
- flattened_pred = predicted_tags.view(-1)
288
- predictions.append(flattened_pred.cpu().numpy())
289
-
290
- predictions = np.concatenate(predictions)
291
- return predictions
292
-
293
-
294
- def train_LSTM(self, train_loader, input_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001):
295
-
296
- input_dim = self.embedding_dim
297
- # Instantiate the lstm_model
298
- lstm_model = BiLSTM_NER(input_dim, hidden_dim=hidden_dim, tagset_size=len(self.tag2idx))
299
- lstm_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
300
-
301
- # Loss function and optimizer
302
- loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
303
- optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)
304
- print('--> Training LSTM')
305
-
306
- # Training loop
307
- for epoch in range(epochs):
308
- total_loss = 0
309
- total_correct = 0
310
- total_words = 0
311
- lstm_model.train() # Set model to training mode
312
-
313
- for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
314
- X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
315
-
316
- # Zero gradients
317
- optimizer.zero_grad()
318
-
319
- # Forward pass
320
- tag_scores = lstm_model(X_batch)
321
-
322
- # Reshape and compute loss (ignore padded values)
323
- loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
324
-
325
- # Backward pass and optimization
326
- loss.backward()
327
- optimizer.step()
328
-
329
- total_loss += loss.item()
330
-
331
- # Compute accuracy for this batch
332
- # Get the predicted tags (index of max score)
333
- _, predicted_tags = torch.max(tag_scores, dim=2)
334
-
335
- # Flatten the tensors to compare word-by-word
336
- flattened_pred = predicted_tags.view(-1)
337
- flattened_true = y_batch.view(-1)
338
-
339
- # Exclude padding tokens from the accuracy calculation
340
- mask = flattened_true != PAD_VALUE
341
- correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
342
-
343
- # Count the total words in the batch (ignoring padding)
344
- total_words_batch = mask.sum().item()
345
-
346
- # Update total correct and total words
347
- total_correct += correct
348
- total_words += total_words_batch
349
-
350
- avg_loss = total_loss / len(train_loader)
351
- avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
352
-
353
- print(f' ==> Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
354
-
355
- return lstm_model
356
-
357
-
358
- class FeedforwardNN(BaseEstimator, ClassifierMixin):
359
- def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
360
- self.embedding_dim = embedding_dim
361
- self.hidden_dim = hidden_dim
362
- self.epochs = epochs
363
- self.learning_rate = learning_rate
364
- self.tag2idx = tag2idx
365
-
366
-
367
-
368
- def fit(self, embedded, encoded_tags):
369
- print('Feed Forward NN: ', flush=True)
370
- data = Ner_Dataset(embedded, encoded_tags)
371
- train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
372
-
373
- self.model = self.train_FF(train_loader)
374
- print('--> Feed Forward trained', flush=True)
375
- return self
376
-
377
- def predict(self, X):
378
- # Switch to evaluation mode
379
-
380
- test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
381
-
382
- self.model.eval()
383
- predictions = []
384
-
385
- # Iterate through test data
386
- with torch.no_grad():
387
- for X_batch in test_loader:
388
- X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
389
-
390
- tag_scores = self.model(X_batch)
391
- _, predicted_tags = torch.max(tag_scores, dim=2)
392
-
393
- # Flatten the tensors to compare word-by-word
394
- flattened_pred = predicted_tags.view(-1)
395
- predictions.append(flattened_pred.cpu().numpy())
396
-
397
- predictions = np.concatenate(predictions)
398
- return predictions
399
-
400
-
401
- def train_FF(self, train_loader, input_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001):
402
-
403
- input_dim = self.embedding_dim
404
- # Instantiate the lstm_model
405
- ff_model = FeedForwardNN_NER(self.embedding_dim, hidden_dim=hidden_dim, tagset_size=len(self.tag2idx))
406
- ff_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
407
-
408
- # Loss function and optimizer
409
- loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
410
- optimizer = optim.Adam(ff_model.parameters(), lr=learning_rate)
411
- print('--> Training FF')
412
-
413
- # Training loop
414
- for epoch in range(epochs):
415
- total_loss = 0
416
- total_correct = 0
417
- total_words = 0
418
- ff_model.train() # Set model to training mode
419
-
420
- for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
421
- X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
422
-
423
- # Zero gradients
424
- optimizer.zero_grad()
425
-
426
- # Forward pass
427
- tag_scores = ff_model(X_batch)
428
-
429
- # Reshape and compute loss (ignore padded values)
430
- loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
431
-
432
- # Backward pass and optimization
433
- loss.backward()
434
- optimizer.step()
435
-
436
- total_loss += loss.item()
437
-
438
- # Compute accuracy for this batch
439
- # Get the predicted tags (index of max score)
440
- _, predicted_tags = torch.max(tag_scores, dim=2)
441
-
442
- # Flatten the tensors to compare word-by-word
443
- flattened_pred = predicted_tags.view(-1)
444
- flattened_true = y_batch.view(-1)
445
-
446
- # Exclude padding tokens from the accuracy calculation
447
- mask = flattened_true != PAD_VALUE
448
- correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
449
-
450
- # Count the total words in the batch (ignoring padding)
451
- total_words_batch = mask.sum().item()
452
-
453
- # Update total correct and total words
454
- total_correct += correct
455
- total_words += total_words_batch
456
-
457
- avg_loss = total_loss / len(train_loader)
458
- avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
459
-
460
- print(f' ==> Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
461
-
462
- return ff_model
463
-
464
- crf = sklearn_crfsuite.CRF(
465
- algorithm='lbfgs',
466
- c1=0.1,
467
- c2=0.1,
468
- max_iterations=100,
469
- all_possible_transitions=True)
470
-
1
+ import pandas as pd
2
+ import numpy as np
3
+ import random
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.optim as optim
7
+ from transformers import BertTokenizer, BertModel
8
+ from seqeval.metrics import accuracy_score, f1_score, classification_report
9
+ from seqeval.scheme import IOB2
10
+ import sklearn_crfsuite
11
+ from sklearn_crfsuite import metrics
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+ from gensim.models import Word2Vec, KeyedVectors
14
+ from sklearn.pipeline import Pipeline
15
+ from sklearn.preprocessing import LabelEncoder
16
+ from torch.utils.data import Dataset, DataLoader
17
+ from torch.nn.utils.rnn import pad_sequence
18
+ from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
19
+ from sklearn.feature_extraction.text import TfidfVectorizer
20
+ import gensim.downloader as api
21
+ from itertools import product
22
+ from sklearn.model_selection import train_test_split, GridSearchCV
23
+ from joblib import dump
24
+
25
+
26
+ class preprocess_sentences():
27
+ def __init__(self):
28
+ pass
29
+
30
+ def fit(self, X, y=None):
31
+ print('PREPROCESSING')
32
+ return self
33
+
34
+ def transform(self, X):
35
+ # X = train['tokens'], y =
36
+ sentences = X.apply(lambda x: x.tolist()).tolist()
37
+ print('--> Preprocessing complete \n', flush=True)
38
+ return sentences
39
+
40
+ EMBEDDING_DIM = 500
41
+ PAD_VALUE= -1
42
+ MAX_LENGTH = 376
43
+ BATCH_SIZE = 16
44
+
45
+ class Word2VecTransformer():
46
+ def __init__(self, vector_size = EMBEDDING_DIM, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
47
+ self.model = None
48
+ self.vector_size = vector_size
49
+ self.window = window
50
+ self.min_count = min_count
51
+ self.workers = workers
52
+ self.embedding_dim = embedding_dim
53
+
54
+ def fit(self, X, y):
55
+ # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
56
+ # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
57
+ print('WORD2VEC:', flush=True)
58
+ # This fits the word2vec model
59
+ self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
60
+ , min_count=self.min_count, workers=self.workers)
61
+ print('--> Word2Vec Fitted', flush=True)
62
+ return self
63
+
64
+ def transform(self, X):
65
+ # This bit should transform the sentences
66
+ embedded_sentences = []
67
+
68
+ for sentence in X:
69
+ sentence_vectors = []
70
+
71
+ for word in sentence:
72
+ if word in self.model.wv:
73
+ vec = self.model.wv[word]
74
+ else:
75
+ vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
76
+
77
+ sentence_vectors.append(vec)
78
+
79
+ embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
80
+ print('--> Embeddings Complete \n', flush=True)
81
+
82
+ return embedded_sentences
83
+
84
+ class Word2VecTransformer_CRF():
85
+ def __init__(self, vector_size = EMBEDDING_DIM, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
86
+ self.model = None
87
+ self.vector_size = vector_size
88
+ self.window = window
89
+ self.min_count = min_count
90
+ self.workers = workers
91
+ self.embedding_dim = embedding_dim
92
+
93
+ def fit(self, X, y):
94
+ # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
95
+ # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
96
+ print('WORD2VEC:', flush=True)
97
+ # This fits the word2vec model
98
+ self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
99
+ , min_count=self.min_count, workers=self.workers)
100
+ print('--> Word2Vec Fitted', flush=True)
101
+ return self
102
+
103
+ def transform(self, X):
104
+ # This bit should transform the sentences
105
+ embedded_sentences = []
106
+
107
+ for sentence in X:
108
+ sentence_vectors = []
109
+
110
+ for word in sentence:
111
+ features = {
112
+ 'bias': 1.0,
113
+ 'word.lower()': word.lower(),
114
+ 'word[-3:]': word[-3:],
115
+ 'word[-2:]': word[-2:],
116
+ 'word.isupper()': word.isupper(),
117
+ 'word.istitle()': word.istitle(),
118
+ 'word.isdigit()': word.isdigit(),
119
+ }
120
+ if word in self.model.wv:
121
+ vec = self.model.wv[word]
122
+ else:
123
+ vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
124
+
125
+ # https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
126
+ for index in range(len(vec)):
127
+ features[f"embedding_{index}"] = vec[index]
128
+
129
+ sentence_vectors.append(features)
130
+
131
+ embedded_sentences.append(sentence_vectors)
132
+ print('--> Embeddings Complete \n', flush=True)
133
+
134
+ return embedded_sentences
135
+
136
+ class tfidfTransformer(BaseEstimator, TransformerMixin):
137
+ def __init__(self):
138
+ self.model = None
139
+ self.embedding_dim = None
140
+ self.idf = None
141
+ self.vocab_size = None
142
+ self.vocab = None
143
+
144
+ def fit(self, X, y = None):
145
+ print('TFIDF:', flush=True)
146
+ joined_sentences = [' '.join(tokens) for tokens in X]
147
+ self.model = TfidfVectorizer()
148
+ self.model.fit(joined_sentences)
149
+ self.vocab = self.model.vocabulary_
150
+ self.idf = self.model.idf_
151
+ self.vocab_size = len(self.vocab)
152
+ self.embedding_dim = self.vocab_size
153
+ print('--> TFIDF Fitted', flush=True)
154
+ return self
155
+
156
+ def transform(self, X):
157
+
158
+ embedded = []
159
+ for sentence in X:
160
+ sent_vecs = []
161
+ token_counts = {}
162
+ for word in sentence:
163
+ token_counts[word] = token_counts.get(word, 0) + 1
164
+
165
+ sent_len = len(sentence)
166
+ for word in sentence:
167
+ vec = np.zeros(self.vocab_size)
168
+ if word in self.vocab:
169
+ tf = token_counts[word] / sent_len
170
+ token_idx = self.vocab[word]
171
+ vec[token_idx] = tf * self.idf[token_idx]
172
+ sent_vecs.append(vec)
173
+ embedded.append(torch.tensor(sent_vecs, dtype=torch.float32))
174
+ print('--> Embeddings Complete \n', flush=True)
175
+
176
+
177
+ return embedded
178
+
179
+ class GloveTransformer(BaseEstimator, TransformerMixin):
180
+ def __init__(self):
181
+ self.model = None
182
+ self.embedding_dim = 300
183
+
184
+ def fit(self, X, y=None):
185
+ print('GLOVE', flush = True)
186
+ self.model = api.load('glove-wiki-gigaword-300')
187
+ print('--> Glove Downloaded', flush=True)
188
+ return self
189
+
190
+ def transform(self, X):
191
+ # This bit should transform the sentences
192
+ print('--> Beginning embeddings', flush=True)
193
+ embedded_sentences = []
194
+
195
+ for sentence in X:
196
+ sentence_vectors = []
197
+
198
+ for word in sentence:
199
+ if word in self.model:
200
+ vec = self.model[word]
201
+ else:
202
+ vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
203
+
204
+ sentence_vectors.append(vec)
205
+
206
+ embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
207
+ print('--> Embeddings Complete \n', flush=True)
208
+
209
+ return embedded_sentences
210
+
211
+ class Bio2VecTransformer():
212
+ def __init__(self, vector_size = 200, window = 5, min_count = 1, workers = 1, embedding_dim=200):
213
+ self.model = None
214
+ self.vector_size = vector_size
215
+ self.window = window
216
+ self.min_count = min_count
217
+ self.workers = workers
218
+ self.embedding_dim = embedding_dim
219
+
220
+ def fit(self, X, y):
221
+ print('BIO2VEC:', flush=True)
222
+ # https://stackoverflow.com/questions/58055415/how-to-load-bio2vec-in-gensim
223
+ self.model = Bio2VecModel  # expects a pre-loaded Bio2Vec model (e.g. gensim KeyedVectors); the vectors file is not included in this commit
224
+ print('--> BIO2VEC Fitted', flush=True)
225
+ return self
226
+
227
+ def transform(self, X):
228
+ # This bit should transform the sentences
229
+ embedded_sentences = []
230
+
231
+ for sentence in X:
232
+ sentence_vectors = []
233
+
234
+ for word in sentence:
235
+ if word in self.model:
236
+ vec = self.model[word]
237
+ else:
238
+ vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
239
+
240
+ sentence_vectors.append(vec)
241
+
242
+ embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
243
+ print('--> Embeddings Complete \n', flush=True)
244
+
245
+ return embedded_sentences
246
+
247
+ class BiLSTM_NER(nn.Module):
248
+ def __init__(self,input_dim, hidden_dim, tagset_size):
249
+ super(BiLSTM_NER, self).__init__()
250
+
251
+ # Embedding layer
252
+ #Freeze= false means that it will fine tune
253
+ #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze = False, padding_idx=-1)
254
+
255
+ self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
256
+ self.fc = nn.Linear(hidden_dim*2, tagset_size)
257
+
258
+ def forward(self, sentences):
259
+ #embeds = self.embedding(sentences)
260
+ lstm_out, _ = self.lstm(sentences)
261
+ tag_scores = self.fc(lstm_out)
262
+
263
+ return tag_scores
264
+
265
+ def pad(batch):
266
+ # batch is a list of (X, y) pairs
267
+ X_batch, y_batch = zip(*batch)
268
+
269
+ # Convert to tensors
270
+ X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in X_batch]
271
+ y_batch = [torch.tensor(seq, dtype=torch.long) for seq in y_batch]
272
+
273
+ # Pad sequences
274
+ X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
275
+ y_padded = pad_sequence(y_batch, batch_first=True, padding_value=PAD_VALUE)
276
+
277
+ return X_padded, y_padded
278
+
279
+ def pred_pad(batch):
280
+ X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in batch]
281
+ X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
282
+ return X_padded
283
+
284
+ class Ner_Dataset(Dataset):
285
+ def __init__(self, X, y):
286
+ self.X = X
287
+ self.y = y
288
+
289
+ def __len__(self):
290
+ return len(self.X)
291
+
292
+ def __getitem__(self, idx):
293
+ return self.X[idx], self.y[idx]
294
+
295
+
296
+ class LSTM(BaseEstimator, ClassifierMixin):
297
+ def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
298
+ self.embedding_dim = embedding_dim
299
+ self.hidden_dim = hidden_dim
300
+ self.epochs = epochs
301
+ self.learning_rate = learning_rate
302
+ self.tag2idx = tag2idx
303
+
304
+
305
+
306
+ def fit(self, embedded, encoded_tags):
307
+ #print('LSTM started:', flush=True)
308
+ data = Ner_Dataset(embedded, encoded_tags)
309
+ train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
310
+
311
+ self.model = self.train_LSTM(train_loader)
312
+ #print('--> Epochs: ', self.epochs, flush=True)
313
+ #print('--> Learning Rate: ', self.learning_rate)
314
+ return self
315
+
316
+ def predict(self, X):
317
+ # Switch to evaluation mode
318
+
319
+ test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
320
+
321
+ self.model.eval()
322
+ predictions = []
323
+
324
+ # Iterate through test data
325
+ with torch.no_grad():
326
+ for X_batch in test_loader:
327
+ X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
328
+
329
+ tag_scores = self.model(X_batch)
330
+ _, predicted_tags = torch.max(tag_scores, dim=2)
331
+
332
+ flattened_pred = predicted_tags.view(-1)
333
+
334
+ predictions.append(list(flattened_pred.cpu().numpy()))
335
+
336
+
337
+ #print('before concat',predictions)
338
+ #predictions = np.concatenate(predictions)
339
+ #print('after concat',predictions)
340
+
341
+ tag_encoder = LabelEncoder()
342
+ tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
343
+
344
+ str_pred = []
345
+ for sentence in predictions:
346
+ str_sentence = tag_encoder.inverse_transform(sentence)
347
+ str_pred.append(list(str_sentence))
348
+ return str_pred
349
+
350
+
351
+ def train_LSTM(self, train_loader):
352
+
353
+ input_dim = self.embedding_dim
354
+ # Instantiate the lstm_model
355
+ lstm_model = BiLSTM_NER(input_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
356
+ lstm_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
357
+
358
+ # Loss function and optimizer
359
+ loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
360
+ optimizer = optim.Adam(lstm_model.parameters(), lr=self.learning_rate)
361
+ #print('--> Training LSTM')
362
+
363
+ # Training loop
364
+ for epoch in range(self.epochs):
365
+ total_loss = 0
366
+ total_correct = 0
367
+ total_words = 0
368
+ lstm_model.train() # Set model to training mode
369
+
370
+ for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
371
+ X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
372
+
373
+ # Zero gradients
374
+ optimizer.zero_grad()
375
+
376
+ # Forward pass
377
+ tag_scores = lstm_model(X_batch)
378
+
379
+ # Reshape and compute loss (ignore padded values)
380
+ loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
381
+
382
+ # Backward pass and optimization
383
+ loss.backward()
384
+ optimizer.step()
385
+
386
+ total_loss += loss.item()
387
+
388
+ # Compute accuracy for this batch
389
+ # Get the predicted tags (index of max score)
390
+ _, predicted_tags = torch.max(tag_scores, dim=2)
391
+
392
+ # Flatten the tensors to compare word-by-word
393
+ flattened_pred = predicted_tags.view(-1)
394
+ flattened_true = y_batch.view(-1)
395
+
396
+ # Exclude padding tokens from the accuracy calculation
397
+ mask = flattened_true != PAD_VALUE
398
+ correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
399
+
400
+ # Count the total words in the batch (ignoring padding)
401
+ total_words_batch = mask.sum().item()
402
+
403
+ # Update total correct and total words
404
+ total_correct += correct
405
+ total_words += total_words_batch
406
+
407
+ avg_loss = total_loss / len(train_loader)
408
+ avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
409
+
410
+ #print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
411
+
412
+ return lstm_model
413
+
414
+
415
+ # Define the FeedForward NN Model
416
+ class FeedForwardNN_NER(nn.Module):
417
+ def __init__(self, embedding_dim, hidden_dim, tagset_size):
418
+ super(FeedForwardNN_NER, self).__init__()
419
+ self.fc1 = nn.Linear(embedding_dim, hidden_dim)
420
+ self.relu = nn.ReLU()
421
+ self.fc2 = nn.Linear(hidden_dim, tagset_size)
422
+
423
+ def forward(self, x):
424
+ x = self.fc1(x)
425
+ x = self.relu(x)
426
+ logits = self.fc2(x)
427
+ return logits
428
+
429
+
430
+
431
+ class FeedforwardNN(BaseEstimator, ClassifierMixin):
432
+ def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
433
+ self.embedding_dim = embedding_dim
434
+ self.hidden_dim = hidden_dim
435
+ self.epochs = epochs
436
+ self.learning_rate = learning_rate
437
+ self.tag2idx = tag2idx
438
+
439
+
440
+
441
+ def fit(self, embedded, encoded_tags):
442
+ print('Feed Forward NN: ', flush=True)
443
+ data = Ner_Dataset(embedded, encoded_tags)
444
+ train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
445
+
446
+ self.model = self.train_FF(train_loader)
447
+ print('--> Feed Forward trained', flush=True)
448
+ return self
449
+
450
+ def predict(self, X):
451
+ # Switch to evaluation mode
452
+
453
+ test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
454
+
455
+ self.model.eval()
456
+ predictions = []
457
+
458
+ # Iterate through test data
459
+ with torch.no_grad():
460
+ for X_batch in test_loader:
461
+ X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
462
+
463
+ tag_scores = self.model(X_batch)
464
+ _, predicted_tags = torch.max(tag_scores, dim=2)
465
+
466
+ # Flatten the tensors to compare word-by-word
467
+ flattened_pred = predicted_tags.view(-1)
468
+ predictions.append(flattened_pred.cpu().numpy())
469
+
470
+ # Decode integer predictions back to tag strings (same label set as LSTM.predict)
+ tag_encoder = LabelEncoder()
+ tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
+ str_pred = []
471
+ for sentence in predictions:
472
+ str_sentence = tag_encoder.inverse_transform(sentence)
473
+ str_pred.append(list(str_sentence))
474
+ return str_pred
475
+
476
+
477
+ def train_FF(self, train_loader):
478
+
479
+
480
+
481
+ # Instantiate the lstm_model
482
+ ff_model = FeedForwardNN_NER(self.embedding_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
483
+ ff_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
484
+
485
+ # Loss function and optimizer
486
+ loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
487
+ optimizer = optim.Adam(ff_model.parameters(), lr=self.learning_rate)
488
+ print('--> Training FF')
489
+
490
+ # Training loop
491
+ for epoch in range(self.epochs):
492
+ total_loss = 0
493
+ total_correct = 0
494
+ total_words = 0
495
+ ff_model.train() # Set model to training mode
496
+
497
+ for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
498
+ X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
499
+
500
+ # Zero gradients
501
+ optimizer.zero_grad()
502
+
503
+ # Forward pass
504
+ tag_scores = ff_model(X_batch)
505
+
506
+ # Reshape and compute loss (ignore padded values)
507
+ loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
508
+
509
+ # Backward pass and optimization
510
+ loss.backward()
511
+ optimizer.step()
512
+
513
+ total_loss += loss.item()
514
+
515
+ # Compute accuracy for this batch
516
+ # Get the predicted tags (index of max score)
517
+ _, predicted_tags = torch.max(tag_scores, dim=2)
518
+
519
+ # Flatten the tensors to compare word-by-word
520
+ flattened_pred = predicted_tags.view(-1)
521
+ flattened_true = y_batch.view(-1)
522
+
523
+ # Exclude padding tokens from the accuracy calculation
524
+ mask = flattened_true != PAD_VALUE
525
+ correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
526
+
527
+ # Count the total words in the batch (ignoring padding)
528
+ total_words_batch = mask.sum().item()
529
+
530
+ # Update total correct and total words
531
+ total_correct += correct
532
+ total_words += total_words_batch
533
+
534
+ avg_loss = total_loss / len(train_loader)
535
+ avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
536
+
537
+ print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
538
+
539
+ return ff_model
540
+
541
+ crf = sklearn_crfsuite.CRF(
542
+ algorithm='lbfgs',
543
+ c1=0.1,
544
+ c2=0.1,
545
+ max_iterations=100,
546
+ all_possible_transitions=True)
547
+
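
A minimal sketch (not part of the commit) of how one of the new pipelines might be assembled from these components. The column train['tokens'], the variable encoded_tags, and the tag set are assumptions taken from the comments and the hard-coded labels in LSTM.predict; the GloVe, TF-IDF and Bio2Vec variants would swap the embedding step, and the CRF variant would swap both the embedding and model steps.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from customFunctions import preprocess_sentences, Word2VecTransformer, LSTM, EMBEDDING_DIM

# Same label set as the one hard-coded in LSTM.predict.
tag_encoder = LabelEncoder()
tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
tag2idx = {tag: idx for idx, tag in enumerate(tag_encoder.classes_)}

ner_pipeline = Pipeline([
    ('preprocess', preprocess_sentences()),
    ('embeddings', Word2VecTransformer()),
    ('model', LSTM(embedding_dim=EMBEDDING_DIM, tag2idx=tag2idx)),
])

# train['tokens'] is assumed to be a pandas Series of token arrays and
# encoded_tags the matching per-sentence integer label sequences.
# ner_pipeline.fit(train['tokens'], encoded_tags)
# predictions = ner_pipeline.predict(test['tokens'])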