Update customFunctions.py for new pipelines
Added the updated functions. I have not added the Bio2Vec vectors file, since I'm not sure it is needed: the pipeline is already huge, so the vectors should already be in there?
- customFunctions.py +547 -470
customFunctions.py
CHANGED
@@ -1,470 +1,547 @@
[Removed: the previous 470-line version of customFunctions.py. The deleted side of the diff did not survive extraction beyond scattered fragments (its opening imports of pandas, numpy, random, torch, torch.nn, torch.optim and partial class skeletons), so it is elided here.]
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from seqeval.metrics import accuracy_score, f1_score, classification_report
from seqeval.scheme import IOB2
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec, KeyedVectors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api
from itertools import product
from sklearn.model_selection import train_test_split, GridSearchCV
from joblib import dump

class preprocess_sentences():
    def __init__(self):
        pass

    def fit(self, X, y=None):
        print('PREPROCESSING')
        return self

    def transform(self, X):
        # X = train['tokens'], y =
        sentences = X.apply(lambda x: x.tolist()).tolist()
        print('--> Preprocessing complete \n', flush=True)
        return sentences

EMBEDDING_DIM = 500
PAD_VALUE = -1
MAX_LENGTH = 376
BATCH_SIZE = 16

class Word2VecTransformer():
    def __init__(self, vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=1, embedding_dim=EMBEDDING_DIM):
        self.model = None
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.embedding_dim = embedding_dim

    def fit(self, X, y):
        # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
        # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
        print('WORD2VEC:', flush=True)
        # This fits the word2vec model
        self.model = Word2Vec(sentences=X, vector_size=self.vector_size, window=self.window,
                              min_count=self.min_count, workers=self.workers)
        print('--> Word2Vec Fitted', flush=True)
        return self

    def transform(self, X):
        # This bit should transform the sentences
        embedded_sentences = []

        for sentence in X:
            sentence_vectors = []

            for word in sentence:
                if word in self.model.wv:
                    vec = self.model.wv[word]
                else:
                    vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))

                sentence_vectors.append(vec)

            embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
        print('--> Embeddings Complete \n', flush=True)

        return embedded_sentences

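An illustrative check (not in the committed file) of what transform returns: one float32 tensor of shape (sentence_length, embedding_dim) per sentence, with random vectors standing in for out-of-vocabulary words. The toy sentences and tags below are assumptions for demonstration only.

w2v = Word2VecTransformer(vector_size=50, embedding_dim=50)
toy_sentences = [['the', 'cat', 'sat'], ['a', 'dog', 'barked', 'loudly']]
toy_tags = [['O', 'O', 'O'], ['O', 'O', 'O', 'O']]    # fit() takes a y argument, unused by Word2Vec
w2v.fit(toy_sentences, toy_tags)
print(w2v.transform(toy_sentences)[1].shape)          # torch.Size([4, 50])
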
class Word2VecTransformer_CRF():
    def __init__(self, vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=1, embedding_dim=EMBEDDING_DIM):
        self.model = None
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.embedding_dim = embedding_dim

    def fit(self, X, y):
        # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
        # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
        print('WORD2VEC:', flush=True)
        # This fits the word2vec model
        self.model = Word2Vec(sentences=X, vector_size=self.vector_size, window=self.window,
                              min_count=self.min_count, workers=self.workers)
        print('--> Word2Vec Fitted', flush=True)
        return self

    def transform(self, X):
        # This bit should transform the sentences
        embedded_sentences = []

        for sentence in X:
            sentence_vectors = []

            for word in sentence:
                features = {
                    'bias': 1.0,
                    'word.lower()': word.lower(),
                    'word[-3:]': word[-3:],
                    'word[-2:]': word[-2:],
                    'word.isupper()': word.isupper(),
                    'word.istitle()': word.istitle(),
                    'word.isdigit()': word.isdigit(),
                }
                if word in self.model.wv:
                    vec = self.model.wv[word]
                else:
                    vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))

                # https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
                for index in range(len(vec)):
                    features[f"embedding_{index}"] = vec[index]

                sentence_vectors.append(features)

            embedded_sentences.append(sentence_vectors)
        print('--> Embeddings Complete \n', flush=True)

        return embedded_sentences

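For illustration (not in the committed file): each token becomes a sklearn-crfsuite feature dict that mixes the surface features above with one embedding_i key per Word2Vec dimension. A tiny, hedged example with made-up tokens and a small vector size:

featurizer = Word2VecTransformer_CRF(vector_size=10, embedding_dim=10)
featurizer.fit([['AIDS', 'is', 'an', 'acronym']], [['B-AC', 'O', 'O', 'O']])
token_features = featurizer.transform([['AIDS', 'is', 'an', 'acronym']])[0][0]
print(token_features['word.lower()'], token_features['word.isupper()'])  # aids True
print(len([k for k in token_features if k.startswith('embedding_')]))    # 10
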
class tfidfTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.model = None
        self.embedding_dim = None
        self.idf = None
        self.vocab_size = None
        self.vocab = None

    def fit(self, X, y=None):
        print('TFIDF:', flush=True)
        joined_sentences = [' '.join(tokens) for tokens in X]
        self.model = TfidfVectorizer()
        self.model.fit(joined_sentences)
        self.vocab = self.model.vocabulary_
        self.idf = self.model.idf_
        self.vocab_size = len(self.vocab)
        self.embedding_dim = self.vocab_size
        print('--> TFIDF Fitted', flush=True)
        return self

    def transform(self, X):
        embedded = []
        for sentence in X:
            sent_vecs = []
            token_counts = {}
            for word in sentence:
                token_counts[word] = token_counts.get(word, 0) + 1

            sent_len = len(sentence)
            for word in sentence:
                vec = np.zeros(self.vocab_size)
                if word in self.vocab:
                    tf = token_counts[word] / sent_len
                    token_idx = self.vocab[word]
                    vec[token_idx] = tf * self.idf[token_idx]
                sent_vecs.append(vec)
            embedded.append(torch.tensor(sent_vecs, dtype=torch.float32))
        print('--> Embeddings Complete \n', flush=True)

        return embedded

class GloveTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.model = None
        self.embedding_dim = 300

    def fit(self, X, y=None):
        print('GLOVE', flush=True)
        self.model = api.load('glove-wiki-gigaword-300')
        print('--> Glove Downloaded', flush=True)
        return self

    def transform(self, X):
        # This bit should transform the sentences
        print('--> Beginning embeddings', flush=True)
        embedded_sentences = []

        for sentence in X:
            sentence_vectors = []

            for word in sentence:
                if word in self.model:
                    vec = self.model[word]
                else:
                    vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))

                sentence_vectors.append(vec)

            embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
        print('--> Embeddings Complete \n', flush=True)

        return embedded_sentences

class Bio2VecTransformer():
    def __init__(self, vector_size=200, window=5, min_count=1, workers=1, embedding_dim=200):
        self.model = None
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.embedding_dim = embedding_dim

    def fit(self, X, y):
        print('BIO2VEC:', flush=True)
        # https://stackoverflow.com/questions/58055415/how-to-load-bio2vec-in-gensim
        # Bio2VecModel is expected to be the pre-trained Bio2Vec vectors loaded elsewhere;
        # the vectors file is not part of this commit (see the commit message above).
        self.model = Bio2VecModel
        print('--> BIO2VEC Fitted', flush=True)
        return self

    def transform(self, X):
        # This bit should transform the sentences
        embedded_sentences = []

        for sentence in X:
            sentence_vectors = []

            for word in sentence:
                if word in self.model:
                    vec = self.model[word]
                else:
                    vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))

                sentence_vectors.append(vec)

            embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
        print('--> Embeddings Complete \n', flush=True)

        return embedded_sentences

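Bio2VecModel is referenced above but never defined in this module. Following the Stack Overflow link in fit, one hedged way to supply it would be to load pre-trained Bio2Vec-style vectors in word2vec binary format with gensim's KeyedVectors (already imported at the top); the file path below is a placeholder, not something shipped with this commit.

# Placeholder path; the actual vectors file is not included in this repository.
Bio2VecModel = KeyedVectors.load_word2vec_format('bio2vec_vectors.bin', binary=True)
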
class BiLSTM_NER(nn.Module):
    def __init__(self, input_dim, hidden_dim, tagset_size):
        super(BiLSTM_NER, self).__init__()

        # Embedding layer
        # freeze=False means that it will fine-tune
        #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=-1)

        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentences):
        #embeds = self.embedding(sentences)
        lstm_out, _ = self.lstm(sentences)
        tag_scores = self.fc(lstm_out)

        return tag_scores

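A quick, illustrative shape check (not in the committed file): the model maps a padded batch of pre-embedded sentences of shape (batch, seq_len, input_dim) to per-token tag scores of shape (batch, seq_len, tagset_size). The dimensions below are arbitrary.

toy_model = BiLSTM_NER(input_dim=50, hidden_dim=32, tagset_size=4)
toy_batch = torch.randn(2, 7, 50)      # (batch, seq_len, input_dim)
print(toy_model(toy_batch).shape)      # torch.Size([2, 7, 4])
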
def pad(batch):
    # batch is a list of (X, y) pairs
    X_batch, y_batch = zip(*batch)

    # Convert to tensors
    X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in X_batch]
    y_batch = [torch.tensor(seq, dtype=torch.long) for seq in y_batch]

    # Pad sequences
    X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
    y_padded = pad_sequence(y_batch, batch_first=True, padding_value=PAD_VALUE)

    return X_padded, y_padded

def pred_pad(batch):
    X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in batch]
    X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
    return X_padded

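An illustrative call (not in the committed file) showing how pad aligns variable-length sentences in a batch; shorter sequences are filled with PAD_VALUE so the loss and accuracy code can mask them out later. The toy arrays are assumptions.

toy_batch = [
    (np.zeros((3, 4), dtype=np.float32), [0, 1, 2]),
    (np.ones((5, 4), dtype=np.float32), [0, 0, 1, 1, 2]),
]
X_padded, y_padded = pad(toy_batch)
print(X_padded.shape, y_padded.shape)   # torch.Size([2, 5, 4]) torch.Size([2, 5])
print(y_padded[0])                      # tensor([ 0,  1,  2, -1, -1])
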
class Ner_Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class LSTM(BaseEstimator, ClassifierMixin):
    def __init__(self, embedding_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001, tag2idx=None):
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.tag2idx = tag2idx

    def fit(self, embedded, encoded_tags):
        #print('LSTM started:', flush=True)
        data = Ner_Dataset(embedded, encoded_tags)
        train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)

        self.model = self.train_LSTM(train_loader)
        #print('--> Epochs: ', self.epochs, flush=True)
        #print('--> Learning Rate: ', self.learning_rate)
        return self

    def predict(self, X):
        # Switch to evaluation mode
        test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)

        self.model.eval()
        predictions = []

        # Iterate through test data
        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

                tag_scores = self.model(X_batch)
                _, predicted_tags = torch.max(tag_scores, dim=2)

                flattened_pred = predicted_tags.view(-1)

                predictions.append(list(flattened_pred.cpu().numpy()))

        #print('before concat', predictions)
        #predictions = np.concatenate(predictions)
        #print('after concat', predictions)

        tag_encoder = LabelEncoder()
        tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])

        str_pred = []
        for sentence in predictions:
            str_sentence = tag_encoder.inverse_transform(sentence)
            str_pred.append(list(str_sentence))
        return str_pred

    def train_LSTM(self, train_loader):
        input_dim = self.embedding_dim
        # Instantiate the lstm_model
        lstm_model = BiLSTM_NER(input_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
        lstm_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

        # Loss function and optimizer
        loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE)  # Ignore padding
        optimizer = optim.Adam(lstm_model.parameters(), lr=self.learning_rate)
        #print('--> Training LSTM')

        # Training loop
        for epoch in range(self.epochs):
            total_loss = 0
            total_correct = 0
            total_words = 0
            lstm_model.train()  # Set model to training mode

            for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
                X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                tag_scores = lstm_model(X_batch)

                # Reshape and compute loss (ignore padded values)
                loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

                # Compute accuracy for this batch
                # Get the predicted tags (index of max score)
                _, predicted_tags = torch.max(tag_scores, dim=2)

                # Flatten the tensors to compare word-by-word
                flattened_pred = predicted_tags.view(-1)
                flattened_true = y_batch.view(-1)

                # Exclude padding tokens from the accuracy calculation
                mask = flattened_true != PAD_VALUE
                correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()

                # Count the total words in the batch (ignoring padding)
                total_words_batch = mask.sum().item()

                # Update total correct and total words
                total_correct += correct
                total_words += total_words_batch

            avg_loss = total_loss / len(train_loader)
            avg_accuracy = total_correct / total_words * 100  # Accuracy in percentage

            #print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')

        return lstm_model


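A hedged usage sketch (not in the committed file): fit expects pre-embedded sentences plus integer-encoded tag sequences, and tag2idx is used here only for the number of distinct tags; the mapping below mirrors the alphabetical order LabelEncoder produces in predict. embedded_train and encoded_train_tags are assumed to come from one of the transformers above and a fitted LabelEncoder.

tag2idx = {'B-AC': 0, 'B-LF': 1, 'I-LF': 2, 'O': 3}
lstm_clf = LSTM(embedding_dim=EMBEDDING_DIM, hidden_dim=128, epochs=5,
                learning_rate=0.001, tag2idx=tag2idx)
# lstm_clf.fit(embedded_train, encoded_train_tags)
# string_tag_sequences = lstm_clf.predict(embedded_test)
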
# Define the FeedForward NN Model
class FeedForwardNN_NER(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super(FeedForwardNN_NER, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        logits = self.fc2(x)
        return logits


class FeedforwardNN(BaseEstimator, ClassifierMixin):
    def __init__(self, embedding_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001, tag2idx=None):
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.tag2idx = tag2idx

    def fit(self, embedded, encoded_tags):
        print('Feed Forward NN: ', flush=True)
        data = Ner_Dataset(embedded, encoded_tags)
        train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)

        self.model = self.train_FF(train_loader)
        print('--> Feed Forward trained', flush=True)
        return self

    def predict(self, X):
        # Switch to evaluation mode
        test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)

        self.model.eval()
        predictions = []

        # Iterate through test data
        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

                tag_scores = self.model(X_batch)
                _, predicted_tags = torch.max(tag_scores, dim=2)

                # Flatten the tensors to compare word-by-word
                flattened_pred = predicted_tags.view(-1)
                predictions.append(flattened_pred.cpu().numpy())

        # Decode integer predictions back to tag strings (same label set as LSTM.predict)
        tag_encoder = LabelEncoder()
        tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])

        str_pred = []
        for sentence in predictions:
            str_sentence = tag_encoder.inverse_transform(sentence)
            str_pred.append(list(str_sentence))
        return str_pred

    def train_FF(self, train_loader):
        # Instantiate the ff_model
        ff_model = FeedForwardNN_NER(self.embedding_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
        ff_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

        # Loss function and optimizer
        loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE)  # Ignore padding
        optimizer = optim.Adam(ff_model.parameters(), lr=self.learning_rate)
        print('--> Training FF')

        # Training loop
        for epoch in range(self.epochs):
            total_loss = 0
            total_correct = 0
            total_words = 0
            ff_model.train()  # Set model to training mode

            for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
                X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                tag_scores = ff_model(X_batch)

                # Reshape and compute loss (ignore padded values)
                loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

                # Compute accuracy for this batch
                # Get the predicted tags (index of max score)
                _, predicted_tags = torch.max(tag_scores, dim=2)

                # Flatten the tensors to compare word-by-word
                flattened_pred = predicted_tags.view(-1)
                flattened_true = y_batch.view(-1)

                # Exclude padding tokens from the accuracy calculation
                mask = flattened_true != PAD_VALUE
                correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()

                # Count the total words in the batch (ignoring padding)
                total_words_batch = mask.sum().item()

                # Update total correct and total words
                total_correct += correct
                total_words += total_words_batch

            avg_loss = total_loss / len(train_loader)
            avg_accuracy = total_correct / total_words * 100  # Accuracy in percentage

            print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')

        return ff_model

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True)
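The commit message mentions new pipelines, but this module only defines the building blocks. Below is a hedged sketch of how they could be composed with the Pipeline class imported above; the step names, train_tokens, and the tag variables are assumptions, not part of this commit.

tag2idx = {'B-AC': 0, 'B-LF': 1, 'I-LF': 2, 'O': 3}

lstm_pipeline = Pipeline([
    ('preprocess', preprocess_sentences()),
    ('embed', Word2VecTransformer()),
    ('lstm', LSTM(embedding_dim=EMBEDDING_DIM, tag2idx=tag2idx)),
])

crf_pipeline = Pipeline([
    ('preprocess', preprocess_sentences()),
    ('features', Word2VecTransformer_CRF()),
    ('crf', crf),
])

# lstm_pipeline.fit(train_tokens, encoded_train_tags)
# crf_pipeline.fit(train_tokens, train_tag_sequences)
# dump(crf_pipeline, 'crf_pipeline.joblib')   # persist with the imported joblib.dump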