Fixed truncating

Removed truncating in favor of more padding; any mismatched batches are now resized to the larger shape. Fixed other minor issues.
- chatbotTrainer.py +14 -25
- runCorpus.py +21 -8
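
For context on the description above, this is the behavioral difference the commit targets: with truncating='post', pad_sequences silently drops every token past maxlen, while padding alone (with maxlen sized to the longest sequence) keeps everything, and np.resize can then grow whichever batch is smaller. A minimal sketch with made-up toy sequences, not code from the repo:

    import numpy as np
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    seqs = [[5, 6, 7], [1, 2, 3, 4, 5, 6]]

    # Old behavior: truncating='post' cuts everything past maxlen
    old = pad_sequences(seqs, maxlen=4, padding='post', truncating='post')
    # -> [[5 6 7 0]
    #     [1 2 3 4]]   the second sequence loses tokens 5 and 6

    # New behavior: pad only, with maxlen covering the longest sequence
    new = pad_sequences(seqs, maxlen=6, padding='post')
    # -> [[5 6 7 0 0 0]
    #     [1 2 3 4 5 6]]   every token kept

    # np.resize fills the larger shape by repeating rows of the smaller array
    small = np.array([[1, 2], [3, 4]])
    print(np.resize(small, (3, 2)))  # [[1 2] [3 4] [1 2]]
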
chatbotTrainer.py
CHANGED

@@ -469,37 +469,32 @@ class ChatbotTrainer:
         if word not in self.vocabularyList:
             self.vocabularyList.append(word)

+        self.save_tokenizer(self.vocabularyList)
+
         return txt

     # Training
     def preprocess_texts(self, input_texts, target_texts):
         input_texts = [self.clean_text(text) for text in input_texts.split(" ")]
         target_texts = [self.clean_text(text) for text in target_texts.split(" ")]
-
+
         # Initialize lists to store processed inputs and targets
-        input_texts = [f"<start> {texts} <end>" for texts in input_texts if input_texts and input_texts != ""]
-        target_texts = [f"<start> {texts} <end>" for texts in target_texts if target_texts and target_texts != ""]
+        input_texts = [f"<start> {texts} <end>" for texts in input_texts if input_texts and input_texts != ""]
+        target_texts = [f"<start> {texts} <end>" for texts in target_texts if target_texts and target_texts != ""]

-        input_sequences = self.tokenizer.texts_to_sequences(input_texts)
-        target_sequences = self.tokenizer.texts_to_sequences(target_texts)
+        input_sequences = self.tokenizer.texts_to_sequences(input_texts)
+        target_sequences = self.tokenizer.texts_to_sequences(target_texts)

-        input_sequences = pad_sequences(input_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')
-        target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')
-        target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')
+        input_sequences = pad_sequences(input_sequences, maxlen=self.max_seq_length, padding='post')
+        target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post')

         # Ensure target_sequences has enough samples
         if target_sequences.shape[0] != input_sequences.shape[0]:
             print(f"Padding mismatch! Input: {input_sequences.shape}, Target: {target_sequences.shape}")
-            min_samples = min(input_sequences.shape[0], target_sequences.shape[0])
-
-            input_sequences = input_sequences[:min_samples]
-            target_sequences = target_sequences[:min_samples]
-
-        print(f"Preprocessed Encoder Input Shape: {input_sequences.shape}")
-        print(f"Preprocessed Decoder Input Shape: {target_sequences.shape}")
-        print(f"Preprocessed Decoder Target Shape: {target_sequences.shape}")
+            if target_sequences.shape[0] < input_sequences.shape[0]:
+                target_sequences = np.resize(target_sequences, input_sequences.shape)
+            if target_sequences.shape[0] > input_sequences.shape[0]:
+                input_sequences = np.resize(input_sequences, target_sequences.shape)

         return input_sequences, target_sequences

@@ -627,7 +622,7 @@ class ChatbotTrainer:
             raise ValueError("Corpus or tokenizer is not initialized.")

         # Preprocess the texts into sequences
-        input_sequences, target_sequences = input_texts, target_texts
+        input_sequences, target_sequences = self.preprocess_texts(input_texts, target_texts)

         # Debug Lines
         # for token in ['<start>', '<end>', '<oov>']:

@@ -643,12 +638,6 @@ class ChatbotTrainer:
         decoder_input_data = target_sequences[:, :-1]
         decoder_target_data = target_sequences[:, 1:]

-        min_samples = min(encoder_input_data.shape[0], decoder_input_data.shape[0])
-
-        encoder_input_data = encoder_input_data[:min_samples]
-        decoder_input_data = decoder_input_data[:min_samples]
-        decoder_target_data = decoder_target_data[:min_samples]
-
        self.logger.info(f"Encoder Input Data Shape: {encoder_input_data.shape}")
        self.logger.info(f"Decoder Input Data Shape: {decoder_input_data.shape}")
        self.logger.info(f"Decoder Target Data Shape: {decoder_target_data.shape}")
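
The last hunk leans on the decoder tensors built just above it: target_sequences is shifted by one position to produce the decoder input and decoder target (standard teacher forcing), which is why the input and target batch dimensions must already agree by the time these slices are taken — the job the np.resize alignment in preprocess_texts now does instead of the removed min_samples clamp. A toy illustration with made-up token ids (1 = <start>, 2 = <end>, 0 = padding):

    import numpy as np

    target_sequences = np.array([
        [1, 7, 8, 2, 0],   # <start> w1 w2 <end> pad
        [1, 9, 2, 0, 0],   # <start> w1 <end> pad pad
    ])

    decoder_input_data = target_sequences[:, :-1]   # [[1 7 8 2]
                                                    #  [1 9 2 0]]
    decoder_target_data = target_sequences[:, 1:]   # [[7 8 2 0]
                                                    #  [9 2 0 0]]
    # At step t the decoder sees decoder_input_data[:, t] and is trained
    # to predict decoder_target_data[:, t], i.e. the next token.
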
runCorpus.py
CHANGED

@@ -4,6 +4,7 @@ from playsound3 import playsound
 import tensorflow
 from chatbotTrainer import ChatbotTrainer
 import time
+import numpy as np
 import random
 import pdb
 import sys

@@ -19,7 +20,7 @@ class CorpusTrainer:
         self.choices_yes = ["yes", "ya", "yeah", "yessir", "yesir", "y", "ye", "yah"]
         self.exit_commands = ["exit", "quit", "stop", "x", "q", ""]

-        self.log_file = "
+        self.log_file = "failure_history.txt"
         self.counter = 0
         self.bad_count = 0
         self.top_num = 0

@@ -91,11 +92,23 @@ class CorpusTrainer:

         # Input conversation data into input and target data from dialog pairs
         for input_text, target_text in dialog_pairs:
+            self.speaker_input_texts = []
+            self.speaker_target_texts = []
+            input_shape = np.array(input_text).shape
+            target_shape = np.array(target_text).shape
+            if input_shape in [(1, 64), (1, 63)] or target_shape in [(1, 64), (1, 63)]:
+                print(f"Conversation {speaker} skipped for NOT providing properly shaped data... ")
+                continue
+
+            if len(input_text) < 3 or len(target_text) < 3:
+                print(f"Conversation {speaker} skipped for NOT providing enough data... ")
+                continue
+
             if input_text != "" and target_text != "":
-                self.speaker_input_texts.append(input_text)
-                self.all_input_texts.append(input_text)
-                self.speaker_target_texts.append(target_text)
-                self.all_target_texts.append(target_text)
+                self.speaker_input_texts.append(input_text.strip())
+                self.all_input_texts.append(input_text.strip())
+                self.speaker_target_texts.append(target_text.strip())
+                self.all_target_texts.append(target_text.strip())


         if self.failsafe_trigger is False:

@@ -105,8 +118,6 @@ class CorpusTrainer:
             self.top_num = self.conversation_id

         print(f"Conversation: {self.conversation_id}")
-        input_text, target_text = chatbot_trainer.preprocess_texts(input_text, target_text)
-        data = [input_text, target_text]

         # Limit is defined within -3 of the early_patience, meaning if it gets close we're adding it to the list
         limit = self.chatbot_trainer.early_patience - 3

@@ -120,6 +131,8 @@ class CorpusTrainer:
             # time.sleep(1)
             return self.chatbot_trainer, user_choice, dialog_data, topConvo, self.top_num, self.failsafe_trigger

+        data = [input_text, target_text]
+
         # User Choices
         if user_choice in self.choices_yes and play_notification in self.choices_yes:
             self.user_yes(speaker=speaker, data=data, limit=limit, play_notification=play_notification)

@@ -312,7 +325,7 @@ class CorpusTrainer:
            print("No failure data found.")
            return

-        with open("
+        with open("failure_history.txt", "r") as f:
            self.failure_history = [int(line.strip()) for line in f.readlines()]

        if len(self.failure_history) == 0:
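
The new guards in the dialog loop can be read in isolation: pairs with a side shorter than three characters are skipped, and surviving texts are stripped before being collected (the shape check against (1, 64)/(1, 63) applies the same skip to array-shaped pairs). A standalone sketch with hypothetical dialog_pairs:

    dialog_pairs = [
        ("hi", "yo"),                     # skipped: under 3 characters
        ("how are you ", " doing fine"),  # kept, whitespace stripped
        ("", "no reply on the other side"),  # skipped: empty input side
    ]

    input_texts, target_texts = [], []
    for input_text, target_text in dialog_pairs:
        if len(input_text) < 3 or len(target_text) < 3:
            continue
        if input_text != "" and target_text != "":
            input_texts.append(input_text.strip())
            target_texts.append(target_text.strip())

    print(input_texts)   # ['how are you']
    print(target_texts)  # ['doing fine']
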