Hyacinthax committed
Commit 206a44a · verified · 1 Parent(s): 4ec6a0f

Fixed truncating


Removed truncating in favor of more padding; resized any mismatched sequences to the largest shape. Fixed other minor issues.
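For context, a minimal sketch of the Keras pad_sequences behavior in play here (assuming the tf.keras pad_sequences that chatbotTrainer.py imports): dropping truncating='post' does not disable truncation while maxlen is still set; it only reverts to the default truncating='pre', which cuts overlong sequences from the front instead of the back.

from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5, 6, 7, 8]]

# New behavior: padding only, default truncating='pre'
print(pad_sequences(seqs, maxlen=4, padding='post'))
# [[1 2 3 0]
#  [5 6 7 8]]   <- the overlong row is still cut, from the front

# Old behavior: truncating='post' cut overlong rows from the back
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post'))
# [[1 2 3 0]
#  [4 5 6 7]]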

Files changed (2)
  1. chatbotTrainer.py +14 -25
  2. runCorpus.py +21 -8
chatbotTrainer.py CHANGED
@@ -469,37 +469,32 @@ class ChatbotTrainer:
            if word not in self.vocabularyList:
                self.vocabularyList.append(word)

+       self.save_tokenizer(self.vocabularyList)
+
        return txt

    # Training
    def preprocess_texts(self, input_texts, target_texts):
        input_texts = [self.clean_text(text) for text in input_texts.split(" ")]
        target_texts = [self.clean_text(text) for text in target_texts.split(" ")]
-       self.save_tokenizer(self.vocabularyList)
+
        # Initialize lists to store processed inputs and targets
-       input_texts = [f"<start> {texts} <end>" for texts in input_texts if input_texts and input_texts != "" and input_texts is not None]
-       target_texts = [f"<start> {texts} <end>" for texts in target_texts if target_texts and target_texts != "" and target_texts is not None]
+       input_texts = [f"<start> {texts} <end>" for texts in input_texts if input_texts and input_texts != ""]
+       target_texts = [f"<start> {texts} <end>" for texts in target_texts if target_texts and target_texts != ""]

-       input_sequences = self.tokenizer.texts_to_sequences(input_texts)  # [0]
-       target_sequences = self.tokenizer.texts_to_sequences(target_texts)  # [0]
+       input_sequences = self.tokenizer.texts_to_sequences(input_texts)
+       target_sequences = self.tokenizer.texts_to_sequences(target_texts)

-       input_sequences = pad_sequences(input_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')  # [0]
-       target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')  # [0]
-       target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')
+       input_sequences = pad_sequences(input_sequences, maxlen=self.max_seq_length, padding='post')
+       target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post')

        # Ensure target_sequences has enough samples
        if target_sequences.shape[0] != input_sequences.shape[0]:
            print(f"Padding mismatch! Input: {input_sequences.shape}, Target: {target_sequences.shape}")
-           target_sequences = np.resize(target_sequences, input_sequences.shape)  # Resize if necessary
-
-       # Ensure both lists have the same number of sequences
-       min_samples = min(len(input_sequences), len(target_sequences))
-       input_sequences = input_sequences[:min_samples]
-       target_sequences = target_sequences[:min_samples]
-
-       print(f"Preprocessed Encoder Input Shape: {input_sequences.shape}")
-       print(f"Preprocessed Decoder Input Shape: {target_sequences.shape}")
-       print(f"Preprocessed Decoder Target Shape: {target_sequences.shape}")
+           if target_sequences.shape[0] < input_sequences.shape[0]:
+               target_sequences = np.resize(target_sequences, input_sequences.shape)
+           if target_sequences.shape[0] > input_sequences.shape[0]:
+               input_sequences = np.resize(input_sequences, target_sequences.shape)

        return input_sequences, target_sequences

@@ -627,7 +622,7 @@ class ChatbotTrainer:
            raise ValueError("Corpus or tokenizer is not initialized.")

        # Preprocess the texts into sequences
-       input_sequences, target_sequences = input_texts, target_texts
+       input_sequences, target_sequences = self.preprocess_texts(input_texts, target_texts)

        # Debug Lines
        # for token in ['<start>', '<end>', '<oov>']:

@@ -643,12 +638,6 @@ class ChatbotTrainer:
        decoder_input_data = target_sequences[:, :-1]
        decoder_target_data = target_sequences[:, 1:]

-       min_samples = min(encoder_input_data.shape[0], decoder_input_data.shape[0])
-
-       encoder_input_data = encoder_input_data[:min_samples]
-       decoder_input_data = decoder_input_data[:min_samples]
-       decoder_target_data = decoder_target_data[:min_samples]
-
        self.logger.info(f"Encoder Input Data Shape: {encoder_input_data.shape}")
        self.logger.info(f"Decoder Input Data Shape: {decoder_input_data.shape}")
        self.logger.info(f"Decoder Target Data Shape: {decoder_target_data.shape}")
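One caveat worth knowing about the mismatch handling above: np.resize fills the enlarged array by repeating the source data cyclically, not by zero-padding, so rows added this way are recycled sequences rather than blank padding. A quick illustration in plain NumPy:

import numpy as np

a = np.array([[1, 2], [3, 4]])   # shape (2, 2)
print(np.resize(a, (3, 2)))
# [[1 2]
#  [3 4]
#  [1 2]]   <- the extra row repeats the data; it is not zero-filled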
 
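The decoder_input_data / decoder_target_data slices kept as context in the last hunk are the standard teacher-forcing shift for seq2seq training. A toy illustration (token ids invented for the example; assume 1 = <start>, 2 = <end>, 0 = padding):

import numpy as np

target_sequences = np.array([[1, 7, 8, 2, 0],
                             [1, 5, 2, 0, 0]])

decoder_input_data = target_sequences[:, :-1]    # what the decoder sees
decoder_target_data = target_sequences[:, 1:]    # what it should predict next

print(decoder_input_data)    # [[1 7 8 2] [1 5 2 0]]
print(decoder_target_data)   # [[7 8 2 0] [5 2 0 0]]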
runCorpus.py CHANGED
@@ -4,6 +4,7 @@ from playsound3 import playsound
import tensorflow
from chatbotTrainer import ChatbotTrainer
import time
+import numpy as np
import random
import pdb
import sys

@@ -19,7 +20,7 @@ class CorpusTrainer:
        self.choices_yes = ["yes", "ya", "yeah", "yessir", "yesir", "y", "ye", "yah"]
        self.exit_commands = ["exit", "quit", "stop", "x", "q", ""]

-       self.log_file = "self.failure_history.txt"
+       self.log_file = "failure_history.txt"
        self.counter = 0
        self.bad_count = 0
        self.top_num = 0

@@ -91,11 +92,23 @@ class CorpusTrainer:

        # Input conversation data into input and target data from dialog pairs
        for input_text, target_text in dialog_pairs:
+           self.speaker_input_texts = []
+           self.speaker_target_texts = []
+           input_shape = np.array(input_text).shape
+           target_shape = np.array(target_text).shape
+           if input_shape in [(1, 64), (1, 63)] or target_shape in [(1, 64), (1, 63)]:
+               print(f"Conversation {speaker} skipped for NOT providing properly shaped data... ")
+               continue
+
+           if len(input_text) < 3 or len(target_text) < 3:
+               print(f"Conversation {speaker} skipped for NOT providing enough data... ")
+               continue
+
            if input_text != "" and target_text != "":
-               self.speaker_input_texts.append(input_text)
-               self.all_input_texts.append(input_text)
-               self.speaker_target_texts.append(target_text)
-               self.all_target_texts.append(target_text)
+               self.speaker_input_texts.append(input_text.strip())
+               self.all_input_texts.append(input_text.strip())
+               self.speaker_target_texts.append(target_text.strip())
+               self.all_target_texts.append(target_text.strip())


        if self.failsafe_trigger is False:

@@ -105,8 +118,6 @@ class CorpusTrainer:
            self.top_num = self.conversation_id

        print(f"Conversation: {self.conversation_id}")
-       input_text, target_text = chatbot_trainer.preprocess_texts(input_text, target_text)
-       data = [input_text, target_text]

        # Limit is defined within -3 of the early_patience, meaning if it gets close we're adding it to the list
        limit = self.chatbot_trainer.early_patience - 3

@@ -120,6 +131,8 @@ class CorpusTrainer:
            # time.sleep(1)
            return self.chatbot_trainer, user_choice, dialog_data, topConvo, self.top_num, self.failsafe_trigger

+       data = [input_text, target_text]
+
        # User Choices
        if user_choice in self.choices_yes and play_notification in self.choices_yes:
            self.user_yes(speaker=speaker, data=data, limit=limit, play_notification=play_notification)

@@ -312,7 +325,7 @@ class CorpusTrainer:
            print("No failure data found.")
            return

-       with open("self.failure_history.txt", "r") as f:
+       with open("failure_history.txt", "r") as f:
            self.failure_history = [int(line.strip()) for line in f.readlines()]

        if len(self.failure_history) == 0:
 
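On the new shape guard in runCorpus.py: np.array(...) of a plain string is a 0-d array, so the (1, 64)/(1, 63) check only fires when a dialog pair arrives as a nested list or array row rather than raw text. A small demonstration (the 64-id row is a hypothetical stand-in for a stray preprocessed sequence):

import numpy as np

print(np.array("hello there").shape)       # () -- a bare string is 0-d
print(np.array(["hello", "there"]).shape)  # (2,)

row = [list(range(64))]                    # hypothetical stray batch row
print(np.array(row).shape)                 # (1, 64) -- this pair would be skipped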