Hyacinthax committed
Commit 206a44a · verified · 1 Parent(s): 4ec6a0f

Fixed truncating


Removed truncating in favor of more padding; resized any mismatched sequences to the largest shape. Fixed other minor issues.
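For context, a minimal sketch of the Keras pad_sequences behavior in play here (assuming the tf.keras pad_sequences that chatbotTrainer.py imports): dropping truncating='post' does not disable truncation while maxlen is still set; it only reverts to the default truncating='pre', which cuts overlong sequences from the front instead of the back.

from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5, 6, 7, 8]]

# New behavior: padding only, default truncating='pre'
print(pad_sequences(seqs, maxlen=4, padding='post'))
# [[1 2 3 0]
#  [5 6 7 8]]   <- the overlong row is still cut, from the front

# Old behavior: truncating='post' cut overlong rows from the back
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post'))
# [[1 2 3 0]
#  [4 5 6 7]]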

Files changed (2)
  1. chatbotTrainer.py +14 -25
  2. runCorpus.py +21 -8
chatbotTrainer.py CHANGED
@@ -469,37 +469,32 @@ class ChatbotTrainer:
            if word not in self.vocabularyList:
                self.vocabularyList.append(word)

+       self.save_tokenizer(self.vocabularyList)
+
        return txt

    # Training
    def preprocess_texts(self, input_texts, target_texts):
        input_texts = [self.clean_text(text) for text in input_texts.split(" ")]
        target_texts = [self.clean_text(text) for text in target_texts.split(" ")]
-       self.save_tokenizer(self.vocabularyList)
+
        # Initialize lists to store processed inputs and targets
-       input_texts = [f"<start> {texts} <end>" for texts in input_texts if input_texts and input_texts != "" and input_texts is not None]
-       target_texts = [f"<start> {texts} <end>" for texts in target_texts if target_texts and target_texts != "" and target_texts is not None]
+       input_texts = [f"<start> {texts} <end>" for texts in input_texts if input_texts and input_texts != ""]
+       target_texts = [f"<start> {texts} <end>" for texts in target_texts if target_texts and target_texts != ""]

-       input_sequences = self.tokenizer.texts_to_sequences(input_texts)  # [0]
-       target_sequences = self.tokenizer.texts_to_sequences(target_texts)  # [0]
+       input_sequences = self.tokenizer.texts_to_sequences(input_texts)
+       target_sequences = self.tokenizer.texts_to_sequences(target_texts)

-       input_sequences = pad_sequences(input_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')  # [0]
-       target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')  # [0]
-       target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post', truncating='post')
+       input_sequences = pad_sequences(input_sequences, maxlen=self.max_seq_length, padding='post')
+       target_sequences = pad_sequences(target_sequences, maxlen=self.max_seq_length, padding='post')

        # Ensure target_sequences has enough samples
        if target_sequences.shape[0] != input_sequences.shape[0]:
            print(f"Padding mismatch! Input: {input_sequences.shape}, Target: {target_sequences.shape}")
-           target_sequences = np.resize(target_sequences, input_sequences.shape)  # Resize if necessary
-
-       # Ensure both lists have the same number of sequences
-       min_samples = min(len(input_sequences), len(target_sequences))
-       input_sequences = input_sequences[:min_samples]
-       target_sequences = target_sequences[:min_samples]
-
-       print(f"Preprocessed Encoder Input Shape: {input_sequences.shape}")
-       print(f"Preprocessed Decoder Input Shape: {target_sequences.shape}")
-       print(f"Preprocessed Decoder Target Shape: {target_sequences.shape}")
+           if target_sequences.shape[0] < input_sequences.shape[0]:
+               target_sequences = np.resize(target_sequences, input_sequences.shape)
+           if target_sequences.shape[0] > input_sequences.shape[0]:
+               input_sequences = np.resize(input_sequences, target_sequences.shape)

        return input_sequences, target_sequences

@@ -627,7 +622,7 @@ class ChatbotTrainer:
            raise ValueError("Corpus or tokenizer is not initialized.")

        # Preprocess the texts into sequences
-       input_sequences, target_sequences = input_texts, target_texts
+       input_sequences, target_sequences = self.preprocess_texts(input_texts, target_texts)

        # Debug Lines
        # for token in ['<start>', '<end>', '<oov>']:

@@ -643,12 +638,6 @@ class ChatbotTrainer:
        decoder_input_data = target_sequences[:, :-1]
        decoder_target_data = target_sequences[:, 1:]

-       min_samples = min(encoder_input_data.shape[0], decoder_input_data.shape[0])
-
-       encoder_input_data = encoder_input_data[:min_samples]
-       decoder_input_data = decoder_input_data[:min_samples]
-       decoder_target_data = decoder_target_data[:min_samples]
-
        self.logger.info(f"Encoder Input Data Shape: {encoder_input_data.shape}")
        self.logger.info(f"Decoder Input Data Shape: {decoder_input_data.shape}")
        self.logger.info(f"Decoder Target Data Shape: {decoder_target_data.shape}")
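One caveat worth knowing about the mismatch handling above: np.resize fills the enlarged array by repeating the source data cyclically, not by zero-padding, so rows added this way are recycled sequences rather than blank padding. A quick illustration in plain NumPy:

import numpy as np

a = np.array([[1, 2], [3, 4]])   # shape (2, 2)
print(np.resize(a, (3, 2)))
# [[1 2]
#  [3 4]
#  [1 2]]   <- the extra row repeats the data; it is not zero-filled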
 
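The decoder_input_data / decoder_target_data slices kept as context in the last hunk are the standard teacher-forcing shift for seq2seq training. A toy illustration (token ids invented for the example; assume 1 = <start>, 2 = <end>, 0 = padding):

import numpy as np

target_sequences = np.array([[1, 7, 8, 2, 0],
                             [1, 5, 2, 0, 0]])

decoder_input_data = target_sequences[:, :-1]    # what the decoder sees
decoder_target_data = target_sequences[:, 1:]    # what it should predict next

print(decoder_input_data)    # [[1 7 8 2] [1 5 2 0]]
print(decoder_target_data)   # [[7 8 2 0] [5 2 0 0]]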
runCorpus.py CHANGED
@@ -4,6 +4,7 @@ from playsound3 import playsound
import tensorflow
from chatbotTrainer import ChatbotTrainer
import time
+import numpy as np
import random
import pdb
import sys

@@ -19,7 +20,7 @@ class CorpusTrainer:
        self.choices_yes = ["yes", "ya", "yeah", "yessir", "yesir", "y", "ye", "yah"]
        self.exit_commands = ["exit", "quit", "stop", "x", "q", ""]

-       self.log_file = "self.failure_history.txt"
+       self.log_file = "failure_history.txt"
        self.counter = 0
        self.bad_count = 0
        self.top_num = 0

@@ -91,11 +92,23 @@ class CorpusTrainer:

        # Input conversation data into input and target data from dialog pairs
        for input_text, target_text in dialog_pairs:
+           self.speaker_input_texts = []
+           self.speaker_target_texts = []
+           input_shape = np.array(input_text).shape
+           target_shape = np.array(target_text).shape
+           if input_shape in [(1, 64), (1, 63)] or target_shape in [(1, 64), (1, 63)]:
+               print(f"Conversation {speaker} skipped for NOT providing properly shaped data... ")
+               continue
+
+           if len(input_text) < 3 or len(target_text) < 3:
+               print(f"Conversation {speaker} skipped for NOT providing enough data... ")
+               continue
+
            if input_text != "" and target_text != "":
-               self.speaker_input_texts.append(input_text)
-               self.all_input_texts.append(input_text)
-               self.speaker_target_texts.append(target_text)
-               self.all_target_texts.append(target_text)
+               self.speaker_input_texts.append(input_text.strip())
+               self.all_input_texts.append(input_text.strip())
+               self.speaker_target_texts.append(target_text.strip())
+               self.all_target_texts.append(target_text.strip())


        if self.failsafe_trigger is False:

@@ -105,8 +118,6 @@ class CorpusTrainer:
            self.top_num = self.conversation_id

        print(f"Conversation: {self.conversation_id}")
-       input_text, target_text = chatbot_trainer.preprocess_texts(input_text, target_text)
-       data = [input_text, target_text]

        # Limit is defined within -3 of the early_patience, meaning if it gets close we're adding it to the list
        limit = self.chatbot_trainer.early_patience - 3

@@ -120,6 +131,8 @@ class CorpusTrainer:
            # time.sleep(1)
            return self.chatbot_trainer, user_choice, dialog_data, topConvo, self.top_num, self.failsafe_trigger

+       data = [input_text, target_text]
+
        # User Choices
        if user_choice in self.choices_yes and play_notification in self.choices_yes:
            self.user_yes(speaker=speaker, data=data, limit=limit, play_notification=play_notification)

@@ -312,7 +325,7 @@ class CorpusTrainer:
            print("No failure data found.")
            return

-       with open("self.failure_history.txt", "r") as f:
+       with open("failure_history.txt", "r") as f:
            self.failure_history = [int(line.strip()) for line in f.readlines()]

        if len(self.failure_history) == 0:
 
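On the new shape guard in runCorpus.py: np.array(...) of a plain string is a 0-d array, so the (1, 64)/(1, 63) check only fires when a dialog pair arrives as a nested list or array row rather than raw text. A small demonstration (the 64-id row is a hypothetical stand-in for a stray preprocessed sequence):

import numpy as np

print(np.array("hello there").shape)       # () -- a bare string is 0-d
print(np.array(["hello", "there"]).shape)  # (2,)

row = [list(range(64))]                    # hypothetical stray batch row
print(np.array(row).shape)                 # (1, 64) -- this pair would be skipped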