Commit 9b5daff by JoeArmani · 1 Parent(s): f5346f7

improve split processes

This commit splits data preparation from training: RetrievalChatbot gains a mode flag ('preparation' vs. 'training'), response-pool collection and FAISS indexing now run only in preparation mode, TFRecord parsing moves inside train_streaming, and tf_data_pipeline.py gets batched tokenization and encoding with progress bars.

Files changed (4):
  1. chatbot_model.py +86 -94
  2. run_data_preparer.py +1 -1
  3. run_model_train.py +31 -10
  4. tf_data_pipeline.py +230 -57
chatbot_model.py CHANGED
@@ -138,12 +138,14 @@ class RetrievalChatbot(DeviceAwareModel):
         device: str = None,
         strategy=None,
         reranker: Optional[CrossEncoderReranker] = None,
-        summarizer: Optional[Summarizer] = None
+        summarizer: Optional[Summarizer] = None,
+        mode: str = 'preparation'
     ):
         super().__init__()
         self.config = config
         self.strategy = strategy
         self.device = device or self._setup_default_device()
+        self.mode = mode.lower()
 
         # Initialize reranker, summarizer, tokenizer, and memory monitor
         self.reranker = reranker or self._initialize_reranker()
@@ -151,13 +153,10 @@ class RetrievalChatbot(DeviceAwareModel):
         self.tokenizer = self._initialize_tokenizer()
         self.memory_monitor = GPUMemoryMonitor()
 
-        # Initialize models
-        self.min_batch_size = 8
-        self.max_batch_size = 128
-        self.current_batch_size = 32
-
-        # Collect unique responses from dialogues
-        self.response_pool, self.unique_responses = self._collect_responses(dialogues)
+        # # Initialize models
+        # self.min_batch_size = 8
+        # self.max_batch_size = 128
+        # self.current_batch_size = 32
 
         # Initialize training history
         self.history = {
@@ -166,6 +165,15 @@ class RetrievalChatbot(DeviceAwareModel):
             "train_metrics": {},
             "val_metrics": {}
         }
+
+        # Collect unique responses from dialogues
+        if self.mode == 'preparation':
+            # Collect unique responses from dialogues only in preparation mode
+            self.response_pool, self.unique_responses = self._collect_responses(dialogues)
+        else:
+            # In training mode, assume the response pool is handled via TFRecord
+            self.response_pool = []
+            self.unique_responses = []
 
     def _setup_default_device(self) -> str:
         """Set up default device if none is provided."""
@@ -236,11 +244,14 @@ class RetrievalChatbot(DeviceAwareModel):
             self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
             logger.info(f"Token embeddings resized to: {new_vocab_size}")
 
-        # Initialize FAISS index
-        self._initialize_faiss()
-
-        # Compute and index embeddings
-        self._compute_and_index_embeddings()
+        if self.mode == 'preparation':
+            # Initialize FAISS index only in preparation mode
+            self._initialize_faiss()
+            # Compute and index embeddings
+            self._compute_and_index_embeddings()
+        else:
+            # In training mode, skip FAISS indexing from dialogues
+            logger.info("Training mode: Skipping FAISS index initialization from dialogues.")
 
         # Retrieve embedding dimension from encoder
         embedding_dim = self.config.embedding_dim
@@ -271,10 +282,10 @@ class RetrievalChatbot(DeviceAwareModel):
         self.current_batch_size = new_size
 
     def _initialize_faiss(self):
-        """Initialize FAISS with safer GPU handling and memory monitoring."""
+        """Initialize FAISS with safe GPU handling and memory monitoring."""
         logger.info("Initializing FAISS index...")
 
-        # First, detect if we have GPU-enabled FAISS
+        # Detect if we have GPU-enabled FAISS
         self.faiss_gpu = False
         self.gpu_resources = []
 
@@ -294,53 +305,30 @@ class RetrievalChatbot(DeviceAwareModel):
                     self.gpu_resources.append(res)
                 self.faiss_gpu = True
                 logger.info(f"FAISS GPU resources initialized on {ngpus} GPUs")
-            else:
-                logger.info("Using CPU-only FAISS build")
-
         except Exception as e:
             logger.warning(f"Using CPU due to GPU initialization error: {e}")
 
-        # TODO: figure out buf with faiss-gpu
-        # TODO: consider IndexIVFFlat in the future (speed).
         try:
             # Create appropriate index based on dataset size
             if len(self.unique_responses) < 1000:
                 logger.info("Small dataset detected, using simple FlatIP index")
                 self.index = faiss.IndexFlatIP(self.config.embedding_dim)
             else:
-                # Use IVF index with dynamic number of clusters
-                # nlist = min(
-                #     25,  # max clusters
-                #     max(int(math.sqrt(len(self.unique_responses))), 1)  # min 1 cluster
-                # )
-                # logger.info(f"Using IVF index with {nlist} clusters")
-
-                # quantizer = faiss.IndexFlatIP(self.config.embedding_dim)
-                # self.index = faiss.IndexIVFFlat(
-                #     quantizer,
-                #     self.config.embedding_dim,
-                #     nlist,
-                #     faiss.METRIC_INNER_PRODUCT
-                # )
+                # For larger datasets, consider using more efficient indices like IVF
                 self.index = faiss.IndexFlatIP(self.config.embedding_dim)
 
-            # # Move to GPU(s) if available
-            # if self.faiss_gpu and self.gpu_resources:
-            #     try:
-            #         if len(self.gpu_resources) > 1:
-            #             self.index = faiss.index_cpu_to_gpus_list(self.index, self.gpu_resources)
-            #             logger.info("FAISS index distributed across multiple GPUs")
-            #         else:
-            #             self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, self.index)
-            #             logger.info("FAISS index moved to single GPU")
-            #     except Exception as e:
-            #         logger.warning(f"Failed to move index to GPU: {e}. Falling back to CPU")
-            #         self.faiss_gpu = False
-
-            # # Set search parameters for IVF index
-            # if isinstance(self.index, faiss.IndexIVFFlat):
-            #     self.index.nprobe = min(10, nlist)
-
+            # Move to GPU(s) if available and needed
+            if self.faiss_gpu and self.gpu_resources:
+                try:
+                    if len(self.gpu_resources) > 1:
+                        self.index = faiss.index_cpu_to_gpus_list(self.index, self.gpu_resources)
+                        logger.info("FAISS index distributed across multiple GPUs")
+                    else:
+                        self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, self.index)
+                        logger.info("FAISS index moved to single GPU")
+                except Exception as e:
+                    logger.warning(f"Failed to move index to GPU: {e}. Falling back to CPU")
+                    self.faiss_gpu = False
         except Exception as e:
             logger.error(f"Error initializing FAISS: {e}")
             raise
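
Note: the deleted comment block gestured at an IVF index for larger response pools. For reference, a minimal standalone sketch of that variant using the standard FAISS API (the cluster count and nprobe below are illustrative, not values from this repo):

```python
import faiss
import numpy as np

dim = 768
nlist = 100                                   # clusters; ~sqrt(N) is a common default
quantizer = faiss.IndexFlatIP(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)

xb = np.random.rand(10_000, dim).astype('float32')
faiss.normalize_L2(xb)                        # keep inner product == cosine similarity
index.train(xb)                               # IVF must be trained before add()
index.add(xb)
index.nprobe = 10                             # clusters probed per query: speed/recall knob
```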
@@ -353,21 +341,16 @@ class RetrievalChatbot(DeviceAwareModel):
         """
         Encodes responses with more conservative memory management.
         """
+        if not responses:
+            logger.info("No responses to encode. Returning empty tensor.")
+            return tf.constant([], dtype=tf.float32)
+
         all_embeddings = []
         self.current_batch_size = batch_size
 
         if self.memory_monitor.has_gpu:
             batch_size = 128
 
-        # Memory stats
-        # if self.memory_monitor.has_gpu:
-        #     initial_stats = self.memory_monitor.get_memory_stats()
-        #     if initial_stats:
-        #         logger.info("Initial GPU memory state:")
-        #         logger.info(f"Total: {initial_stats.total / 1e9:.2f}GB")
-        #         logger.info(f"Used: {initial_stats.used / 1e9:.2f}GB")
-        #         logger.info(f"Free: {initial_stats.free / 1e9:.2f}GB")
-
         total_processed = 0
 
         with tqdm(total=len(responses), desc="Encoding responses") as pbar:
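
One caveat on the new early return: `tf.constant([], dtype=tf.float32)` is rank-1 with shape [0], while every non-empty path returns shape [N, embedding_dim]. If callers stack or index the result, an explicitly shaped empty tensor is safer; a hedged alternative, assuming `self.config.embedding_dim` as used elsewhere in this class:

```python
if not responses:
    logger.info("No responses to encode. Returning empty tensor.")
    # Rank-2 empty tensor so downstream shape logic keeps working.
    return tf.zeros([0, self.config.embedding_dim], dtype=tf.float32)
```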
@@ -434,7 +417,10 @@ class RetrievalChatbot(DeviceAwareModel):
                 raise
 
         # Concatenate results
-        #logger.info("Concatenating embeddings...")
+        if not all_embeddings:
+            logger.info("No embeddings were encoded. Returning empty tensor.")
+            return tf.constant([], dtype=tf.float32)
+
         if len(all_embeddings) == 1:
             final_embeddings = all_embeddings[0]
         else:
@@ -727,7 +713,11 @@ class RetrievalChatbot(DeviceAwareModel):
             raise
 
     def verify_faiss_index(self):
-        """Verify that FAISS index matches the response pool."""
+        """Verify that FAISS index matches the response pool, if the index exists."""
+        if not hasattr(self, 'index') or self.index is None:
+            logger.info("FAISS index not initialized. Skipping verification.")
+            return
+
         indexed_size = self.index.ntotal
         pool_size = len(self.response_pool)
         logger.info(f"FAISS index size: {indexed_size}")
@@ -813,6 +803,10 @@ class RetrievalChatbot(DeviceAwareModel):
 
     def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
         """Retrieve top-k responses using FAISS."""
+        if not hasattr(self, 'index') or self.index is None:
+            logger.warning("FAISS index not initialized. Cannot retrieve responses.")
+            return []
+
         # Encode the query
         q_emb = self.encode_query(query)  # Shape: [1, embedding_dim]
         q_emb_np = q_emb.numpy().astype('float32')  # Ensure type match
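
Note: the guard above returns `[]` when no index exists; for context, the usual FAISS continuation of such a method (a sketch of the standard search pattern, not code from this commit) maps results back into the response pool:

```python
# q_emb_np: [1, embedding_dim], float32 and L2-normalized.
distances, indices = self.index.search(q_emb_np, top_k)
results = [
    (self.response_pool[idx], float(score))
    for idx, score in zip(indices[0], distances[0])
    if idx != -1                      # FAISS pads missing neighbors with -1
]
```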
@@ -874,32 +868,6 @@ class RetrievalChatbot(DeviceAwareModel):
         logger.info(f"Models and tokenizer loaded from {load_dir}.")
         return chatbot
 
-    def parse_tfrecord_fn(example_proto, max_length, neg_samples):
-        """
-        Parses a single TFRecord example.
-
-        Args:
-            example_proto: A serialized TFRecord example.
-            max_length: The maximum sequence length for tokenization.
-            neg_samples: The number of hard negatives per query.
-
-        Returns:
-            A tuple of (query_ids, positive_ids, negative_ids).
-        """
-        feature_description = {
-            'query_ids': tf.io.FixedLenFeature([max_length], tf.int64),
-            'positive_ids': tf.io.FixedLenFeature([max_length], tf.int64),
-            'negative_ids': tf.io.FixedLenFeature([neg_samples * max_length], tf.int64),
-        }
-        parsed_features = tf.io.parse_single_example(example_proto, feature_description)
-
-        query_ids = tf.cast(parsed_features['query_ids'], tf.int32)
-        positive_ids = tf.cast(parsed_features['positive_ids'], tf.int32)
-        negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
-        negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
-
-        return query_ids, positive_ids, negative_ids
-
     def train_streaming(
         self,
         tfrecord_file_path: str,
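
Moving parse_tfrecord_fn inside train_streaming is more than tidying: the deleted class-level def had no `self` parameter, so calling it as a bound method would shift every argument by one. A minimal reproduction of that failure mode:

```python
class C:
    def f(x, y):          # note: no `self`
        return (x, y)

c = C()
c.f(1, 2)                 # TypeError: f() takes 2 positional arguments but 3 were given
                          # (Python implicitly passes `c` as the first argument)
```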
@@ -915,10 +883,34 @@ class RetrievalChatbot(DeviceAwareModel):
     ) -> None:
         """Training using a pre-prepared TFRecord dataset."""
         logger.info("Starting training with pre-prepared TFRecord dataset...")
-
-        # Calculate total steps for learning rate schedule
-        # Estimate total pairs by counting the number of records in the TFRecord
-        # Assuming each record corresponds to one pair
+
+        def parse_tfrecord_fn(example_proto, max_length, neg_samples):
+            """
+            Parses a single TFRecord example.
+
+            Args:
+                example_proto: A serialized TFRecord example.
+                max_length: The maximum sequence length for tokenization.
+                neg_samples: The number of hard negatives per query.
+
+            Returns:
+                A tuple of (query_ids, positive_ids, negative_ids).
+            """
+            feature_description = {
+                'query_ids': tf.io.FixedLenFeature([max_length], tf.int64),
+                'positive_ids': tf.io.FixedLenFeature([max_length], tf.int64),
+                'negative_ids': tf.io.FixedLenFeature([neg_samples * max_length], tf.int64),
+            }
+            parsed_features = tf.io.parse_single_example(example_proto, feature_description)
+
+            query_ids = tf.cast(parsed_features['query_ids'], tf.int32)
+            positive_ids = tf.cast(parsed_features['positive_ids'], tf.int32)
+            negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
+            negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
+
+            return query_ids, positive_ids, negative_ids
+
+        # Calculate total steps by counting the number of records in the TFRecord
         raw_dataset = tf.data.TFRecordDataset(tfrecord_file_path)
         total_pairs = sum(1 for _ in raw_dataset)
         logger.info(f"Total pairs in TFRecord: {total_pairs}")
@@ -964,12 +956,12 @@ class RetrievalChatbot(DeviceAwareModel):
         logger.info(f"TensorBoard logs will be saved in {log_dir}")
 
         # Define the parsing function with the appropriate max_length and neg_samples
-        parse_fn = lambda x: self.parse_tfrecord_fn(x, self.config.max_context_token_limit, self.config.neg_samples)
+        parse_fn = lambda x: parse_tfrecord_fn(x, self.config.max_context_token_limit, self.config.neg_samples)
 
         # Create the full dataset
         dataset = tf.data.TFRecordDataset(tfrecord_file_path)
         dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
-        dataset = dataset.shuffle(buffer_size=10000)  # Adjust buffer size as needed TODO: what is this?
+        dataset = dataset.shuffle(buffer_size=10000)  # Adjust buffer size as needed
         dataset = dataset.batch(batch_size, drop_remainder=True)
         dataset = dataset.prefetch(tf.data.AUTOTUNE)

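To answer the removed "TODO: what is this?": `shuffle(buffer_size=N)` keeps N parsed examples in memory and samples uniformly from that buffer, so it is only a perfect shuffle when N is at least the dataset size; 10000 trades memory for shuffle quality. The same call with the intent spelled out:

```python
# Buffer of 10k examples, reshuffled each epoch (TensorFlow's default behavior).
dataset = dataset.shuffle(buffer_size=10_000, reshuffle_each_iteration=True)
```
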
run_data_preparer.py CHANGED
@@ -30,7 +30,7 @@ def main():
     TF_RECORD_DIR = 'training_data'
     FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
     FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_test.index')
-    ENVIRONMENT = 'test'  # or 'production'
+    ENVIRONMENT = 'production'  # or 'test'
     if ENVIRONMENT == 'test':
        FAISS_INDEX_PATH = FAISS_INDEX_TEST_PATH
    else:
  else:
run_model_train.py CHANGED
@@ -31,36 +31,57 @@ def run_interactive_chat(chatbot, quality_checker):
     for resp, score in candidates[1:4]:
         print(f"Score: {score:.4f} - {resp}")
 
+def inspect_tfrecord(tfrecord_file_path, num_examples=3):
+    def parse_example(example_proto):
+        feature_description = {
+            'query_ids': tf.io.FixedLenFeature([512], tf.int64),        # Adjust max_length if different
+            'positive_ids': tf.io.FixedLenFeature([512], tf.int64),
+            'negative_ids': tf.io.FixedLenFeature([3 * 512], tf.int64),  # Adjust neg_samples if different
+        }
+        return tf.io.parse_single_example(example_proto, feature_description)
+
+    dataset = tf.data.TFRecordDataset(tfrecord_file_path)
+    dataset = dataset.map(parse_example)
+
+    for i, example in enumerate(dataset.take(num_examples)):
+        print(f"Example {i+1}:")
+        print(f"Query IDs: {example['query_ids'].numpy()}")
+        print(f"Positive IDs: {example['positive_ids'].numpy()}")
+        print(f"Negative IDs: {example['negative_ids'].numpy()}")
+        print("-" * 50)
+
 def main():
+
+    # Quick test to inspect TFRecord
+    #inspect_tfrecord('training_data/training_data.tfrecord', num_examples=3)
+
     # Initialize environment
     tf.keras.backend.clear_session()
     env = EnvironmentSetup()
     env.initialize()
 
-    DEBUG_SAMPLES = 5
-    EPOCHS = 5 if DEBUG_SAMPLES else 20
+    # Training configuration
+    EPOCHS = 20
     TF_RECORD_FILE_PATH = 'training_data/training_data.tfrecord'
 
     # Optimize batch size for Colab
     batch_size = env.optimize_batch_size(base_batch_size=16)
 
+
     # Initialize configuration
     config = ChatbotConfig(
         embedding_dim=768,  # DistilBERT
         max_context_token_limit=512,
         freeze_embeddings=False,
-        neg_samples=3,
     )
 
-    # # Load training data
-    # dialogues = RetrievalChatbot.load_training_data(data_path=TRAINING_DATA_PATH, debug_samples=DEBUG_SAMPLES)
-    # print(dialogues)
-
-    # Initialize chatbot and verify FAISS index
+    # Initialize chatbot
     #with env.strategy.scope():
-    chatbot = RetrievalChatbot(config)
+    chatbot = RetrievalChatbot(config, mode='training')
     chatbot.build_models()
-    chatbot.verify_faiss_index()
+
+    if chatbot.mode == 'preparation':
+        chatbot.verify_faiss_index()
 
     chatbot.train_streaming(
         tfrecord_file_path=TF_RECORD_FILE_PATH,
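
Note: `inspect_tfrecord` hard-codes 512 and 3, which will silently disagree with a TFRecord produced under a different config. A sketch that derives the shapes from `ChatbotConfig` instead (the same fields this commit already uses; the helper name is illustrative):

```python
def inspect_tfrecord_with_config(tfrecord_file_path, config, num_examples=3):
    max_len, neg = config.max_context_token_limit, config.neg_samples
    feature_description = {
        'query_ids': tf.io.FixedLenFeature([max_len], tf.int64),
        'positive_ids': tf.io.FixedLenFeature([max_len], tf.int64),
        'negative_ids': tf.io.FixedLenFeature([neg * max_len], tf.int64),
    }
    dataset = tf.data.TFRecordDataset(tfrecord_file_path).map(
        lambda proto: tf.io.parse_single_example(proto, feature_description)
    )
    for i, example in enumerate(dataset.take(num_examples)):
        print(f"Example {i + 1}: query shape {example['query_ids'].shape}, "
              f"negatives reshape to {(neg, max_len)}")
```
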
tf_data_pipeline.py CHANGED
@@ -4,6 +4,7 @@ import numpy as np
 import faiss
 import tensorflow as tf
 import h5py
+import math
 from tqdm import tqdm
 import json
 from pathlib import Path
@@ -146,7 +147,7 @@ class TFDataPipeline:
     def collect_responses(self, dialogues: List[dict]) -> List[str]:
         """Extract unique assistant responses from dialogues."""
         response_set = set()
-        for dialogue in dialogues:
+        for dialogue in tqdm(dialogues, desc="Processing Dialogues", unit="dialogue"):
             turns = dialogue.get('turns', [])
             for turn in turns:
                 speaker = turn.get('speaker')
@@ -180,20 +181,17 @@ class TFDataPipeline:
 
     def _compute_and_index_response_embeddings(self):
         """
-        Computes embeddings for the response pool and adds them to the FAISS index.
+        Computes embeddings for the response pool and adds them to the FAISS index with progress bars.
         """
         logger.info("Computing embeddings for the response pool...")
 
-        # Log the contents and types of response_pool
-        for idx, response in enumerate(self.response_pool[:5], 1):  # Log first 5 responses
-            logger.debug(f"Response {idx}: {response} (Type: {type(response)})")
-
         # Ensure all responses are strings
         if not all(isinstance(response, str) for response in self.response_pool):
             logger.error("All elements in response_pool must be strings.")
             raise ValueError("Invalid data type in response_pool.")
 
-        # Proceed with tokenization
+        # Tokenization
+        logger.info("Tokenizing responses...")
         encoded_responses = self.tokenizer(
             self.response_pool,
             padding=True,
@@ -203,28 +201,87 @@ class TFDataPipeline:
         )
         response_ids = encoded_responses['input_ids']
 
-        # Compute embeddings in batches
+        # Compute embeddings in batches with progress bar
         batch_size = getattr(self, 'embedding_batch_size', 64)  # Default to 64 if not set
+        total_responses = len(response_ids)
+        logger.info(f"Computing embeddings in batches of {batch_size}...")
         embeddings = []
-        for i in range(0, len(response_ids), batch_size):
-            batch_ids = response_ids[i:i+batch_size]
-            # Compute embeddings
-            batch_embeddings = self.encoder(batch_ids, training=False).numpy()
-            # Normalize embeddings if using inner product or cosine similarity
-            faiss.normalize_L2(batch_embeddings)
-            embeddings.append(batch_embeddings)
+
+        with tqdm(total=total_responses, desc="Computing Embeddings", unit="response") as pbar:
+            for i in range(0, total_responses, batch_size):
+                batch_ids = response_ids[i:i + batch_size]
+                # Compute embeddings
+                batch_embeddings = self.encoder(batch_ids, training=False).numpy()
+                # Normalize embeddings for cosine similarity
+                faiss.normalize_L2(batch_embeddings)
+                embeddings.append(batch_embeddings)
+                pbar.update(len(batch_ids))
 
         if embeddings:
             embeddings = np.vstack(embeddings).astype(np.float32)
-            # Add embeddings to FAISS index
+            # Add embeddings to FAISS index with progress bar
             logger.info(f"Adding {len(embeddings)} response embeddings to FAISS index...")
-            self.index.add(embeddings)
+
+            # Determine number of batches for indexing
+            index_batch_size = getattr(self, 'index_batch_size', 1000)  # Adjust as needed
+            total_embeddings = len(embeddings)
+            num_index_batches = math.ceil(total_embeddings / index_batch_size)
+
+            with tqdm(total=total_embeddings, desc="Indexing Embeddings", unit="embedding") as pbar_index:
+                for i in range(0, total_embeddings, index_batch_size):
+                    batch = embeddings[i:i + index_batch_size]
+                    self.index.add(batch)
+                    pbar_index.update(len(batch))
+
             logger.info("Response embeddings added to FAISS index.")
         else:
             logger.warning("No embeddings to add to FAISS index.")
 
         # **Sanity Check:** Verify the number of embeddings in FAISS index
         logger.info(f"Total embeddings in FAISS index after addition: {self.index.ntotal}")
+        # (previous, non-progress-bar version of this method kept commented out in the commit)
 
     def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
         """Find hard negatives for a batch of queries with error handling and retries."""
@@ -349,53 +406,169 @@ class TFDataPipeline:
 
         return query_ids, positive_ids, negative_ids
 
-    def prepare_and_save_data(self, dialogues: List[dict], tfrecord_file_path: str, batch_size: int = 32):
-        """Processes dialogues in batches and saves to a TFRecord file."""
-        with tf.io.TFRecordWriter(tfrecord_file_path) as writer:
-            total_dialogues = len(dialogues)
-            logger.debug(f"Total dialogues to process: {total_dialogues}")
-
-            with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
-                for i in range(0, total_dialogues, batch_size):
-                    batch_dialogues = dialogues[i:i+batch_size]
-                    # Process each batch_dialogues
-                    # Extract pairs, find negatives, tokenize, and serialize
-                    # Example:
-                    for dialogue in batch_dialogues:
-                        pairs = self._extract_pairs_from_dialogue(dialogue)
-                        queries = []
-                        positives = []
-
-                        for query, positive in pairs:
-                            queries.append(query)
-                            positives.append(positive)
-
-                        if queries:
-                            # **Compute and cache query embeddings before searching**
-                            self._compute_embeddings(queries)
-
-                            # Find hard negatives
-                            hard_negatives = self._find_hard_negatives_batch(queries, positives)
-
-                            for idx, negatives in enumerate(hard_negatives[:5]):  # Log first 5 examples
-                                logger.debug(f"Query: {queries[idx]}")
-                                logger.debug(f"Positive: {positives[idx]}")
-                                logger.debug(f"Hard Negatives: {negatives}")
-                            # Tokenize and encode
-                            query_ids, positive_ids, negative_ids = self._tokenize_and_encode(queries, positives, hard_negatives)
-
-                            # Serialize each example and write to TFRecord
-                            for q_id, p_id, n_id in zip(query_ids, positive_ids, negative_ids):
-                                feature = {
-                                    'query_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=q_id)),
-                                    'positive_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=p_id)),
-                                    'negative_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=n_id.flatten())),
-                                }
-                                example = tf.train.Example(features=tf.train.Features(feature=feature))
-                                writer.write(example.SerializeToString())
-
-                    pbar.update(len(batch_dialogues))
-        logger.info(f"Data preparation complete. TFRecord saved at {tfrecord_file_path}")
+    # Testing updated batch tokenization
+    def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
+        """
+        Processes dialogues in batches and saves to a TFRecord file using optimized batch tokenization and encoding.
+
+        Args:
+            dialogues (List[dict]): List of dialogue dictionaries.
+            tf_record_path (str): Path to save the TFRecord file.
+            batch_size (int): Number of dialogues to process per batch.
+        """
+        logger.info(f"Preparing and saving data to {tf_record_path}...")
+
+        num_dialogues = len(dialogues)
+        num_batches = math.ceil(num_dialogues / batch_size)
+
+        with tf.io.TFRecordWriter(tf_record_path) as writer:
+            # Initialize progress bar
+            with tqdm(total=num_batches, desc="Preparing Data Batches", unit="batch") as pbar:
+                for i in range(num_batches):
+                    start_idx = i * batch_size
+                    end_idx = min(start_idx + batch_size, num_dialogues)
+                    batch_dialogues = dialogues[start_idx:end_idx]
+
+                    # Extract all query-positive pairs in the batch
+                    queries = []
+                    positives = []
+                    for dialogue in batch_dialogues:
+                        pairs = self._extract_pairs_from_dialogue(dialogue)
+                        for query, positive in pairs:
+                            if len(query) <= self.max_length and len(positive) <= self.max_length:
+                                queries.append(query)
+                                positives.append(positive)
+
+                    if not queries:
+                        pbar.update(1)
+                        continue  # Skip if no valid queries
+
+                    # Compute and cache query embeddings
+                    try:
+                        self._compute_embeddings(queries)
+                    except Exception as e:
+                        logger.error(f"Error computing embeddings: {e}")
+                        pbar.update(1)
+                        continue  # Skip to the next batch
+
+                    # Find hard negatives for the batch
+                    try:
+                        hard_negatives = self._find_hard_negatives_batch(queries, positives)
+                    except Exception as e:
+                        logger.error(f"Error finding hard negatives: {e}")
+                        pbar.update(1)
+                        continue  # Skip to the next batch
+
+                    # Tokenize and encode all queries, positives, and negatives in the batch
+                    try:
+                        encoded_queries = self.tokenizer.batch_encode_plus(
+                            queries,
+                            max_length=self.config.max_context_token_limit,
+                            truncation=True,
+                            padding='max_length',
+                            return_tensors='tf'
+                        )
+                        encoded_positives = self.tokenizer.batch_encode_plus(
+                            positives,
+                            max_length=self.config.max_context_token_limit,
+                            truncation=True,
+                            padding='max_length',
+                            return_tensors='tf'
+                        )
+                    except Exception as e:
+                        logger.error(f"Error during tokenization: {e}")
+                        pbar.update(1)
+                        continue  # Skip to the next batch
+
+                    # Flatten hard_negatives while maintaining alignment
+                    # (hard_negatives is a list of lists, one sublist per query)
+                    try:
+                        flattened_negatives = [neg for sublist in hard_negatives for neg in sublist]
+                        encoded_negatives = self.tokenizer.batch_encode_plus(
+                            flattened_negatives,
+                            max_length=self.config.max_context_token_limit,
+                            truncation=True,
+                            padding='max_length',
+                            return_tensors='tf'
+                        )
+
+                        # Reshape encoded_negatives['input_ids'] to [num_queries, num_negatives, max_length]
+                        num_negatives = self.config.neg_samples
+                        reshaped_negatives = encoded_negatives['input_ids'].numpy().reshape(-1, num_negatives, self.config.max_context_token_limit)
+                    except Exception as e:
+                        logger.error(f"Error during negatives tokenization: {e}")
+                        pbar.update(1)
+                        continue  # Skip to the next batch
+
+                    # Serialize each example and write to TFRecord
+                    for j in range(len(queries)):
+                        try:
+                            q_id = encoded_queries['input_ids'][j].numpy()
+                            p_id = encoded_positives['input_ids'][j].numpy()
+                            n_id = reshaped_negatives[j]
+
+                            feature = {
+                                'query_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=q_id)),
+                                'positive_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=p_id)),
+                                'negative_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=n_id.flatten())),
+                            }
+                            example = tf.train.Example(features=tf.train.Features(feature=feature))
+                            writer.write(example.SerializeToString())
+                        except Exception as e:
+                            logger.error(f"Error serializing example {j} in batch {i}: {e}")
+                            continue  # Skip to the next example
+
+                    # Update progress bar
+                    pbar.update(1)
+
+        logger.info(f"Data preparation complete. TFRecord saved at {tf_record_path}.")
+        # (previous version of prepare_and_save_data kept commented out in the commit)
 
     def _tokenize_negatives_tf(self, negatives):
         """Tokenizes negatives using tf.py_function."""

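One caveat in the new pair filter: `len(query) <= self.max_length` counts characters, while the limit it guards is a token limit, so the filter is only a rough proxy. A hedged token-based variant (the helper name is illustrative; assumes `self.tokenizer` is a Hugging Face tokenizer, as the `batch_encode_plus` calls above suggest):

```python
def _fits_token_limit(self, text: str) -> bool:
    # Count real tokens instead of characters.
    ids = self.tokenizer(text, add_special_tokens=True, truncation=False)['input_ids']
    return len(ids) <= self.config.max_context_token_limit
```
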