Spaces:

JoeArmani
/

csc525_retrieval_based_chatbot

Sleeping

App Files Community

JoeArmani committed on Dec 30, 2024

Commit

9decf80

1 Parent(s): 19403c5

FAISS and streaming updates

Browse files

Files changed (5) hide show

chatbot_model.py +1336 -671
conversation_summarizer.py +3 -3
environment_setup.py +15 -11
gpu_monitor.py +68 -0
run_model_train.py +11 -16

chatbot_model.py CHANGED Viewed

@@ -1,26 +1,35 @@
 from transformers import TFAutoModel, AutoTokenizer
 import tensorflow as tf
 import numpy as np
-from typing import List, Tuple, Dict, Optional, Union, Any
 import math
 from dataclasses import dataclass
 import json
-from tqdm import tqdm
 from pathlib import Path
 import datetime
 import faiss
 from response_quality_checker import ResponseQualityChecker
 from cross_encoder_reranker import CrossEncoderReranker
 from conversation_summarizer import DeviceAwareModel, Summarizer
 from logger_config import config_logger
 logger = config_logger(__name__)
 @dataclass
 class ChatbotConfig:
     """Configuration for the RetrievalChatbot."""
-    vocab_size: int = 30526  # DistilBERT vocab size
     max_context_token_limit: int = 512
-    embedding_dim: int = 512  # Match DistilBERT's dimension
     encoder_units: int = 256
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
@@ -130,16 +139,16 @@ class RetrievalChatbot(DeviceAwareModel):
             summarizer = Summarizer(device=self.device)
         self.summarizer = summarizer
-        # Configure XLA optimization if on GPU/TPU
-        if self.device in ["GPU", "TPU"]:
-            tf.config.optimizer.set_jit(True)
-            logger.info(f"XLA compilation enabled for {self.device}")
-        # Configure mixed precision for GPU/TPU
-        if self.device != "CPU":
-            policy = tf.keras.mixed_precision.Policy('mixed_float16')
-            tf.keras.mixed_precision.set_global_policy(policy)
-            logger.info("Mixed precision training enabled (float16)")
         # Special tokens
         self.special_tokens = {
@@ -155,18 +164,13 @@ class RetrievalChatbot(DeviceAwareModel):
             {'additional_special_tokens': list(self.special_tokens.values())}
         )
-        # Build encoders within device strategy scope
-        if self.strategy:
-            with self.strategy.scope():
-                self._build_models()
-        else:
-            self._build_models()
-        # Initialize FAISS index
-        self._initialize_faiss()
-        # Precompute and index response embeddings
-        self._precompute_and_index_responses(dialogues)
         # Initialize training history
         self.history = {
@@ -176,9 +180,10 @@ class RetrievalChatbot(DeviceAwareModel):
             "val_metrics": {}
         }
-    def _build_models(self):
         """Initialize the shared encoder."""
         logger.info("Building encoder model...")
         # Shared encoder for both queries and responses
         self.encoder = EncoderModel(
@@ -191,11 +196,10 @@ class RetrievalChatbot(DeviceAwareModel):
         self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
         logger.info(f"Token embeddings resized to: {new_vocab_size}")
-        # Debug embeddings attributes
-        logger.info("Inspecting embeddings attributes:")
-        for attr in dir(self.encoder.pretrained.distilbert.embeddings):
-            if not attr.startswith('_'):
-                logger.info(f"  {attr}")
         # Try different ways to get embedding dimension
         try:
@@ -227,45 +231,18 @@ class RetrievalChatbot(DeviceAwareModel):
             logger.error("Vocabulary size is less than embedding dimension.")
             raise ValueError("Vocabulary size is less than embedding dimension.")
-    def _initialize_faiss(self):
-        """Initialize FAISS index based on available resources."""
-        logger.info("Initializing FAISS index...")
-        # Determine if GPU FAISS is available
-        try:
-            res = faiss.StandardGpuResources()
-            self.faiss_gpu = True
-            logger.info("FAISS GPU resources initialized.")
-        except Exception as e:
-            self.faiss_gpu = False
-            logger.info("FAISS GPU resources not available. Using FAISS CPU.")
-        # Initialize FAISS index for Inner Product (for cosine similarity)
-        if self.faiss_gpu:
-            self.index = faiss.IndexFlatIP(self.config.embedding_dim)
-            self.index = faiss.index_cpu_to_gpu(res, 0, self.index)
-        else:
-            self.index = faiss.IndexFlatIP(self.config.embedding_dim)
-        logger.info("FAISS index initialized.")
-    def verify_faiss_index(self):
-        """Verify that FAISS index matches the response pool."""
-        indexed_size = self.index.ntotal
-        pool_size = len(self.response_pool)
-        logger.info(f"FAISS index size: {indexed_size}")
-        logger.info(f"Response pool size: {pool_size}")
-        if indexed_size != pool_size:
-            logger.warning("Mismatch between FAISS index size and response pool size.")
-        else:
-            logger.info("FAISS index correctly matches the response pool.")
-    def _precompute_and_index_responses(self, dialogues: List[dict]):
-        """Precompute embeddings for all responses and index them using FAISS."""
-        logger.info("Precomputing response embeddings and indexing with FAISS...")
-        # Use tqdm for collecting responses
         responses = []
-        for dialogue in tqdm(dialogues, desc="Collecting assistant responses"):
             turns = dialogue.get('turns', [])
             for turn in turns:
                 if turn.get('speaker') == 'assistant' and 'text' in turn:
@@ -275,33 +252,97 @@ class RetrievalChatbot(DeviceAwareModel):
         unique_responses = list(set(responses))
         logger.info(f"Found {len(unique_responses)} unique responses.")
-        # Encode responses
-        logger.info("Encoding unique responses")
-        response_embeddings = self.encode_responses(unique_responses)
-        response_embeddings = response_embeddings.numpy()
-        # Ensure float32
-        if response_embeddings.dtype != np.float32:
-            response_embeddings = response_embeddings.astype('float32')
-        # Ensure the array is contiguous in memory
-        if not response_embeddings.flags['C_CONTIGUOUS']:
-            logger.info("Making embeddings contiguous in memory.")
-            response_embeddings = np.ascontiguousarray(response_embeddings)
-        # Normalize embeddings for cosine similarity
-        logger.info("Normalizing embeddings with FAISS.")
-        faiss.normalize_L2(response_embeddings)
-        # Add to FAISS index
-        logger.info("Adding embeddings to FAISS index...")
-        self.index.add(response_embeddings)
-        logger.info(f"Indexed {self.index.ntotal} responses.")
-        # Store responses and embeddings
-        self.response_pool = unique_responses
-        self.response_embeddings = response_embeddings
-        logger.info("Precomputation and indexing completed.")
     def encode_responses(
         self,
@@ -309,55 +350,390 @@ class RetrievalChatbot(DeviceAwareModel):
         batch_size: int = 64
     ) -> tf.Tensor:
         """
-        Encodes a list of responses into embeddings, using chunked/batched processing
-        to avoid running out of memory when there are many responses.
-        Args:
-            responses (List[str]): The list of response texts to encode.
-            batch_size (int): How many responses to encode per chunk.
-                            Adjust based on available GPU/CPU memory.
-        Returns:
-            tf.Tensor: Tensor of shape (N, emb_dim) with all response embeddings.
         """
-        # Accumulate embeddings in a list and concatenate at the end
         all_embeddings = []
-        # Process the responses in chunks of 'batch_size'
-        for start_idx in range(0, len(responses), batch_size):
-            end_idx = start_idx + batch_size
-            batch_texts = responses[start_idx:end_idx]
-            # Tokenize the current batch
-            encodings = self.tokenizer(
-                batch_texts,
-                padding='max_length',
-                truncation=True,
-                max_length=self.config.max_context_token_limit,
-                return_tensors='tf',
-            )
-            # Run the encoder forward pass
-            input_ids = encodings['input_ids']
-            embeddings_batch = self.encoder(input_ids, training=False)
-            # Cast to float32 if needed
-            if embeddings_batch.dtype != tf.float32:
-                embeddings_batch = tf.cast(embeddings_batch, tf.float32)
-            # Collect
-            all_embeddings.append(embeddings_batch)
-        # Concatenate all batch embeddings along axis=0
         if len(all_embeddings) == 1:
-            # Only one batch
             final_embeddings = all_embeddings[0]
         else:
-            # Multiple batches, concatenate
             final_embeddings = tf.concat(all_embeddings, axis=0)
         return final_embeddings
     def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
         """Encode a query with optional conversation context."""
         # Prepare query with context
@@ -436,7 +812,7 @@ class RetrievalChatbot(DeviceAwareModel):
         """Retrieve top-k responses using FAISS."""
         # Encode the query
         q_emb = self.encode_query(query)  # Shape: [1, embedding_dim]
-        q_emb_np = q_emb.numpy().astype('float32')  # Ensure type matches FAISS requirements
         # Normalize the query embedding for cosine similarity
         faiss.normalize_L2(q_emb_np)
@@ -523,130 +899,9 @@ class RetrievalChatbot(DeviceAwareModel):
         logger.info(f"Loaded {len(dialogues)} dialogues.")
         return dialogues
-    def prepare_dataset(
-        self,
-        dialogues: List[dict],
-        neg_samples: int = 1,
-        debug_samples: int = None
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-        """
-        Prepares dataset for multiple-negatives ranking,
-        but also appends 'hard negative' pairs for each query.
-        We'll generate:
-        - (query, positive) as usual
-        - (query, negative) for each query, using FAISS top-1 approx. negative.
-        Then, in-batch training sees them as 'two different positives'
-        for the same query, forcing the model to discriminate them.
-        """
-        logger.info("Preparing in-batch dataset with hard negatives...")
-        queries, positives = [], []
-        # Assemble (q, p)
-        for dialogue in dialogues:
-            turns = dialogue.get('turns', [])
-            for i in range(len(turns) - 1):
-                current_turn = turns[i]
-                next_turn = turns[i+1]
-                if (current_turn.get('speaker') == 'user'
-                    and next_turn.get('speaker') == 'assistant'
-                    and 'text' in current_turn
-                    and 'text' in next_turn):
-                    query_text = current_turn['text'].strip()
-                    pos_text   = next_turn['text'].strip()
-                    queries.append(query_text)
-                    positives.append(pos_text)
-        # Debug slicing
-        if debug_samples is not None:
-            queries = queries[:debug_samples]
-            positives = positives[:debug_samples]
-            logger.info(f"Debug mode: limited to {debug_samples} pairs.")
-        logger.info(f"Prepared {len(queries)} (query, positive) pairs initially.")
-        # Find a hard negative from FAISS for each (q, p)
-        # Create a second 'positive' row => (q, negative). In-batch, it's seen as a different 'positive' row, but is a hard negative.
-        augmented_queries = []
-        augmented_positives = []
-        for q_text, p_text in zip(queries, positives):
-            neg_texts = self._find_hard_negative(q_text, p_text, top_k=5, neg_samples=neg_samples)
-            for neg_text in neg_texts:
-                augmented_queries.append(q_text)
-                augmented_positives.append(neg_text)
-        logger.info(f"Found hard negatives for {len(augmented_queries)} queries.")
-        # Combine them into a single big list -> Original pairs: (q, p) & Hard neg pairs: (q, n)
-        final_queries = queries + augmented_queries
-        final_positives = positives + augmented_positives
-        logger.info(f"Total dataset size after adding hard neg: {len(final_queries)}")
-        # Tokenize
-        encoded_queries = self.tokenizer(
-            final_queries,
-            padding='max_length',
-            truncation=True,
-            max_length=self.config.max_context_token_limit,
-            return_tensors='tf'
-        )
-        encoded_positives = self.tokenizer(
-            final_positives,
-            padding='max_length',
-            truncation=True,
-            max_length=self.config.max_context_token_limit,
-            return_tensors='tf'
-        )
-        q_tensor = encoded_queries['input_ids']
-        p_tensor = encoded_positives['input_ids']
-        logger.info("Tokenized and padded sequences for in-batch training + hard negatives.")
-        return q_tensor, p_tensor
-    def _find_hard_negative(
-        self,
-        query_text: str,
-        positive_text: str,
-        top_k: int = 5,
-        neg_samples: int = 1
-    ) -> List[str]:
-        """
-        Return up to `neg_samples` unique negatives from top_k FAISS results,
-        excluding the known positive_text.
-        """
-        # Encode the query to get the embedding
-        query_emb = self.encode_query(query_text)
-        q_emb_np = query_emb.numpy().astype('float32')
-        # Normalize for cosine similarity
-        faiss.normalize_L2(q_emb_np)
-        # Search in FAISS
-        distances, indices = self.index.search(q_emb_np, top_k)
-        # Exclude the actual positive from these results
-        hard_negatives = []
-        for idx in indices[0]:
-            if idx < len(self.response_pool):
-                candidate = self.response_pool[idx].strip()
-                if candidate != positive_text.strip():
-                    hard_negatives.append(candidate)
-                    if len(hard_negatives) == neg_samples:
-                        break
-        return hard_negatives
-    def train(
         self,
-        q_pad: tf.Tensor,
-        p_pad: tf.Tensor,
         epochs: int = 20,
         batch_size: int = 16,
         validation_split: float = 0.2,
@@ -656,23 +911,41 @@ class RetrievalChatbot(DeviceAwareModel):
         warmup_steps_ratio: float = 0.1,
         early_stopping_patience: int = 3,
         min_delta: float = 1e-4,
-        accum_steps: int = 2  # Gradient accumulation steps
-    ):
-        dataset_size = tf.shape(q_pad)[0].numpy()
-        val_size = int(dataset_size * validation_split)
-        train_size = dataset_size - val_size
-        logger.info(f"Total samples: {dataset_size}")
-        logger.info(f"Training samples: {train_size}")
-        logger.info(f"Validation samples: {val_size}")
-        steps_per_epoch = train_size // batch_size
-        if train_size % batch_size != 0:
-            steps_per_epoch += 1
         total_steps = steps_per_epoch * epochs
-        logger.info(f"Total training steps (approx): {total_steps}")
-        # 1) Set up LR schedule or fixed LR
         if use_lr_schedule:
             warmup_steps = int(total_steps * warmup_steps_ratio)
             lr_schedule = self._get_lr_schedule(
@@ -686,175 +959,290 @@ class RetrievalChatbot(DeviceAwareModel):
             self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
             logger.info("Using fixed learning rate.")
-        # 2) Prepare data splits
-        train_q = q_pad[:train_size]
-        train_p = p_pad[:train_size]
-        val_q = q_pad[train_size:]
-        val_p = p_pad[train_size:]
-        train_dataset = (tf.data.Dataset.from_tensor_slices((train_q, train_p))
-                    .shuffle(4096)
-                    .batch(batch_size)
-                    .prefetch(tf.data.AUTOTUNE))
-        val_dataset = (tf.data.Dataset.from_tensor_slices((val_q, val_p))
-                    .batch(batch_size)
-                    .prefetch(tf.data.AUTOTUNE))
-        # 3) Checkpoint + manager
         checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
         manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
-        # 4) TensorBoard setup
         log_dir = Path(checkpoint_dir) / "tensorboard_logs"
         log_dir.mkdir(parents=True, exist_ok=True)
         current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
         train_log_dir = str(log_dir / f"train_{current_time}")
         val_log_dir = str(log_dir / f"val_{current_time}")
         train_summary_writer = tf.summary.create_file_writer(train_log_dir)
         val_summary_writer = tf.summary.create_file_writer(val_log_dir)
         logger.info(f"TensorBoard logs will be saved in {log_dir}")
-        # 5) Early stopping
         best_val_loss = float("inf")
         epochs_no_improve = 0
-        logger.info("Beginning training loop...")
-        global_step = 0
-        # Prepare zero-initialized accumulators for your trainable variables
-        # We'll accumulate gradients across mini-batches, then apply them every accum_steps.
-        train_vars = self.encoder.pretrained.trainable_variables
-        accum_grads = [tf.zeros_like(var, dtype=tf.float32) for var in train_vars]
-        from tqdm import tqdm
-        for epoch in range(1, epochs + 1):
-            logger.info(f"\n=== Epoch {epoch}/{epochs} ===")
-            epoch_loss_avg = tf.keras.metrics.Mean()
-            step_in_epoch = 0
-            with tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}") as pbar:
-                for (q_batch, p_batch) in train_dataset:
-                    step_in_epoch += 1
-                    global_step += 1
-                    with tf.GradientTape() as tape:
-                        q_enc = self.encoder(q_batch, training=True)
-                        p_enc = self.encoder(p_batch, training=True)
-                        sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
-                        bsz = tf.shape(q_enc)[0]
-                        labels = tf.range(bsz, dtype=tf.int32)
-                        loss_value = tf.nn.sparse_softmax_cross_entropy_with_logits(
-                            labels=labels, logits=sim_matrix
-                        )
-                        loss_value = tf.reduce_mean(loss_value)
-                    gradients = tape.gradient(loss_value, train_vars)
-                    # -- Accumulate gradients --
-                    for i, grad in enumerate(gradients):
-                        if grad is not None:
-                            accum_grads[i] += tf.cast(grad, tf.float32)
-                    epoch_loss_avg(loss_value)
-                    # -- Apply gradients every 'accum_steps' mini-batches --
-                    if (step_in_epoch % accum_steps) == 0:
-                        # Scale by 1/accum_steps so that each accumulation cycle
-                        # is effectively the same as one “normal” update
-                        for i in range(len(accum_grads)):
-                            accum_grads[i] /= accum_steps
-                        self.optimizer.apply_gradients(
-                            [(accum_grads[i], train_vars[i]) for i in range(len(accum_grads))]
-                        )
-                        # Reset the accumulator
-                        accum_grads = [tf.zeros_like(var, dtype=tf.float32) for var in train_vars]
-                    # Logging / tqdm updates
-                    if use_lr_schedule:
-                        # measure current LR
-                        lr = self.optimizer.learning_rate
-                        if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
-                            current_step = tf.cast(self.optimizer.iterations, tf.float32)
-                            current_lr = lr(current_step)
                         else:
-                            current_lr = lr
-                        current_lr_value = float(current_lr.numpy())
-                    else:
-                        current_lr_value = float(self.optimizer.learning_rate.numpy())
-                    pbar.update(1)
-                    pbar.set_postfix({
-                        "loss": f"{loss_value.numpy():.4f}",
-                        "lr": f"{current_lr_value:.2e}"
-                    })
-                    # TensorBoard logging omitted for brevity...
-            # -- Handle leftover partial accumulation at epoch end --
-            leftover = (step_in_epoch % accum_steps)
-            if leftover != 0:
-                logger.info(f"Applying leftover accum_grads for partial batch group (size={leftover}).")
-                # If you want each leftover batch to contribute proportionally:
-                # multiply by leftover/accum_steps (this ensures leftover
-                # steps have the same "average" effect as a full accumulation cycle)
-                for i in range(len(accum_grads)):
-                    accum_grads[i] *= float(leftover) / float(accum_steps)
-                self.optimizer.apply_gradients(
-                    [(accum_grads[i], train_vars[i]) for i in range(len(accum_grads))]
-                )
-                accum_grads = [tf.zeros_like(var, dtype=tf.float32) for var in train_vars]
-            # Validation
-            val_loss_avg = tf.keras.metrics.Mean()
-            for q_val, p_val in val_dataset:
-                q_enc = self.encoder(q_val, training=False)
-                p_enc = self.encoder(p_val, training=False)
-                sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
-                bs_val = tf.shape(q_enc)[0]
-                labels_val = tf.range(bs_val, dtype=tf.int32)
-                loss_val = tf.nn.sparse_softmax_cross_entropy_with_logits(
-                    labels=labels_val,
-                    logits=sim_matrix
-                )
-                val_loss_avg(tf.reduce_mean(loss_val))
-            train_loss = epoch_loss_avg.result().numpy()
-            val_loss = val_loss_avg.result().numpy()
-            logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
-            # TensorBoard: validation loss
-            with val_summary_writer.as_default():
-                tf.summary.scalar("val_loss", val_loss, step=epoch)
-            # Save checkpoint
-            manager.save()
-            # Update history
-            self.history['train_loss'].append(train_loss)
-            self.history['val_loss'].append(val_loss)
-            self.history.setdefault('learning_rate', []).append(float(current_lr_value))
-            # Early stopping
-            if val_loss < best_val_loss - min_delta:
-                best_val_loss = val_loss
-                epochs_no_improve = 0
-                logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
             else:
-                epochs_no_improve += 1
-                logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
-                if epochs_no_improve >= early_stopping_patience:
-                    logger.info("Early stopping triggered.")
-                    break
-        logger.info("In-batch training completed!")
     def _get_lr_schedule(
         self,
@@ -994,277 +1382,554 @@ class RetrievalChatbot(DeviceAwareModel):
         conversation_parts.append(f"{self.special_tokens['user']} {query}")
         return "\n".join(conversation_parts)
-# def prepare_dataset(
-#     self,
-#     dialogues: List[dict],
-#     debug_samples: int = None
-# ) -> Tuple[tf.Tensor, tf.Tensor]:
-#     """
-#     Prepares dataset for in-batch negatives:
-#     Only returns (query, positive) pairs.
-#     """
-#     logger.info("Preparing in-batch dataset...")
-#     queries, positives = [], []
-#     for dialogue in dialogues:
-#         turns = dialogue.get('turns', [])
-#         for i in range(len(turns) - 1):
-#             current_turn = turns[i]
-#             next_turn = turns[i+1]
-#             if (current_turn.get('speaker') == 'user' and
-#                 next_turn.get('speaker') == 'assistant' and
-#                 'text' in current_turn and
-#                 'text' in next_turn):
-#                 query = current_turn['text'].strip()
-#                 positive = next_turn['text'].strip()
-#                 queries.append(query)
-#                 positives.append(positive)
-#     # Optional debug slicing
-#     if debug_samples is not None:
-#         queries = queries[:debug_samples]
-#         positives = positives[:debug_samples]
-#         logger.info(f"Debug mode: limited to {debug_samples} pairs.")
-#     logger.info(f"Prepared {len(queries)} (query, positive) pairs.")
-#     # Tokenize queries
-#     encoded_queries = self.tokenizer(
-#         queries,
-#         padding='max_length',
-#         truncation=True,
-#         max_length=self.config.max_sequence_length,
-#         return_tensors='tf'
-#     )
-#     # Tokenize positives
-#     encoded_positives = self.tokenizer(
-#         positives,
-#         padding='max_length',
-#         truncation=True,
-#         max_length=self.config.max_sequence_length,
-#         return_tensors='tf'
-#     )
-#     q_tensor = encoded_queries['input_ids']
-#     p_tensor = encoded_positives['input_ids']
-#     logger.info("Tokenized and padded sequences for in-batch training.")
-#     return q_tensor, p_tensor
-# def train(
-#     self,
-#     q_pad: tf.Tensor,
-#     p_pad: tf.Tensor,
-#     epochs: int = 20,
-#     batch_size: int = 16,
-#     validation_split: float = 0.2,
-#     checkpoint_dir: str = "checkpoints/",
-#     use_lr_schedule: bool = True,
-#     peak_lr: float = 2e-5,
-#     warmup_steps_ratio: float = 0.1,
-#     early_stopping_patience: int = 3,
-#     min_delta: float = 1e-4
-# ):
-#     dataset_size = tf.shape(q_pad)[0].numpy()
-#     val_size = int(dataset_size * validation_split)
-#     train_size = dataset_size - val_size
-#     logger.info(f"Total samples: {dataset_size}")
-#     logger.info(f"Training samples: {train_size}")
-#     logger.info(f"Validation samples: {val_size}")
-#     steps_per_epoch = train_size // batch_size
-#     if train_size % batch_size != 0:
-#         steps_per_epoch += 1
-#     total_steps = steps_per_epoch * epochs
-#     logger.info(f"Total training steps (approx): {total_steps}")
-#     # 1) Set up LR schedule or fixed LR
-#     if use_lr_schedule:
-#         warmup_steps = int(total_steps * warmup_steps_ratio)
-#         lr_schedule = self._get_lr_schedule(
-#             total_steps=total_steps,
-#             peak_lr=peak_lr,
-#             warmup_steps=warmup_steps
-#         )
-#         self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
-#         logger.info("Using custom learning rate schedule.")
-#     else:
-#         self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
-#         logger.info("Using fixed learning rate.")
-#     # 2) Prepare data splits
-#     train_q = q_pad[:train_size]
-#     train_p = p_pad[:train_size]
-#     val_q = q_pad[train_size:]
-#     val_p = p_pad[train_size:]
-#     train_dataset = tf.data.Dataset.from_tensor_slices((train_q, train_p))
-#     train_dataset = train_dataset.shuffle(buffer_size=4096).batch(batch_size)
-#     val_dataset = tf.data.Dataset.from_tensor_slices((val_q, val_p))
-#     val_dataset = val_dataset.batch(batch_size)
-#     # 3) Checkpoint + manager
-#     checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
-#     manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
-#     # 4) TensorBoard setup
-#     log_dir = Path(checkpoint_dir) / "tensorboard_logs"
-#     log_dir.mkdir(parents=True, exist_ok=True)
-#     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
-#     train_log_dir = str(log_dir / f"train_{current_time}")
-#     val_log_dir = str(log_dir / f"val_{current_time}")
-#     train_summary_writer = tf.summary.create_file_writer(train_log_dir)
-#     val_summary_writer = tf.summary.create_file_writer(val_log_dir)
-#     logger.info(f"TensorBoard logs will be saved in {log_dir}")
-#     # 5) Early stopping
-#     best_val_loss = float("inf")
-#     epochs_no_improve = 0
-#     logger.info("Beginning training loop...")
-#     global_step = 0
-#     from tqdm import tqdm
-#     for epoch in range(1, epochs + 1):
-#         logger.info(f"\n=== Epoch {epoch}/{epochs} ===")
-#         epoch_loss_avg = tf.keras.metrics.Mean()
-#         # Training loop
-#         with tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}") as pbar:
-#             for (q_batch, p_batch) in train_dataset:
-#                 global_step += 1
-#                 # Train step
-#                 batch_loss = self._train_step(q_batch, p_batch)
-#                 epoch_loss_avg(batch_loss)
-#                 # Get current LR
-#                 if use_lr_schedule:
-#                     lr = self.optimizer.learning_rate
-#                     if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
-#                         # Get the current step
-#                         current_step = tf.cast(self.optimizer.iterations, tf.float32)
-#                         # Compute the current learning rate
-#                         current_lr = lr(current_step)
-#                     else:
-#                         # If learning_rate is not a schedule, use it directly
-#                         current_lr = lr
-#                     # Convert to float for logging
-#                     current_lr_value = float(current_lr.numpy())
-#                 else:
-#                     # If using fixed learning rate
-#                     current_lr_value = float(self.optimizer.learning_rate.numpy())
-#                 # Update tqdm
-#                 pbar.update(1)
-#                 pbar.set_postfix({
-#                     "loss": f"{batch_loss.numpy():.4f}",
-#                     "lr": f"{current_lr_value:.2e}"
-#                 })
-#                 # TensorBoard: log train metrics per step
-#                 with train_summary_writer.as_default():
-#                     tf.summary.scalar("loss", batch_loss, step=global_step)
-#                     tf.summary.scalar("learning_rate", current_lr_value, step=global_step)
-#         # Validation
-#         val_loss_avg = tf.keras.metrics.Mean()
-#         for q_val, p_val in val_dataset:
-#             q_enc = self.encoder(q_val, training=False)
-#             p_enc = self.encoder(p_val, training=False)
-#             sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
-#             bs_val = tf.shape(q_enc)[0]
-#             labels_val = tf.range(bs_val, dtype=tf.int32)
-#             loss_val = tf.nn.sparse_softmax_cross_entropy_with_logits(
-#                 labels=labels_val,
-#                 logits=sim_matrix
-#             )
-#             val_loss_avg(tf.reduce_mean(loss_val))
-#         train_loss = epoch_loss_avg.result().numpy()
-#         val_loss = val_loss_avg.result().numpy()
-#         logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
-#         # TensorBoard: validation loss
-#         with val_summary_writer.as_default():
-#             tf.summary.scalar("val_loss", val_loss, step=epoch)
-#         # Save checkpoint
-#         manager.save()
-#         # Update history
-#         self.history['train_loss'].append(train_loss)
-#         self.history['val_loss'].append(val_loss)
-#         self.history.setdefault('learning_rate', []).append(float(current_lr_value))
-#         # Early stopping
-#         if val_loss < best_val_loss - min_delta:
-#             best_val_loss = val_loss
-#             epochs_no_improve = 0
-#             logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
-#         else:
-#             epochs_no_improve += 1
-#             logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
-#             if epochs_no_improve >= early_stopping_patience:
-#                 logger.info("Early stopping triggered.")
-#                 break
-#     logger.info("In-batch training completed!")
-# @tf.function
-# def _train_step(self, q_batch, p_batch):
-#     """
-#     Single training step using in-batch negatives.
-#     q_batch: (batch_size, seq_len) int32 input_ids for queries
-#     p_batch: (batch_size, seq_len) int32 input_ids for positives
-#     """
-#     with tf.GradientTape() as tape:
-#         # Encode queries and positives
-#         q_enc = self.encoder(q_batch, training=True)  # [B, emb_dim]
-#         p_enc = self.encoder(p_batch, training=True)  # [B, emb_dim]
-#         # Compute similarity matrix: (B, B) = q_enc * p_enc^T
-#         # If embeddings are L2-normalized, this is cosine similarity
-#         sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)  # [B, B]
-#         # Labels are just the diagonal indices
-#         batch_size = tf.shape(q_enc)[0]
-#         labels = tf.range(batch_size, dtype=tf.int32)  # [0..B-1]
-#         # Softmax cross-entropy
-#         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
-#             labels=labels,
-#             logits=sim_matrix
-#         )
-#         loss = tf.reduce_mean(loss)
-#     # Compute gradients for the pretrained DistilBERT variables only
-#     train_vars = self.encoder.pretrained.trainable_variables
-#     gradients = tape.gradient(loss, train_vars)
-#     # Remove any None grads (in case some layers are frozen)
-#     grads_and_vars = [(g, v) for g, v in zip(gradients, train_vars) if g is not None]
-#     if grads_and_vars:
-#         self.optimizer.apply_gradients(grads_and_vars)
-#     return loss

+import time
 from transformers import TFAutoModel, AutoTokenizer
 import tensorflow as tf
 import numpy as np
+import threading
+from queue import Queue, Empty
+from typing import Generator, List, Tuple, Dict, Optional, Union, Any
 import math
 from dataclasses import dataclass
 import json
 from pathlib import Path
 import datetime
 import faiss
+import gc
+import random
 from response_quality_checker import ResponseQualityChecker
 from cross_encoder_reranker import CrossEncoderReranker
 from conversation_summarizer import DeviceAwareModel, Summarizer
+from gpu_monitor import GPUMemoryMonitor
+import absl.logging
 from logger_config import config_logger
+from tqdm.auto import tqdm
+absl.logging.set_verbosity(absl.logging.WARNING)
 logger = config_logger(__name__)
 @dataclass
 class ChatbotConfig:
     """Configuration for the RetrievalChatbot."""
+    vocab_size: int = 30526  # DistilBERT vocab size + special tokens
     max_context_token_limit: int = 512
+    embedding_dim: int = 512
     encoder_units: int = 256
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
             summarizer = Summarizer(device=self.device)
         self.summarizer = summarizer
+        # # Configure XLA optimization if on GPU/TPU
+        # if self.device in ["GPU", "TPU"]:
+        #     tf.config.optimizer.set_jit(True)
+        #     logger.info(f"XLA compilation enabled for {self.device}")
+        # # Configure mixed precision for GPU/TPU
+        # if self.device != "CPU":
+        #     policy = tf.keras.mixed_precision.Policy('mixed_float16')
+        #     tf.keras.mixed_precision.set_global_policy(policy)
+        #     logger.info("Mixed precision training enabled (float16)")
         # Special tokens
         self.special_tokens = {
             {'additional_special_tokens': list(self.special_tokens.values())}
         )
+        self.memory_monitor = GPUMemoryMonitor()
+        self.min_batch_size = 8
+        self.max_batch_size = 128
+        self.current_batch_size = 32
+        # Collect unique responses from dialogues
+        self.response_pool, self.unique_responses = self._collect_responses(dialogues)
         # Initialize training history
         self.history = {
             "val_metrics": {}
         }
+    def build_models(self):
         """Initialize the shared encoder."""
         logger.info("Building encoder model...")
+        tf.keras.backend.clear_session()
         # Shared encoder for both queries and responses
         self.encoder = EncoderModel(
         self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
         logger.info(f"Token embeddings resized to: {new_vocab_size}")
+        # Initialize FAISS index (moved here from __init__)
+        self._initialize_faiss()
+        # Compute embeddings after FAISS is initialized and moved
+        self._compute_and_index_embeddings()
         # Try different ways to get embedding dimension
         try:
             logger.error("Vocabulary size is less than embedding dimension.")
             raise ValueError("Vocabulary size is less than embedding dimension.")
+    def _collect_responses(self, dialogues: List[dict]) -> Tuple[List[str], List[str]]:
+        """Collect all unique responses from dialogues."""
+        logger.info("Collecting responses from dialogues...")
         responses = []
+        try:
+            progress_bar = tqdm(dialogues, desc="Collecting assistant responses")
+        except ImportError:
+            progress_bar = dialogues
+            logger.info("Progress bar disabled - continuing without visual progress")
+        for dialogue in progress_bar:
             turns = dialogue.get('turns', [])
             for turn in turns:
                 if turn.get('speaker') == 'assistant' and 'text' in turn:
         unique_responses = list(set(responses))
         logger.info(f"Found {len(unique_responses)} unique responses.")
+        return responses, unique_responses
+    def _adjust_batch_size(self) -> None:
+        """
+        Halve or double `current_batch_size` as advised by the memory monitor.
+
+        A reduction is clamped to `min_batch_size` and followed by garbage
+        collection (plus a Keras session reset when a GPU is present); an
+        increase is clamped to `max_batch_size`. Nothing happens when the
+        clamped value equals the current one.
+        """
+        monitor = self.memory_monitor
+        if monitor.should_reduce_batch_size():
+            halved = max(self.min_batch_size, self.current_batch_size // 2)
+            if halved == self.current_batch_size:
+                return
+            logger.info(f"Reducing batch size to {halved} due to high memory usage")
+            self.current_batch_size = halved
+            # Release what the larger batches were holding on to.
+            gc.collect()
+            if tf.config.list_physical_devices('GPU'):
+                tf.keras.backend.clear_session()
+            return
+        if not monitor.can_increase_batch_size():
+            return
+        doubled = min(self.max_batch_size, self.current_batch_size * 2)
+        if doubled != self.current_batch_size:
+            logger.info(f"Increasing batch size to {doubled}")
+            self.current_batch_size = doubled
+    def _initialize_faiss(self):
+        """
+        Detect FAISS GPU support and create an empty inner-product index.
+
+        Sets:
+            self.faiss_gpu: True only when FAISS reports at least one GPU and
+                a resource object was created for each of them.
+            self.gpu_resources: one `faiss.StandardGpuResources` per GPU
+                (empty on CPU or after a failed GPU setup).
+            self.index: a CPU `faiss.IndexFlatIP` sized by
+                `config.embedding_dim`.
+
+        Raises:
+            Exception: whatever index creation raised, after logging it.
+        """
+        logger.info("Initializing FAISS index...")
+        # Assume CPU until GPU resources have actually been allocated.
+        self.faiss_gpu = False
+        self.gpu_resources = []
+        try:
+            if hasattr(faiss, 'get_num_gpus'):
+                ngpus = faiss.get_num_gpus()
+                if ngpus > 0:
+                    for _ in range(ngpus):
+                        res = faiss.StandardGpuResources()
+                        # Cap FAISS temp memory at 25% of the reported total to avoid OOM.
+                        if self.memory_monitor.has_gpu:
+                            stats = self.memory_monitor.get_memory_stats()
+                            if stats:
+                                res.setTempMemory(int(stats.total * 0.25))
+                        self.gpu_resources.append(res)
+                    self.faiss_gpu = True
+                    logger.info(f"FAISS GPU resources initialized on {ngpus} GPUs")
+                else:
+                    logger.info("No GPUs visible to FAISS, using CPU")
+            else:
+                logger.info("Using CPU-only FAISS build")
+        except Exception as e:
+            # Do not keep a half-built resource list around after a failure.
+            self.faiss_gpu = False
+            self.gpu_resources = []
+            logger.warning(f"Using CPU due to GPU initialization error: {e}")
+        # TODO: figure out bug with faiss-gpu. Until that is resolved the index
+        # is always a CPU IndexFlatIP here; the IVF variant
+        # (nlist = min(25, sqrt(n)), nprobe = min(10, nlist)) and the transfer
+        # of this index to the GPU(s) stay disabled.
+        try:
+            if len(self.unique_responses) < 1000:
+                logger.info("Small dataset detected, using simple FlatIP index")
+            self.index = faiss.IndexFlatIP(self.config.embedding_dim)
+        except Exception as e:
+            logger.error(f"Error initializing FAISS: {e}")
+            raise
     def encode_responses(
         self,
         batch_size: int = 64
     ) -> tf.Tensor:
         """
+        Encodes responses with more conservative memory management.
+
+        Responses are tokenized (padded/truncated to
+        `config.max_context_token_limit`) and encoded batch by batch; each
+        batch of embeddings is cast to float32 and the batches are concatenated
+        along axis 0. `self.current_batch_size` starts at `batch_size` and is
+        only ever lowered, never below `self.min_batch_size`.
+
+        Raises:
+            tf.errors.ResourceExhaustedError: if the GPU runs out of memory
+                while the batch size cannot be reduced any further.
+            Exception: any other tokenization/encoding error, after logging.
         """
         all_embeddings = []
+        # NOTE(review): `responses` is used below but is not visible in this
+        # hunk's signature; presumably a parameter elided by the diff. Confirm.
+        self.current_batch_size = batch_size
+        total_processed = 0
+        with tqdm(total=len(responses), desc="Encoding responses") as pbar:
+            while total_processed < len(responses):
+                # Monitor memory and shrink the batch under pressure. The floor is
+                # min_batch_size: a floor above the current size would enlarge the
+                # batch exactly when memory is scarce.
+                if self.memory_monitor.has_gpu:
+                    gpu_usage = self.memory_monitor.get_memory_usage()
+                    if gpu_usage > 0.8:  # Over 80% usage
+                        reduced = max(self.min_batch_size, self.current_batch_size // 2)
+                        if reduced < self.current_batch_size:
+                            self.current_batch_size = reduced
+                            logger.info(f"High GPU memory usage ({gpu_usage:.1%}), reducing batch size to {self.current_batch_size}")
+                        gc.collect()
+                        tf.keras.backend.clear_session()
+                # Get batch
+                end_idx = min(total_processed + self.current_batch_size, len(responses))
+                batch_texts = responses[total_processed:end_idx]
+                try:
+                    # Tokenize
+                    encodings = self.tokenizer(
+                        batch_texts,
+                        padding='max_length',
+                        truncation=True,
+                        max_length=self.config.max_context_token_limit,
+                        return_tensors='tf'
+                    )
+                    # Encode
+                    embeddings_batch = self.encoder(encodings['input_ids'], training=False)
+                    # Cast to float32
+                    if embeddings_batch.dtype != tf.float32:
+                        embeddings_batch = tf.cast(embeddings_batch, tf.float32)
+                    # Store
+                    all_embeddings.append(embeddings_batch)
+                    # Update progress
+                    batch_processed = len(batch_texts)
+                    total_processed += batch_processed
+                    # Update progress bar
+                    if self.memory_monitor.has_gpu:
+                        gpu_usage = self.memory_monitor.get_memory_usage()
+                        pbar.set_postfix({
+                            'GPU mem': f'{gpu_usage:.1%}',
+                            'batch_size': self.current_batch_size
+                        })
+                    pbar.update(batch_processed)
+                    # Memory cleanup every 1000 samples: trigger whenever this batch
+                    # crossed a multiple of 1000, since the running total rarely
+                    # lands exactly on one.
+                    if total_processed // 1000 != (total_processed - batch_processed) // 1000:
+                        gc.collect()
+                        if tf.config.list_physical_devices('GPU'):
+                            tf.keras.backend.clear_session()
+                except tf.errors.ResourceExhaustedError:
+                    reduced = max(self.min_batch_size, self.current_batch_size // 2)
+                    if reduced >= self.current_batch_size:
+                        # Retrying with the same size would loop forever.
+                        logger.error("GPU memory exhausted at minimum batch size, aborting encoding")
+                        raise
+                    logger.warning("GPU memory exhausted during encoding, reducing batch size")
+                    self.current_batch_size = reduced
+                except Exception as e:
+                    logger.error(f"Error during encoding: {str(e)}")
+                    raise
+        # Concatenate results
         if len(all_embeddings) == 1:
             final_embeddings = all_embeddings[0]
         else:
             final_embeddings = tf.concat(all_embeddings, axis=0)
         return final_embeddings
+    def _train_faiss_index(self, response_embeddings: np.ndarray) -> None:
+        """
+        Train `self.index` if it still needs training, with two fallbacks.
+
+        Strategy:
+            1. Train the current index on a random subset of up to 5000 rows.
+            2. If that fails, train on CPU with up to 2000 rows (an IVF index
+               is first swapped for a flat one) and move the result back to
+               the GPU(s) when `self.faiss_gpu` is set.
+            3. If that fails too, install a plain flat index as a last resort.
+
+        Every fallback index uses inner product, matching the `IndexFlatIP`
+        built elsewhere in this class over L2-normalized vectors; an L2 index
+        would return distances (smaller is better) instead of similarities.
+
+        Args:
+            response_embeddings: 2-D array with one response vector per row.
+
+        Raises:
+            RuntimeError: if even the last-resort index cannot be created.
+        """
+        if self.index.is_trained:
+            logger.info("Index already trained, skipping training phase")
+            return
+        logger.info("Starting FAISS index training...")
+        try:
+            # First attempt: train on a small random subset.
+            subset_size = min(5000, len(response_embeddings))
+            logger.info(f"Using {subset_size} samples for initial training attempt")
+            subset_idx = np.random.choice(len(response_embeddings), subset_size, replace=False)
+            # Integer-array indexing already yields a copy; make sure it is C-contiguous.
+            training_embeddings = np.ascontiguousarray(response_embeddings[subset_idx])
+            # Force cleanup before training
+            gc.collect()
+            if tf.config.list_physical_devices('GPU'):
+                tf.keras.backend.clear_session()
+            logger.info(f"FAISS training data shape: {training_embeddings.shape}")
+            logger.info(f"FAISS training data dtype: {training_embeddings.dtype}")
+            logger.info("Starting initial training attempt...")
+            self.index.train(training_embeddings)
+            logger.info("Training completed successfully")
+        except Exception as e:
+            logger.warning(f"Initial training attempt failed: {str(e)}")
+            logger.info("Attempting fallback strategy...")
+            try:
+                # Move to CPU for more stable training
+                if self.faiss_gpu:
+                    logger.info("Moving index to CPU for fallback training")
+                    cpu_index = faiss.index_gpu_to_cpu(self.index)
+                else:
+                    cpu_index = self.index
+                # Replace an IVF index by a flat inner-product index.
+                if isinstance(cpu_index, faiss.IndexIVFFlat):
+                    logger.info("Creating simpler FlatIP index for fallback")
+                    cpu_index = faiss.IndexFlatIP(self.config.embedding_dim)
+                # Use an even smaller subset for CPU training
+                subset_size = min(2000, len(response_embeddings))
+                subset_idx = np.random.choice(len(response_embeddings), subset_size, replace=False)
+                # float32 and C-contiguous in one step.
+                fallback_embeddings = np.ascontiguousarray(
+                    response_embeddings[subset_idx], dtype=np.float32
+                )
+                logger.info("Training fallback index on CPU...")
+                cpu_index.train(fallback_embeddings)
+                # Move back to GPU if needed
+                if self.faiss_gpu:
+                    logger.info("Moving trained index back to GPU...")
+                    if len(self.gpu_resources) > 1:
+                        self.index = faiss.index_cpu_to_gpus_list(cpu_index, self.gpu_resources)
+                    else:
+                        self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, cpu_index)
+                else:
+                    self.index = cpu_index
+                logger.info("Fallback training completed successfully")
+            except Exception as e2:
+                logger.error(f"Fallback training also failed: {str(e2)}")
+                logger.warning("Creating basic brute-force index as last resort")
+                try:
+                    # Brute-force inner-product index sized from the data itself.
+                    dim = response_embeddings.shape[1]
+                    basic_index = faiss.IndexFlatIP(dim)
+                    if self.faiss_gpu:
+                        if len(self.gpu_resources) > 1:
+                            self.index = faiss.index_cpu_to_gpus_list(basic_index, self.gpu_resources)
+                        else:
+                            self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, basic_index)
+                    else:
+                        self.index = basic_index
+                    logger.info("Basic index created as fallback")
+                except Exception as e3:
+                    logger.error(f"All training attempts failed: {str(e3)}")
+                    # Keep the underlying cause attached for debugging.
+                    raise RuntimeError("Unable to create working FAISS index") from e3
+    def _add_vectors_to_index(self, response_embeddings: np.ndarray) -> None:
+        """
+        Feed `response_embeddings` into `self.index` in adaptively sized batches.
+
+        The batch starts at 50 rows, grows by 25 after every successful add (up
+        to 500) and is halved (down to 10) after a failure. A failed batch is
+        retried after a one-second pause; the sixth failure is re-raised.
+        """
+        logger.info("Starting vector addition process...")
+        step, floor, ceiling = 50, 10, 500
+        allowed_failures = 5
+        failures = 0
+        total_added = 0
+
+        def release_memory():
+            """Collect garbage and reset the Keras session when a GPU exists."""
+            gc.collect()
+            if tf.config.list_physical_devices('GPU'):
+                tf.keras.backend.clear_session()
+
+        while total_added < len(response_embeddings):
+            try:
+                # Clean up proactively once GPU usage passes 70%.
+                if self.memory_monitor.has_gpu:
+                    if self.memory_monitor.get_memory_usage() > 0.7:
+                        logger.info("High memory usage detected, forcing cleanup")
+                        gc.collect()
+                        tf.keras.backend.clear_session()
+                # Slicing clamps at the end of the array, so no explicit min() is needed.
+                chunk = response_embeddings[total_added:total_added + step]
+                self.index.add(chunk)
+                total_added += len(chunk)
+                # Memory cleanup every few batches
+                if total_added % (step * 5) == 0:
+                    release_memory()
+                # Gradually increase batch size (step never exceeds the ceiling).
+                step = min(step + 25, ceiling)
+            except Exception as e:
+                logger.warning(f"Error adding batch: {str(e)}")
+                failures += 1
+                if failures > allowed_failures:
+                    logger.error("Max retries exceeded.")
+                    raise
+                step = max(floor, step // 2)
+                logger.info(f"Reducing batch size to {step} and retrying...")
+                release_memory()
+                time.sleep(1)  # Brief pause before retry
+        logger.info(f"Successfully added all {total_added} vectors to index")
+    def _add_vectors_cpu_fallback(self, remaining_embeddings: np.ndarray, already_added: int = 0) -> None:
+        """
+        Add `remaining_embeddings` to the index on CPU in batches of 50.
+
+        When `self.faiss_gpu` is set the index is first copied to the CPU and,
+        after all rows were added, copied back to the GPU(s). `already_added`
+        only offsets the running total shown in progress logs. Any error is
+        logged and re-raised.
+        """
+        n_remaining = len(remaining_embeddings)
+        logger.info(f"CPU Fallback: Adding {n_remaining} remaining vectors...")
+        try:
+            if self.faiss_gpu:
+                logger.info("Moving index to CPU...")
+                host_index = faiss.index_gpu_to_cpu(self.index)
+            else:
+                host_index = self.index
+            chunk = 50  # Extremely conservative batch size for CPU
+            running_total = already_added
+            for offset in range(0, n_remaining, chunk):
+                # Slicing clamps at the array end on its own.
+                piece = remaining_embeddings[offset:offset + chunk]
+                host_index.add(piece)
+                running_total += len(piece)
+                # Progress line every 10 batches, garbage collection every 20.
+                if offset % (chunk * 10) == 0:
+                    logger.info(f"Added {running_total} vectors total ({offset}/{n_remaining} in current phase)")
+                if offset % (chunk * 20) == 0:
+                    gc.collect()
+            if not self.faiss_gpu:
+                self.index = host_index
+            else:
+                logger.info("Moving index back to GPU...")
+                if len(self.gpu_resources) > 1:
+                    self.index = faiss.index_cpu_to_gpus_list(host_index, self.gpu_resources)
+                else:
+                    self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, host_index)
+            logger.info("CPU fallback completed successfully")
+        except Exception as e:
+            logger.error(f"Error during CPU fallback: {str(e)}")
+            raise
+    def _compute_and_index_embeddings(self):
+        """
+        Encode `self.unique_responses`, L2-normalize them and index them.
+
+        A fresh `IndexFlatIP` (moved to the GPU(s) when `self.faiss_gpu` is
+        set) replaces `self.index`; the normalized vectors are stored in
+        `self.response_embeddings` and `self.response_pool` is pointed at the
+        unique responses. Errors are logged and re-raised after a cleanup.
+        """
+        logger.info("Computing embeddings and indexing with FAISS...")
+
+        def free_memory():
+            """Collect garbage and reset the Keras session when a GPU exists."""
+            gc.collect()
+            if tf.config.list_physical_devices('GPU'):
+                tf.keras.backend.clear_session()
+
+        try:
+            logger.info("Encoding unique responses")
+            vectors = self.encode_responses(self.unique_responses).numpy()
+            free_memory()
+            # astype() copies, so the in-place normalization below works on our own
+            # float32, C-contiguous buffer.
+            vectors = np.ascontiguousarray(vectors.astype('float32'))
+            if self.memory_monitor.has_gpu:
+                stats = self.memory_monitor.get_memory_stats()
+                if stats:
+                    logger.info(f"GPU memory before normalization: {stats.used/1e9:.2f}GB used")
+            logger.info("Normalizing embeddings with FAISS")
+            faiss.normalize_L2(vectors)
+            # Inner product over unit vectors, sized from the data itself.
+            flat_index = faiss.IndexFlatIP(vectors.shape[1])
+            if not self.faiss_gpu:
+                self.index = flat_index
+            elif len(self.gpu_resources) > 1:
+                self.index = faiss.index_cpu_to_gpus_list(flat_index, self.gpu_resources)
+            else:
+                self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, flat_index)
+            self._add_vectors_to_index(vectors)
+            # Keep pool and embeddings aligned with the index rows.
+            self.response_pool = self.unique_responses
+            self.response_embeddings = vectors
+            free_memory()
+            logger.info(f"Successfully indexed {self.index.ntotal} responses")
+            if self.memory_monitor.has_gpu:
+                stats = self.memory_monitor.get_memory_stats()
+                if stats:
+                    logger.info(f"Final GPU memory usage: {stats.used/1e9:.2f}GB used")
+            logger.info("Indexing completed successfully")
+        except Exception as e:
+            logger.error(f"Error during indexing: {e}")
+            # Ensure cleanup even on error
+            free_memory()
+            raise
+    def verify_faiss_index(self):
+        """
+        Log the FAISS index size and the response pool size and whether they agree.
+
+        Only logs; a mismatch produces a warning, not an exception.
+        """
+        n_indexed = self.index.ntotal
+        n_pool = len(self.response_pool)
+        logger.info(f"FAISS index size: {n_indexed}")
+        logger.info(f"Response pool size: {n_pool}")
+        if n_indexed == n_pool:
+            logger.info("FAISS index correctly matches the response pool.")
+            return
+        logger.warning("Mismatch between FAISS index size and response pool size.")
     def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
         """Encode a query with optional conversation context."""
         # Prepare query with context
         """Retrieve top-k responses using FAISS."""
         # Encode the query
         q_emb = self.encode_query(query)  # Shape: [1, embedding_dim]
+        q_emb_np = q_emb.numpy().astype('float32')  # Ensure type match
         # Normalize the query embedding for cosine similarity
         faiss.normalize_L2(q_emb_np)
         logger.info(f"Loaded {len(dialogues)} dialogues.")
         return dialogues
+    def train_streaming(
         self,
+        dialogues: List[dict],
         epochs: int = 20,
         batch_size: int = 16,
         validation_split: float = 0.2,
         warmup_steps_ratio: float = 0.1,
         early_stopping_patience: int = 3,
         min_delta: float = 1e-4,
+        buffer_size: int = 10,
+        neg_samples: int = 1
+    ) -> None:
+        """
+        Streaming version of training that interleaves training/val batches by
+        giving priority to training until we meet `steps_per_epoch`, then
+        sending leftover batches to validation.
+        """
+        logger.info("Starting streaming training pipeline...")
+        # Initialize dataset preparer
+        dataset_preparer = StreamingDataPipeline(
+            tokenizer=self.tokenizer,
+            encoder=self.encoder,
+            index=self.index,
+            response_pool=self.response_pool,
+            max_length=self.config.max_context_token_limit,
+            batch_size=batch_size,
+            neg_samples=neg_samples
+        )
+        # Calculate total steps for learning rate schedule
+        total_pairs = dataset_preparer.estimate_total_pairs(dialogues)
+        train_size = total_pairs * (1 - validation_split)
+        steps_per_epoch = int(math.ceil(train_size / batch_size))
+        val_steps = int(math.ceil((total_pairs * validation_split) / batch_size))
         total_steps = steps_per_epoch * epochs
+        logger.info(f"Total pairs: {total_pairs}")
+        logger.info(f"Training pairs: {train_size}")
+        logger.info(f"Steps per epoch: {steps_per_epoch}")
+        logger.info(f"Validation steps: {val_steps}")
+        logger.info(f"Total steps: {total_steps}")
+        # Set up optimizer with learning rate schedule
         if use_lr_schedule:
             warmup_steps = int(total_steps * warmup_steps_ratio)
             lr_schedule = self._get_lr_schedule(
             self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
             logger.info("Using fixed learning rate.")
+        # Initialize checkpoint manager
         checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
         manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
+        # Setup TensorBoard
         log_dir = Path(checkpoint_dir) / "tensorboard_logs"
         log_dir.mkdir(parents=True, exist_ok=True)
         current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
         train_log_dir = str(log_dir / f"train_{current_time}")
         val_log_dir = str(log_dir / f"val_{current_time}")
         train_summary_writer = tf.summary.create_file_writer(train_log_dir)
         val_summary_writer = tf.summary.create_file_writer(val_log_dir)
         logger.info(f"TensorBoard logs will be saved in {log_dir}")
+        # Training loop
         best_val_loss = float("inf")
         epochs_no_improve = 0
+        try:
+            epoch_pbar = tqdm(range(1, epochs + 1), desc="Training", unit="epoch")
+            is_tqdm_epoch = True
+        except ImportError:
+            epoch_pbar = range(1, epochs + 1)
+            is_tqdm_epoch = False
+            logger.info("Epoch progress bar disabled - continuing without visual progress")
+        for epoch in epoch_pbar:
+            # Shared queues for streaming pipeline
+            train_queue = Queue(maxsize=buffer_size)
+            val_queue = Queue(maxsize=buffer_size)
+            stop_flag = threading.Event()
+            def data_pipeline_worker():
+                """Thread function that processes dialogues and sends batches to train or val."""
+                try:
+                    train_batches_needed = steps_per_epoch  # 9 in your logs
+                    val_batches_needed = val_steps          # 3 in your logs
+                    train_batches_sent = 0
+                    val_batches_sent = 0
+                    logger.info(f"Pipeline starting: need {train_batches_needed} train batches, {val_batches_needed} val batches")
+                    # Possibly shuffle your processed pairs to avoid repeating them in the same order
+                    # (If you haven't already done so in the pipeline)
+                    random.shuffle(dataset_preparer.processed_pairs)
+                    while (train_batches_sent < train_batches_needed or
+                        val_batches_sent < val_batches_needed):
+                        # We loop over the generator
+                        for batch in dataset_preparer.process_dialogues(dialogues):
+                            if stop_flag.is_set():
+                                logger.warning("Pipeline stopped early")
+                                break
+                            if train_batches_sent < train_batches_needed:
+                                train_queue.put(batch)
+                                train_batches_sent += 1
+                            elif val_batches_sent < val_batches_needed:
+                                val_queue.put(batch)
+                                val_batches_sent += 1
+                            else:
+                                # We have enough batches for both train & val
+                                break
+                        # If we still haven't met our target steps, REPEAT the data
+                        if train_batches_sent < train_batches_needed or val_batches_sent < val_batches_needed:
+                            logger.info("Data exhausted, repeating since we still need more batches...")
+                            # Optionally shuffle again
+                            random.shuffle(dataset_preparer.processed_pairs)
+                        else:
+                            # We have enough
+                            break
+                    logger.info(
+                        f"Pipeline complete: sent {train_batches_sent}/{train_batches_needed} train batches, "
+                        f"{val_batches_sent}/{val_batches_needed} val batches"
+                    )
+                except Exception as e:
+                    logger.error(f"Error in pipeline worker: {str(e)}")
+                    raise e
+                finally:
+                    train_queue.put(None)
+                    val_queue.put(None)
+            # Start data preparation pipeline in background thread
+            pipeline_thread = threading.Thread(target=data_pipeline_worker)
+            pipeline_thread.start()
+            try:
+                # --- Training Phase ---
+                epoch_loss_avg = tf.keras.metrics.Mean()
+                batches_processed = 0
+                try:
+                    train_pbar = tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}")
+                    is_tqdm_train = True
+                except ImportError:
+                    train_pbar = None
+                    is_tqdm_train = False
+                    logger.info("Training progress bar disabled")
+                while batches_processed < steps_per_epoch:
+                    try:
+                        batch = train_queue.get(timeout=1200) # 20 minutes timeout
+                        if batch is None:
+                            logger.warning(f"Received end signal after only {batches_processed}/{steps_per_epoch} batches")
+                            break
+                        q_batch, p_batch = batch[0], batch[1]
+                        attention_mask = batch[2] if len(batch) > 2 else None
+                        loss = self.train_step(q_batch, p_batch, attention_mask)
+                        epoch_loss_avg(loss)
+                        batches_processed += 1
+                        # Log to TensorBoard
+                        with train_summary_writer.as_default():
+                            tf.summary.scalar("loss", loss, step=epoch)
+                        # Update progress bar
+                        if use_lr_schedule:
+                            current_lr = float(lr_schedule(self.optimizer.iterations))
                         else:
+                            current_lr = float(self.optimizer.learning_rate.numpy())
+                        if is_tqdm_train:
+                            train_pbar.update(1)
+                            train_pbar.set_postfix({
+                                "loss": f"{loss.numpy():.4f}",
+                                "lr": f"{current_lr:.2e}",
+                                "batches": f"{batches_processed}/{steps_per_epoch}"
+                            })
+                    except Empty:
+                        logger.warning(f"Queue timeout after {batches_processed}/{steps_per_epoch} batches")
+                        break
+                if is_tqdm_train and train_pbar:
+                    train_pbar.close()
+                # --- Validation Phase ---
+                val_loss_avg = tf.keras.metrics.Mean()
+                val_batches_processed = 0
+                try:
+                    val_pbar = tqdm(total=val_steps, desc="Validation")
+                    is_tqdm_val = True
+                except ImportError:
+                    val_pbar = None
+                    is_tqdm_val = False
+                    logger.info("Validation progress bar disabled")
+                while val_batches_processed < val_steps:
+                    try:
+                        batch = val_queue.get(timeout=30)
+                        if batch is None:
+                            logger.warning(
+                                f"Received end signal after {val_batches_processed}/{val_steps} validation batches"
+                            )
+                            break
+                        q_batch, p_batch = batch[0], batch[1]
+                        attention_mask = batch[2] if len(batch) > 2 else None
+                        val_loss = self.validation_step(q_batch, p_batch, attention_mask)
+                        val_loss_avg(val_loss)
+                        val_batches_processed += 1
+                        if is_tqdm_val:
+                            val_pbar.update(1)
+                            val_pbar.set_postfix({
+                                "val_loss": f"{val_loss.numpy():.4f}",
+                                "batches": f"{val_batches_processed}/{val_steps}"
+                            })
+                    except Empty:
+                        logger.warning(
+                            f"Validation queue timeout after {val_batches_processed}/{val_steps} batches"
+                        )
+                        break
+                if is_tqdm_val and val_pbar:
+                    val_pbar.close()
+                # End of epoch: compute final epoch stats
+                train_loss = epoch_loss_avg.result().numpy()
+                val_loss = val_loss_avg.result().numpy()
+                logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
+                # Log epoch metrics
+                with val_summary_writer.as_default():
+                    tf.summary.scalar("val_loss", val_loss, step=epoch)
+                # Save checkpoint
+                manager.save()
+                # Store metrics in history
+                self.history['train_loss'].append(train_loss)
+                self.history['val_loss'].append(val_loss)
+                if use_lr_schedule:
+                    current_lr = float(lr_schedule(self.optimizer.iterations))
+                else:
+                    current_lr = float(self.optimizer.learning_rate.numpy())
+                self.history.setdefault('learning_rate', []).append(current_lr)
+                # Early stopping logic
+                if val_loss < best_val_loss - min_delta:
+                    best_val_loss = val_loss
+                    epochs_no_improve = 0
+                    logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
+                else:
+                    epochs_no_improve += 1
+                    logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
+                    if epochs_no_improve >= early_stopping_patience:
+                        logger.info("Early stopping triggered.")
+                        break
+            except Exception as e:
+                logger.error(f"Error during training: {str(e)}")
+                stop_flag.set()
+                raise e
+            finally:
+                # Clean up epoch resources
+                stop_flag.set()
+                pipeline_thread.join()
+        logger.info("Streaming training completed!")
+    @tf.function
+    def train_step(self, q_batch: tf.Tensor, p_batch: tf.Tensor, attention_mask: Optional[tf.Tensor] = None) -> tf.Tensor:
+        """Single training step with tf.function optimization and partial batch handling."""
+        with tf.GradientTape() as tape:
+            q_enc = self.encoder(q_batch, training=True)
+            p_enc = self.encoder(p_batch, training=True)
+            sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
+            # Handle partial batches
+            batch_size = tf.shape(q_enc)[0]
+            labels = tf.range(batch_size, dtype=tf.int32)
+            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=labels, logits=sim_matrix
+            )
+            # If there's an attention mask, apply it
+            if attention_mask is not None:
+                loss = loss * attention_mask
+                # normalize by the sum of attention_mask
+                loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
             else:
+                loss = tf.reduce_mean(loss)
+        gradients = tape.gradient(loss, self.encoder.trainable_variables)
+        self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
+        return loss
+    @tf.function
+    def validation_step(self, q_batch: tf.Tensor, p_batch: tf.Tensor, attention_mask: Optional[tf.Tensor] = None) -> tf.Tensor:
+        """Single validation step with partial batch handling."""
+        q_enc = self.encoder(q_batch, training=False)
+        p_enc = self.encoder(p_batch, training=False)
+        sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
+        batch_size = tf.shape(q_enc)[0]
+        labels = tf.range(batch_size, dtype=tf.int32)
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+            labels=labels, logits=sim_matrix
+        )
+        if attention_mask is not None:
+            loss = loss * attention_mask
+            loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
+        else:
+            loss = tf.reduce_mean(loss)
+        return loss
     def _get_lr_schedule(
         self,
         conversation_parts.append(f"{self.special_tokens['user']} {query}")
         return "\n".join(conversation_parts)
+class StreamingDataPipeline:
+    """Helper class to manage the streaming data preparation pipeline with optimized caching and GPU usage."""
+    def __init__(
+        self,
+        tokenizer,
+        encoder,
+        index,
+        response_pool,
+        max_length: int,
+        batch_size: int,
+        neg_samples: int
+    ):
+        self.tokenizer = tokenizer
+        self.encoder = encoder
+        self.index = index
+        self.response_pool = response_pool
+        self.max_length = max_length
+        self.base_batch_size = batch_size
+        self.neg_samples = neg_samples
+        self.memory_monitor = GPUMemoryMonitor()
+        # Caching structures
+        self.hard_negatives_cache = {}
+        self.processed_pairs = []
+        self.query_embeddings_cache = {}
+        # Error tracking
+        self.error_count = 0
+        self.max_retries = 3
+        # Batch processing settings
+        self.current_batch_size = batch_size
+        self.batch_increase_factor = 1.25
+        # TODO: use GPU/strategy
+        if len(response_pool) < 100:
+            self.embedding_batch_size = 16
+            self.search_batch_size = 8
+            self.max_batch_size = 32
+            self.min_batch_size = 4
+        else:
+            self.embedding_batch_size = 64
+            self.search_batch_size = 32
+            self.min_batch_size = max(8, batch_size // 4)
+            self.max_batch_size = 64
+    def save_cache(self, cache_dir: Path) -> None:
+        """Save all cached data for future runs."""
+        cache_dir = Path(cache_dir)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        logger.info(f"Saving cache to {cache_dir}")
+        # Save embeddings cache
+        embeddings_path = cache_dir / "query_embeddings.npy"
+        np.save(
+            embeddings_path,
+            {k: v.numpy() if hasattr(v, 'numpy') else v
+             for k, v in self.query_embeddings_cache.items()}
+        )
+        # Save hard negatives and processed pairs
+        with open(cache_dir / "hard_negatives.json", 'w') as f:
+            json.dump(self.hard_negatives_cache, f)
+        with open(cache_dir / "processed_pairs.json", 'w') as f:
+            json.dump(self.processed_pairs, f)
+        logger.info("Cache saved successfully")
+    def load_cache(self, cache_dir: Path) -> bool:
+        """Load cached data if available."""
+        cache_dir = Path(cache_dir)
+        required_files = [
+            "query_embeddings.npy",
+            "hard_negatives.json",
+            "processed_pairs.json"
+        ]
+        if not all((cache_dir / f).exists() for f in required_files):
+            logger.info("Cache files not found")
+            return False
+        try:
+            logger.info("Loading cache...")
+            # Load embeddings
+            self.query_embeddings_cache = np.load(
+                cache_dir / "query_embeddings.npy",
+                allow_pickle=True
+            ).item()
+            # Load other caches
+            with open(cache_dir / "hard_negatives.json", 'r') as f:
+                self.hard_negatives_cache = json.load(f)
+            with open(cache_dir / "processed_pairs.json", 'r') as f:
+                self.processed_pairs = json.load(f)
+            logger.info(f"Cache loaded successfully: {len(self.processed_pairs)} pairs")
+            return True
+        except Exception as e:
+            logger.error(f"Error loading cache: {e}")
+            return False
+    def _adjust_batch_size(self) -> None:
+        """Dynamically adjust batch size based on GPU memory usage."""
+        if self.memory_monitor:
+            if self.memory_monitor.should_reduce_batch_size():
+                new_size = max(self.min_batch_size, self.current_batch_size // 2)
+                if new_size != self.current_batch_size:
+                    if new_size < self.min_batch_size:
+                        logger.info(f"Reducing batch size to {new_size} due to high memory usage")
+                    self.current_batch_size = new_size
+                    gc.collect()
+                    if tf.config.list_physical_devices('GPU'):
+                        tf.keras.backend.clear_session()
+            elif self.memory_monitor.can_increase_batch_size():
+                new_size = min(self.max_batch_size, int(self.current_batch_size * self.batch_increase_factor)) # More gradual increase
+                if new_size != self.current_batch_size:
+                    if new_size > self.max_batch_size:
+                        logger.info(f"Increasing batch size to {new_size}")
+                    self.current_batch_size = new_size
+    def _add_progress_metrics(self, pbar, **metrics) -> None:
+        """Add memory and batch size metrics to progress bars."""
+        if self.memory_monitor:
+            gpu_usage = self.memory_monitor.get_memory_usage()
+            metrics['gpu_mem'] = f"{gpu_usage:.1%}"
+        metrics['batch_size'] = self.current_batch_size
+        pbar.set_postfix(**metrics)
+    def preprocess_dialogues(self, dialogues: List[dict]) -> None:
+        """Preprocess all dialogues with error recovery and caching."""
+        retry_count = 0
+        while retry_count < self.max_retries:
+            try:
+                self._preprocess_dialogues_internal(dialogues)
+                break
+            except Exception as e:
+                retry_count += 1
+                logger.warning(f"Preprocessing attempt {retry_count} failed: {e}")
+                if retry_count == self.max_retries:
+                    logger.error("Max retries reached. Falling back to CPU processing")
+                    self._fallback_to_cpu_processing(dialogues)
+    def _preprocess_dialogues_internal(self, dialogues: List[dict]) -> None:
+        """Internal preprocessing implementation with progress tracking."""
+        logger.info("Starting dialogue preprocessing...")
+        # Collect unique queries and pairs
+        unique_queries = set()
+        query_positive_pairs = []
+        with tqdm(total=len(dialogues), desc="Collecting dialogue pairs") as pbar:
+            for dialogue in dialogues:
+                pairs = self._extract_pairs_from_dialogue(dialogue)
+                for query, positive in pairs:
+                    unique_queries.add(query)
+                    query_positive_pairs.append((query, positive))
+                pbar.update(1)
+                self._add_progress_metrics(pbar, pairs=len(query_positive_pairs))
+        # Precompute embeddings
+        logger.info("Precomputing query embeddings...")
+        self.precompute_query_embeddings(list(unique_queries))
+        # Find hard negatives
+        logger.info("Finding hard negatives for all pairs...")
+        self._find_hard_negatives_for_pairs(query_positive_pairs)
+    def precompute_query_embeddings(self, queries: List[str]) -> None:
+        """Precompute embeddings for all unique queries in batches."""
+        unique_queries = list(set(queries))
+        with tqdm(total=len(unique_queries), desc="Precomputing query embeddings") as pbar:
+            for i in range(0, len(unique_queries), self.embedding_batch_size):
+                # Adjust batch size based on memory
+                self._adjust_batch_size()
+                batch_size = min(self.embedding_batch_size, len(unique_queries) - i)
+                # Get batch of queries
+                batch_queries = unique_queries[i:i + batch_size]
+                try:
+                    # Tokenize batch
+                    encoded = self.tokenizer(
+                        batch_queries,
+                        padding=True,
+                        truncation=True,
+                        max_length=self.max_length,
+                        return_tensors='tf'
+                    )
+                    # Get embeddings
+                    embeddings = self.encoder(encoded['input_ids'], training=False)
+                    embeddings_np = embeddings.numpy().astype('float32')
+                    # Normalize for similarity search
+                    faiss.normalize_L2(embeddings_np)
+                    # Cache embeddings
+                    for query, emb in zip(batch_queries, embeddings_np):
+                        self.query_embeddings_cache[query] = emb
+                    pbar.update(len(batch_queries))
+                    self._add_progress_metrics(
+                        pbar,
+                        cached=len(self.query_embeddings_cache),
+                        batch_size=batch_size
+                    )
+                except Exception as e:
+                    logger.warning(f"Error processing batch: {e}")
+                    # Reduce batch size and retry
+                    self.embedding_batch_size = max(self.min_batch_size, self.embedding_batch_size // 2)
+                    continue
+                # Memory cleanup after successful batch
+                if i % (self.embedding_batch_size * 10) == 0:
+                    gc.collect()
+                    if tf.config.list_physical_devices('GPU'):
+                        tf.keras.backend.clear_session()
+        logger.info(f"Cached embeddings for {len(self.query_embeddings_cache)} unique queries")
+    def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
+        """Extract query-response pairs from a dialogue."""
+        pairs = []
+        turns = dialogue.get('turns', [])
+        for i in range(len(turns) - 1):
+            current_turn = turns[i]
+            next_turn = turns[i+1]
+            if (current_turn.get('speaker') == 'user' and
+                next_turn.get('speaker') == 'assistant' and
+                'text' in current_turn and
+                'text' in next_turn):
+                query = current_turn['text'].strip()
+                positive = next_turn['text'].strip()
+                pairs.append((query, positive))
+        return pairs
+    def _fallback_to_cpu_processing(self, dialogues: List[dict]) -> None:
+        """Fallback processing method using CPU only."""
+        logger.info("Falling back to CPU-only processing")
+        # Reset GPU-specific settings
+        self.current_batch_size = self.min_batch_size
+        self.embedding_batch_size = 32
+        self.search_batch_size = 16
+        # Attempt preprocessing with reduced batches
+        self._preprocess_dialogues_internal(dialogues)
+    def process_dialogues(self, dialogues: List[dict]) -> Generator[Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor]], None, None]:
+        """
+        Process dialogues using cached data with dynamic batch sizing.
+        Yields (q_tokens['input_ids'], p_tokens['input_ids'], attention_mask) tuples.
+        """
+        # Preprocess if not already done
+        if not self.processed_pairs:
+            self.preprocess_dialogues(dialogues)
+        # Generate batches from cached data
+        current_queries = []
+        current_positives = []
+        # Counters for logging
+        total_examples_yielded = 0
+        total_batches_yielded = 0
+        with tqdm(total=len(self.processed_pairs), desc="Generating training batches", leave=False) as pbar:
+            for i, (query, positive) in enumerate(self.processed_pairs):
+                # Periodically adjust batch size
+                if i % 10 == 0:  # Check more frequently (e.g., every 10 pairs)
+                    self._adjust_batch_size()
+                # Add original pair
+                current_queries.append(query)
+                current_positives.append(positive)
+                # Add cached hard negatives for each query
+                hard_negatives = self.hard_negatives_cache.get((query, positive), [])
+                for neg_text in hard_negatives:
+                    current_queries.append(query)
+                    current_positives.append(neg_text)
+                # If we have enough examples to form a full batch, yield it
+                while len(current_queries) >= self.current_batch_size:
+                    batch_queries = current_queries[:self.current_batch_size]
+                    batch_positives = current_positives[:self.current_batch_size]
+                    # Update counters and logs
+                    batch_size_to_yield = len(batch_queries)
+                    total_examples_yielded += batch_size_to_yield
+                    total_batches_yielded += 1
+                    yield self._prepare_batch(batch_queries, batch_positives, pad_to_batch_size=False)
+                    # Remove used entries
+                    current_queries = current_queries[self.current_batch_size:]
+                    current_positives = current_positives[self.current_batch_size:]
+                # Update progress bar
+                pbar.update(1)
+                self._add_progress_metrics(
+                    pbar,
+                    pairs_processed=pbar.n,
+                    pending_pairs=len(current_queries)
+                )
+        # After the loop, if anything is left, yield a final partial batch
+        if current_queries:
+            leftover_size = len(current_queries)
+            total_examples_yielded += leftover_size
+            total_batches_yielded += 1
+            yield self._prepare_batch(
+                current_queries,
+                current_positives,
+                pad_to_batch_size=True
+            )
+    def _find_hard_negatives_for_pairs(self, query_positive_pairs: List[Tuple[str, str]]) -> None:
+        """Process pairs in batches to find hard negatives with GPU acceleration."""
+        total_pairs = len(query_positive_pairs)
+        # Use smaller batch size for small datasets
+        if len(self.response_pool) < 1000:
+            batch_size = min(8, self.search_batch_size)
+        else:
+            batch_size = self.search_batch_size
+        try:
+            pbar = tqdm(total=total_pairs, desc="Finding hard negatives")
+            is_tqdm = True
+        except ImportError:
+            pbar = None
+            is_tqdm = False
+            logger.info("Progress bar disabled - continuing without visual progress")
+        for i in range(0, total_pairs, batch_size):
+            self._adjust_batch_size()
+            batch_pairs = query_positive_pairs[i:i + batch_size]
+            batch_queries, batch_positives = zip(*batch_pairs)
+            batch_negatives = self._find_hard_negatives_batch(
+                list(batch_queries),
+                list(batch_positives)
+            )
+            for query, positive, negatives in zip(batch_queries, batch_positives, batch_negatives):
+                self.hard_negatives_cache[(query, positive)] = negatives
+                self.processed_pairs.append((query, positive))
+            if is_tqdm:
+                pbar.update(len(batch_pairs))
+                self._add_progress_metrics(
+                    pbar,
+                    cached=len(self.processed_pairs),
+                    progress=f"{i+len(batch_pairs)}/{total_pairs}"
+                )
+        if is_tqdm:
+            pbar.close()
+    def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
+        """Find hard negatives for a batch of queries with error handling and retries."""
+        retry_count = 0
+        total_responses = len(self.response_pool)
+        # For very small datasets (testing), just use random sampling
+        if total_responses < 100:
+            all_negatives = []
+            for positive in positives:
+                available = [r for r in self.response_pool if r.strip() != positive.strip()]
+                if available:
+                    negatives = list(np.random.choice(
+                        available,
+                        size=min(self.neg_samples, len(available)),
+                        replace=False
+                    ))
+                else:
+                    negatives = []
+                # Pad with empty strings if needed
+                while len(negatives) < self.neg_samples:
+                    negatives.append("")
+                all_negatives.append(negatives)
+            return all_negatives
+        while retry_count < self.max_retries:
+            try:
+                # Get cached embeddings and ensure they're the right type
+                query_embeddings = np.vstack([
+                    self.query_embeddings_cache[q] for q in queries
+                ]).astype(np.float32)
+                if not query_embeddings.flags['C_CONTIGUOUS']:
+                    query_embeddings = np.ascontiguousarray(query_embeddings)
+                # Normalize embeddings
+                faiss.normalize_L2(query_embeddings)
+                k = 1 #min(total_responses - 1, max(3, self.neg_samples + 2))
+                #logger.debug(f"Searching with k={k} among {total_responses} responses")
+                assert query_embeddings.dtype == np.float32, f"Embeddings are not float32: {query_embeddings.dtype}" # Assertion here
+                try:
+                    distances, indices = self.index.search(query_embeddings, k)
+                except RuntimeError as e:
+                    logger.error(f"FAISS search failed: {e}")
+                    return self._fallback_random_negatives(queries, positives)
+                # Process results
+                all_negatives = []
+                for i, (query_indices, query, positive) in enumerate(zip(indices, queries, positives)):
+                    negatives = []
+                    positive_strip = positive.strip()
+                    # Filter valid indices and deduplicate
+                    seen = {positive_strip}
+                    for idx in query_indices:
+                        if idx >= 0 and idx < total_responses:
+                            candidate = self.response_pool[idx].strip()
+                            if candidate and candidate not in seen:  # Check for non-empty strings
+                                seen.add(candidate)
+                                negatives.append(candidate)
+                                if len(negatives) >= self.neg_samples:
+                                    break
+                    # If we don't have enough negatives, use random sampling from remaining pool
+                    if len(negatives) < self.neg_samples:
+                        available = [r for r in self.response_pool if r.strip() not in seen and r.strip()]
+                        if available:
+                            additional = np.random.choice(
+                                available,
+                                size=min(self.neg_samples - len(negatives), len(available)),
+                                replace=False
+                            )
+                            negatives.extend(additional)
+                    # Still pad with empty strings if needed
+                    while len(negatives) < self.neg_samples:
+                        negatives.append("")
+                    all_negatives.append(negatives)
+                return all_negatives
+            except Exception as e:
+                retry_count += 1
+                logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
+                if retry_count == self.max_retries:
+                    logger.error("Max retries reached for hard negative search")
+                    return [[] for _ in queries]  # Return empty lists on complete failure
+                gc.collect()
+                if tf.config.list_physical_devices('GPU'):
+                    tf.keras.backend.clear_session()
+    def _fallback_random_negatives(self, queries: List[str], positives: List[str]) -> List[List[str]]:
+        """Fallback to random sampling when similarity search fails."""
+        logger.warning("Falling back to random negative sampling")
+        all_negatives = []
+        for positive in positives:
+            available = [r for r in self.response_pool if r.strip() != positive.strip()]
+            negatives = list(np.random.choice(
+                available,
+                size=min(self.neg_samples, len(available)),
+                replace=False
+            )) if available else []
+            while len(negatives) < self.neg_samples:
+                negatives.append("")
+            all_negatives.append(negatives)
+        return all_negatives
+    def _prepare_batch(
+        self,
+        queries: List[str],
+        positives: List[str],
+        pad_to_batch_size: bool = False
+    ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor]]:
+        """Prepare a batch with dynamic padding and memory optimization."""
+        actual_size = len(queries)
+        # Handle padding if requested and needed
+        if pad_to_batch_size and actual_size < self.current_batch_size:
+            padding_needed = self.current_batch_size - actual_size
+            queries.extend([queries[0]] * padding_needed)
+            positives.extend([positives[0]] * padding_needed)
+            # Create attention mask for padded examples
+            attention_mask = tf.concat([
+                tf.ones((actual_size,), dtype=tf.float32),
+                tf.zeros((padding_needed,), dtype=tf.float32)
+            ], axis=0)
+        else:
+            attention_mask = None
+        try:
+            # Tokenize batch
+            q_tokens = self.tokenizer(
+                queries,
+                padding='max_length',
+                truncation=True,
+                max_length=self.max_length,
+                return_tensors='tf'
+            )
+            p_tokens = self.tokenizer(
+                positives,
+                padding='max_length',
+                truncation=True,
+                max_length=self.max_length,
+                return_tensors='tf'
+            )
+            return q_tokens['input_ids'], p_tokens['input_ids'], attention_mask
+        except Exception as e:
+            logger.error(f"Error preparing batch: {e}")
+            # Emergency memory cleanup
+            gc.collect()
+            if tf.config.list_physical_devices('GPU'):
+                tf.keras.backend.clear_session()
+            raise
+    def estimate_total_pairs(self, dialogues: List[dict]) -> int:
+        """Estimate total number of training pairs including hard negatives."""
+        base_pairs = sum(
+            len([
+                1 for i in range(len(d.get('turns', [])) - 1)
+                if (d['turns'][i].get('speaker') == 'user' and
+                    d['turns'][i+1].get('speaker') == 'assistant')
+            ])
+            for d in dialogues
+        )
+        # Account for hard negatives
+        return base_pairs * (1 + self.neg_samples)
+    def cleanup(self):
+        """Cleanup resources and memory."""
+        self.query_embeddings_cache.clear()
+        gc.collect()
+        if tf.config.list_physical_devices('GPU'):
+            tf.keras.backend.clear_session()

conversation_summarizer.py CHANGED Viewed

@@ -25,9 +25,9 @@ class DeviceAwareModel:
         self.strategy = None
         if self.device == 'GPU':
-            # Enable mixed precision for better performance
-            policy = tf.keras.mixed_precision.Policy('mixed_float16')
-            tf.keras.mixed_precision.set_global_policy(policy)
             # Setup distribution strategy for multi-GPU if available
             gpus = tf.config.list_physical_devices('GPU')

         self.strategy = None
         if self.device == 'GPU':
+            # # Enable mixed precision for better performance
+            # policy = tf.keras.mixed_precision.Policy('mixed_float16')
+            # tf.keras.mixed_precision.set_global_policy(policy)
             # Setup distribution strategy for multi-GPU if available
             gpus = tf.config.list_physical_devices('GPU')

environment_setup.py CHANGED Viewed

@@ -122,14 +122,14 @@ class EnvironmentSetup:
                     except (subprocess.SubprocessError, FileNotFoundError):
                         logger.warning("Could not detect specific GPU model")
-                    # Enable XLA
-                    tf.config.optimizer.set_jit(True)
-                    logger.info("XLA compilation enabled for Colab GPU")
-                    # Set mixed precision policy
-                    policy = tf.keras.mixed_precision.Policy('mixed_float16')
-                    tf.keras.mixed_precision.set_global_policy(policy)
-                    logger.info("Mixed precision training enabled (float16)")
                     strategy = tf.distribute.OneDeviceStrategy("/GPU:0")
                     return "GPU", strategy
@@ -187,20 +187,24 @@ class EnvironmentSetup:
                     stderr=subprocess.DEVNULL
                 ).decode('utf-8').strip()
-                if "T4" in gpu_name:
                     # T4 optimizations
                     logger.info("Optimizing for Colab T4 GPU")
-                    base_batch_size = min(base_batch_size * 2, 32)  # T4 can handle larger batches
                 elif "V100" in gpu_name:
                     # V100 optimizations
                     logger.info("Optimizing for Colab V100 GPU")
-                    base_batch_size = min(base_batch_size * 3, 48)  # V100 can handle even larger batches
             except (subprocess.SubprocessError, FileNotFoundError):
                 logger.warning("Could not detect specific GPU model, using default settings")
         elif self.device_type == "TPU":
             # TPU optimizations
-            base_batch_size = min(base_batch_size * 4, 64)  # TPUs can handle very large batches
             logger.info("Optimizing for Colab TPU")
         logger.info(f"Optimized batch size for Colab: {base_batch_size}")

                     except (subprocess.SubprocessError, FileNotFoundError):
                         logger.warning("Could not detect specific GPU model")
+                    # # Enable XLA
+                    # tf.config.optimizer.set_jit(True)
+                    # logger.info("XLA compilation enabled for Colab GPU")
+                    # # Set mixed precision policy
+                    # policy = tf.keras.mixed_precision.Policy('mixed_float16')
+                    # tf.keras.mixed_precision.set_global_policy(policy)
+                    # logger.info("Mixed precision training enabled (float16)")
                     strategy = tf.distribute.OneDeviceStrategy("/GPU:0")
                     return "GPU", strategy
                     stderr=subprocess.DEVNULL
                 ).decode('utf-8').strip()
+                if "A100" in gpu_name:
+                    # A100 optimizations - has 40GB or 80GB variants
+                    logger.info("Optimizing for Colab A100 GPU")
+                    base_batch_size = min(base_batch_size * 8, 128)  # A100 can handle much larger batches
+                elif "T4" in gpu_name:
                     # T4 optimizations
                     logger.info("Optimizing for Colab T4 GPU")
+                    base_batch_size = min(base_batch_size * 2, 32)
                 elif "V100" in gpu_name:
                     # V100 optimizations
                     logger.info("Optimizing for Colab V100 GPU")
+                    base_batch_size = min(base_batch_size * 3, 48)
             except (subprocess.SubprocessError, FileNotFoundError):
                 logger.warning("Could not detect specific GPU model, using default settings")
         elif self.device_type == "TPU":
             # TPU optimizations
+            base_batch_size = min(base_batch_size * 4, 64)
             logger.info("Optimizing for Colab TPU")
         logger.info(f"Optimized batch size for Colab: {base_batch_size}")

gpu_monitor.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import numpy as np
+import tensorflow as tf
+import faiss
+import json
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional, Generator
+from dataclasses import dataclass
+import threading
+from queue import Queue
+import gc
+try:
+    from tqdm.notebook import tqdm
+except ImportError:
+    from tqdm import tqdm
+@dataclass
+class GPUMemoryStats:
+    """Snapshot of GPU memory figures.
+
+    GPUMemoryMonitor.get_memory_stats fills ``used`` from TensorFlow's
+    'current' value and ``total`` from either a caller-supplied capacity or
+    TensorFlow's 'peak' value; ``free`` is ``total - used``, floored at 0.
+    """
+    total: int
+    used: int
+    free: int
+class GPUMemoryMonitor:
+    """Monitor GPU memory usage with safe CPU fallback."""
+    def __init__(self, total_memory: Optional[int] = None):
+        """Detect whether a GPU is visible to TensorFlow.
+
+        Args:
+            total_memory: Optional device capacity, in the same unit that
+                ``tf.config.experimental.get_memory_info`` reports. When
+                given, usage is measured against it. When omitted, the
+                'peak' value reported by TensorFlow is used as the total
+                (the original behaviour).
+        """
+        self.has_gpu = False
+        self.total_memory = total_memory
+        try:
+            gpus = tf.config.list_physical_devices('GPU')
+            self.has_gpu = len(gpus) > 0
+        except Exception:
+            # Deliberate best-effort: any detection failure means "no GPU".
+            # KeyboardInterrupt/SystemExit are no longer swallowed.
+            self.has_gpu = False
+    def get_memory_stats(self) -> Optional[GPUMemoryStats]:
+        """Get current GPU memory statistics.
+
+        Returns:
+            A GPUMemoryStats for device 'GPU:0', or None when no GPU was
+            detected or the memory query fails.
+        """
+        if not self.has_gpu:
+            return None
+        try:
+            memory_info = tf.config.experimental.get_memory_info('GPU:0')
+            used = memory_info['current']
+            # NOTE: without an explicit capacity, 'peak' (the highest usage
+            # seen so far) stands in for the total, so the usage ratio is
+            # relative to past peak usage rather than to device capacity.
+            if self.total_memory is None:
+                total = memory_info['peak']
+            else:
+                total = self.total_memory
+        except Exception:
+            return None
+        return GPUMemoryStats(
+            total=total,
+            used=used,
+            free=max(total - used, 0)
+        )
+    def get_memory_usage(self) -> float:
+        """Get current GPU memory usage as a fraction (used / total).
+
+        Returns 0.0 when there is no GPU, when stats are unavailable, or
+        when the total is 0.
+        """
+        if not self.has_gpu:
+            return 0.0
+        stats = self.get_memory_stats()
+        if stats is None or stats.total == 0:
+            return 0.0
+        return stats.used / stats.total
+    def should_reduce_batch_size(self) -> bool:
+        """Check if batch size should be reduced based on memory usage.
+
+        True only when a GPU is present and usage exceeds 0.90.
+        """
+        if not self.has_gpu:
+            return False
+        usage = self.get_memory_usage()
+        return usage > 0.90
+    def can_increase_batch_size(self) -> bool:
+        """Check if batch size can be increased based on memory usage.
+
+        Always True without a GPU; otherwise True when usage is below 0.70.
+        """
+        if not self.has_gpu:
+            return True  # Allow increase on CPU
+        usage = self.get_memory_usage()
+        return usage < 0.70

run_model_train.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from chatbot_model import RetrievalChatbot, ChatbotConfig
 from environment_setup import EnvironmentSetup
 from response_quality_checker import ResponseQualityChecker
@@ -33,11 +34,12 @@ def run_interactive_chat(chatbot, quality_checker):
 def main():
     # Initialize environment
     env = EnvironmentSetup()
     env.initialize()
-    DEBUG_SAMPLES = 5
-    EPOCHS = 1 if DEBUG_SAMPLES else 20
     TRAINING_DATA_PATH = 'processed_outputs/batch_group_0010.json'
     # Optimize batch size for Colab
@@ -54,23 +56,16 @@ def main():
     dialogues = RetrievalChatbot.load_training_data(data_path=TRAINING_DATA_PATH, debug_samples=DEBUG_SAMPLES)
     # Initialize chatbot and verify FAISS index
-    with env.strategy.scope():
-        chatbot = RetrievalChatbot(config, dialogues)
     chatbot.verify_faiss_index()
-    # Prepare dataset
-    logger.info("Preparing dataset...")
-    q_tensor, p_tensor = chatbot.prepare_dataset(dialogues)
-    quality_checker = ResponseQualityChecker(chatbot=chatbot)
-    # Train model
-    logger.info("Starting training...")
-    chatbot.train(
-        q_pad=q_tensor,
-        p_pad=p_tensor,
         epochs=EPOCHS,
         batch_size=batch_size,
-        validation_split=0.2,
     )
     # Save final model

+import tensorflow as tf
 from chatbot_model import RetrievalChatbot, ChatbotConfig
 from environment_setup import EnvironmentSetup
 from response_quality_checker import ResponseQualityChecker
 def main():
     # Initialize environment
+    tf.keras.backend.clear_session()
     env = EnvironmentSetup()
     env.initialize()
+    DEBUG_SAMPLES = 15
+    EPOCHS = 5 if DEBUG_SAMPLES else 20
     TRAINING_DATA_PATH = 'processed_outputs/batch_group_0010.json'
     # Optimize batch size for Colab
     dialogues = RetrievalChatbot.load_training_data(data_path=TRAINING_DATA_PATH, debug_samples=DEBUG_SAMPLES)
     # Initialize chatbot and verify FAISS index
+    #with env.strategy.scope():
+    chatbot = RetrievalChatbot(config, dialogues)
+    chatbot.build_models()
     chatbot.verify_faiss_index()
+    chatbot.train_streaming(
+        dialogues=dialogues,
         epochs=EPOCHS,
         batch_size=batch_size,
+        use_lr_schedule=True,
     )
     # Save final model