JoeArmani committed
Commit 74af405 · 1 Parent(s): 2183656

data processing pipeline

Files changed (4):
  1. chatbot_model.py +133 -637
  2. requirements.txt +1 -0
  3. run_data_preparer.py +182 -0
  4. tf_data_pipeline.py +734 -0
chatbot_model.py CHANGED
@@ -2,7 +2,7 @@ import time
 from transformers import TFAutoModel, AutoTokenizer
 import tensorflow as tf
 import numpy as np
-from typing import Generator, List, Tuple, Dict, Optional, Union, Any
 import math
 from dataclasses import dataclass
 import json
@@ -10,6 +10,7 @@ from pathlib import Path
 import datetime
 import faiss
 import gc
 from response_quality_checker import ResponseQualityChecker
 from cross_encoder_reranker import CrossEncoderReranker
 from conversation_summarizer import DeviceAwareModel, Summarizer
@@ -24,14 +25,12 @@ logger = config_logger(__name__)
 @dataclass
 class ChatbotConfig:
     """Configuration for the RetrievalChatbot."""
-    vocab_size: int = 30526  # DistilBERT vocab size + special tokens
     max_context_token_limit: int = 512
     embedding_dim: int = 768
     encoder_units: int = 256
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
     l2_reg_weight: float = 0.001
-    margin: float = 0.3
     learning_rate: float = 0.001
     min_text_length: int = 3
     max_context_turns: int = 5
@@ -39,16 +38,19 @@ class ChatbotConfig:
     pretrained_model: str = 'distilbert-base-uncased'
     dtype: str = 'float32'
     freeze_embeddings: bool = False
-    embedding_batch_size: int = 128
-    # Additional configurations can be added here

-    def to_dict(self) -> dict:
         """Convert config to dictionary."""
-        return {k: str(v) if isinstance(v, Path) else v
                 for k, v in self.__dict__.items()}

     @classmethod
-    def from_dict(cls, config_dict: dict) -> 'ChatbotConfig':
         """Create config from dictionary."""
         return cls(**{k: v for k, v in config_dict.items()
                       if k in cls.__dataclass_fields__})
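
(For orientation: to_dict/from_dict above are a plain dataclass round trip. A minimal sketch of persisting and restoring the config, assuming only the API shown in this diff; the file name is illustrative:)

    import json
    from chatbot_model import ChatbotConfig

    config = ChatbotConfig(embedding_dim=768)
    with open("chatbot_config.json", "w") as f:
        json.dump(config.to_dict(), f)  # Path values are stringified by to_dict()
    with open("chatbot_config.json") as f:
        restored = ChatbotConfig.from_dict(json.load(f))  # unknown keys are filtered out
    assert restored.embedding_dim == 768
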
@@ -59,24 +61,17 @@ class EncoderModel(tf.keras.Model):
         self,
         config: ChatbotConfig,
         name: str = "encoder",
-        shared_weights: bool = False,
         **kwargs
     ):
         super().__init__(name=name, **kwargs)
         self.config = config
-        self.shared_weights = shared_weights

         # Load pretrained model
         self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)

-        # Freeze pretrained weights if specified
-        self.pretrained.distilbert.embeddings.trainable = False
-        for i, layer_module in enumerate(self.pretrained.distilbert.transformer.layer):
-            if i < 1:  # freeze first layer
-                layer_module.trainable = False
-            else:
-                layer_module.trainable = True
-
         # Pooling layer (Global Average Pooling)
         self.pooler = tf.keras.layers.GlobalAveragePooling1D()
@@ -90,9 +85,27 @@
         # Dropout and normalization
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
         self.normalize = tf.keras.layers.Lambda(
-            lambda x: tf.nn.l2_normalize(x, axis=1)
         )

     def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
         """Forward pass."""
         # Get pretrained embeddings
@@ -112,46 +125,33 @@
         config = super().get_config()
         config.update({
             "config": self.config.to_dict(),
-            "shared_weights": self.shared_weights,
             "name": self.name
         })
         return config

 class RetrievalChatbot(DeviceAwareModel):
     """Retrieval-based chatbot using pretrained embeddings and FAISS for similarity search."""
-    def __init__(self, config: ChatbotConfig, dialogues: List[dict] = [], device: str = None,
-                 strategy=None, reranker: Optional[CrossEncoderReranker] = None,
-                 summarizer: Optional[Summarizer] = None
-                 ):
         self.config = config
         self.strategy = strategy
-        self.setup_device(device)
-
-        if reranker is None:
-            logger.info("Creating default CrossEncoderReranker...")
-            reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
-        self.reranker = reranker
-
-        if summarizer is None:
-            logger.info("Creating default Summarizer...")
-            summarizer = Summarizer(device=self.device)
-        self.summarizer = summarizer
-
-        # Special tokens
-        self.special_tokens = {
-            "user": "<USER>",
-            "assistant": "<ASSISTANT>",
-            "context": "<CONTEXT>",
-            "sep": "<SEP>"
-        }
-
-        # Initialize tokenizer and add special tokens
-        self.tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
-        self.tokenizer.add_special_tokens(
-            {'additional_special_tokens': list(self.special_tokens.values())}
-        )

         self.memory_monitor = GPUMemoryMonitor()
         self.min_batch_size = 8
         self.max_batch_size = 128
         self.current_batch_size = 32
@@ -166,9 +166,62 @@ class RetrievalChatbot(DeviceAwareModel):
             "train_metrics": {},
             "val_metrics": {}
         }
-
     def build_models(self):
-        """Initialize the shared encoder."""
         logger.info("Building encoder model...")
         tf.keras.backend.clear_session()

@@ -176,6 +229,7 @@ class RetrievalChatbot(DeviceAwareModel):
         self.encoder = EncoderModel(
             self.config,
             name="shared_encoder",
         )

         # Resize token embeddings after adding special tokens
@@ -183,31 +237,14 @@ class RetrievalChatbot(DeviceAwareModel):
         self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
         logger.info(f"Token embeddings resized to: {new_vocab_size}")

-        # Initialize FAISS index (moved here from __init__)
         self._initialize_faiss()
-        # Compute embeddings after FAISS is initialized and moved
-        self._compute_and_index_embeddings()

-        # Try different ways to get embedding dimension
-        try:
-            # First try: from config
-            embedding_dim = self.encoder.pretrained.config.dim
-            logger.info("Got embedding dim from config")
-        except AttributeError:
-            try:
-                # Second try: from word embeddings
-                embedding_dim = self.encoder.pretrained.distilbert.embeddings.word_embeddings.embedding_dim
-                logger.info("Got embedding dim from word embeddings")
-            except AttributeError:
-                try:
-                    # Third try: from embeddings module
-                    embedding_dim = self.encoder.pretrained.distilbert.embeddings.embedding_dim
-                    logger.info("Got embedding dim from embeddings module")
-                except AttributeError:
-                    # Fallback to config value
-                    embedding_dim = self.config.embedding_dim
-                    logger.info("Using config embedding dim")

         vocab_size = len(self.tokenizer)

         logger.info(f"Encoder Embedding Dimension: {embedding_dim}")
@@ -217,29 +254,6 @@ class RetrievalChatbot(DeviceAwareModel):
         else:
             logger.error("Vocabulary size is less than embedding dimension.")
             raise ValueError("Vocabulary size is less than embedding dimension.")
-    def _collect_responses(self, dialogues: List[dict]) -> Tuple[List[str], List[str]]:
-        """Collect all unique responses from dialogues."""
-        logger.info("Collecting responses from dialogues...")
-
-        responses = []
-        try:
-            progress_bar = tqdm(dialogues, desc="Collecting assistant responses")
-        except ImportError:
-            progress_bar = dialogues
-            logger.info("Progress bar disabled - continuing without visual progress")
-
-        for dialogue in progress_bar:
-            turns = dialogue.get('turns', [])
-            for turn in turns:
-                if turn.get('speaker') == 'assistant' and 'text' in turn:
-                    responses.append(turn['text'].strip())
-
-        # Remove duplicates
-        unique_responses = list(set(responses))
-        logger.info(f"Found {len(unique_responses)} unique responses.")
-
-        return responses, unique_responses

     def _adjust_batch_size(self) -> None:
         """Dynamically adjust batch size based on GPU memory usage."""
@@ -288,6 +302,7 @@ class RetrievalChatbot(DeviceAwareModel):
             logger.warning(f"Using CPU due to GPU initialization error: {e}")

         # TODO: figure out bug with faiss-gpu
         try:
             # Create appropriate index based on dataset size
             if len(self.unique_responses) < 1000:
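
(Context for the size-based index choice above: once response embeddings are L2-normalized, inner-product search is equivalent to cosine similarity, and an exact IndexFlatIP is usually sufficient for small pools. A minimal sketch of that pattern, with illustrative sizes not taken from this commit:)

    import faiss
    import numpy as np

    dim = 768
    vectors = np.random.rand(500, dim).astype(np.float32)
    faiss.normalize_L2(vectors)        # in-place; inner product == cosine afterwards

    index = faiss.IndexFlatIP(dim)     # exact search, no training step needed
    index.add(vectors)
    scores, ids = index.search(vectors[:1], 5)   # top-5 most similar responses
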
@@ -860,33 +875,33 @@
         logger.info(f"Models and tokenizer loaded from {load_dir}.")
         return chatbot

-    @staticmethod
-    def load_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
-        """
-        Load training data from a JSON file.

-        Args:
-            data_path (Union[str, Path]): Path to the JSON file containing dialogues.
-            debug_samples (Optional[int]): Number of samples to load for debugging.

-        Returns:
-            List[dict]: List of dialogue dictionaries.
-        """
-        logger.info(f"Loading training data from {data_path}...")
-        data_path = Path(data_path)
-        if not data_path.exists():
-            logger.error(f"Data file {data_path} does not exist.")
-            return []

-        with open(data_path, 'r', encoding='utf-8') as f:
-            dialogues = json.load(f)

-        if debug_samples is not None:
-            dialogues = dialogues[:debug_samples]
-            logger.info(f"Debug mode: Limited to {debug_samples} dialogues")

-        logger.info(f"Loaded {len(dialogues)} dialogues.")
-        return dialogues

     def train_streaming(
         self,
@@ -1336,522 +1351,3 @@ class RetrievalChatbot(DeviceAwareModel):

         conversation_parts.append(f"{self.special_tokens['user']} {query}")
         return "\n".join(conversation_parts)
-
-class TFDataPipeline:
-    def __init__(
-        self,
-        embedding_batch_size,
-        tokenizer,
-        encoder,
-        index,
-        response_pool,
-        max_length: int,
-        neg_samples: int,
-    ):
-        self.embedding_batch_size = embedding_batch_size
-        self.tokenizer = tokenizer
-        self.encoder = encoder
-        self.index = index  # CPU version of the index
-        self.response_pool = response_pool
-        self.max_length = max_length
-        self.neg_samples = neg_samples
-        self.embedding_batch_size = 16 if len(response_pool) < 100 else 64
-        self.search_batch_size = 8 if len(response_pool) < 100 else 32
-        self.max_batch_size = 32 if len(response_pool) < 100 else 256
-        self.memory_monitor = GPUMemoryMonitor()
-        self.max_retries = 3
-
-        # In-memory cache for embeddings
-        self.query_embeddings_cache = {}
-
-    def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
-        """Extract query-response pairs from a dialogue."""
-        pairs = []
-        turns = dialogue.get('turns', [])
-
-        for i in range(len(turns) - 1):
-            current_turn = turns[i]
-            next_turn = turns[i+1]
-
-            if (current_turn.get('speaker') == 'user' and
-                next_turn.get('speaker') == 'assistant' and
-                'text' in current_turn and
-                'text' in next_turn):
-
-                query = current_turn['text'].strip()
-                positive = next_turn['text'].strip()
-                pairs.append((query, positive))
-
-        return pairs
-
-    def estimate_total_pairs(self, dialogues: List[dict]) -> int:
-        """Estimate total number of training pairs including hard negatives."""
-        base_pairs = sum(
-            len([
-                1 for i in range(len(d.get('turns', [])) - 1)
-                if (d['turns'][i].get('speaker') == 'user' and
-                    d['turns'][i+1].get('speaker') == 'assistant')
-            ])
-            for d in dialogues
-        )
-        # Account for hard negatives
-        return base_pairs * (1 + self.neg_samples)
-
-    def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
-        """Find hard negatives for a batch of queries with error handling and retries."""
-        retry_count = 0
-        total_responses = len(self.response_pool)
-
-        while retry_count < self.max_retries:
-            try:
-                query_embeddings = np.vstack([
-                    self.query_embeddings_cache[q] for q in queries
-                ]).astype(np.float32)
-
-                query_embeddings = np.ascontiguousarray(query_embeddings)
-                faiss.normalize_L2(query_embeddings)
-
-                k = 1  # TODO: try higher k for better results
-                #logger.debug(f"Searching with k={k} among {total_responses} responses")
-
-                distances, indices = self.index.search(query_embeddings, k)
-
-                all_negatives = []
-                for query_indices, query, positive in zip(indices, queries, positives):
-                    negatives = []
-                    positive_strip = positive.strip()
-                    seen = {positive_strip}
-
-                    for idx in query_indices:
-                        if idx >= 0 and idx < total_responses:
-                            candidate = self.response_pool[idx].strip()
-                            if candidate and candidate not in seen:
-                                seen.add(candidate)
-                                negatives.append(candidate)
-                                if len(negatives) >= self.neg_samples:
-                                    break
-
-                    # Pad with a special empty negative if necessary
-                    while len(negatives) < self.neg_samples:
-                        negatives.append("<EMPTY_NEGATIVE>")  # Use a special token
-
-                    all_negatives.append(negatives)
-
-                return all_negatives
-
-            except Exception as e:
-                retry_count += 1
-                logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
-                if retry_count == self.max_retries:
-                    logger.error("Max retries reached for hard negative search")
-                    return [["<EMPTY_NEGATIVE>"] * self.neg_samples for _ in queries]  # Return empty negatives for all queries
-                gc.collect()
-                if tf.config.list_physical_devices('GPU'):
-                    tf.keras.backend.clear_session()
-
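
(The method above is the hard-negative miner this commit later moves into tf_data_pipeline.py: nearest FAISS neighbors of the query that are not the gold response become negatives. A compact sketch of the core idea, assuming a normalized index and a parallel response list; the function name is illustrative:)

    def mine_negatives(index, response_pool, query_vec, positive, k=5, n_neg=3):
        # query_vec: shape [1, dim], already L2-normalized
        _, ids = index.search(query_vec, k)
        negatives = []
        for idx in ids[0]:
            if idx < 0:                      # FAISS pads with -1 when fewer than k hits exist
                continue
            cand = response_pool[idx].strip()
            if cand and cand != positive.strip() and cand not in negatives:
                negatives.append(cand)
            if len(negatives) == n_neg:
                break
        return negatives + ["<EMPTY_NEGATIVE>"] * (n_neg - len(negatives))

(Note that with k=1, as in the diff, at most one candidate is inspected per query, so most slots end up padded with <EMPTY_NEGATIVE>; the inline TODO about trying a higher k addresses exactly this.)
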
-    def _tokenize_negatives_tf(self, negatives):
-        """Tokenizes negatives using tf.py_function."""
-        # Handle the case where negatives is an empty tensor
-        if tf.size(negatives) == 0:
-            return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)
-
-        # Convert EagerTensor to a list of strings
-        negatives_list = []
-        for neg_list in negatives.numpy():
-            decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg]  # Filter out empty strings
-            negatives_list.append(decoded_negs)
-
-        # Flatten the list of lists
-        flattened_negatives = [neg for sublist in negatives_list for neg in sublist]
-
-        # Tokenize the flattened negatives
-        if flattened_negatives:
-            n_tokens = self.tokenizer(
-                flattened_negatives,
-                padding='max_length',
-                truncation=True,
-                max_length=self.max_length,
-                return_tensors='tf'
-            )
-            # Reshape the tokens
-            n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [-1, self.neg_samples, self.max_length])
-            return n_tokens_reshaped
-        else:
-            return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)
-
-    def _compute_embeddings(self, queries: List[str]) -> None:
-        """Computes and caches embeddings for new queries."""
-        new_queries = [q for q in queries if q not in self.query_embeddings_cache]
-        if not new_queries:
-            return  # All queries already cached
-
-        new_embeddings = []
-        for i in range(0, len(new_queries), self.embedding_batch_size):
-            batch_queries = new_queries[i:i + self.embedding_batch_size]
-
-            encoded = self.tokenizer(
-                batch_queries,
-                padding=True,
-                truncation=True,
-                max_length=self.max_length,
-                return_tensors='tf'
-            )
-
-            # Compute embeddings on CPU
-            with tf.device('/CPU:0'):
-                batch_embeddings = self.encoder(encoded['input_ids'], training=False).numpy()
-
-            new_embeddings.extend(batch_embeddings)
-
-        # Update cache with new embeddings
-        for query, emb in zip(new_queries, new_embeddings):
-            self.query_embeddings_cache[query] = emb
-
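
(The caching in _compute_embeddings is a plain memo dict keyed by the raw query string, so repeated queries across batches and epochs are encoded once. A toy illustration of the same pattern, detached from TF; encode_fn stands in for the encoder and is an assumption of this sketch:)

    cache = {}

    def embed_all(texts, encode_fn):
        missing = [t for t in texts if t not in cache]
        if missing:
            for text, vec in zip(missing, encode_fn(missing)):
                cache[text] = vec          # memoize by exact string
        return [cache[t] for t in texts]
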
-    def data_generator(self, dialogues: List[dict]) -> Generator[Tuple[str, str, List[str]], None, None]:
-        """
-        Generates training examples: (query, positive, hard_negatives).
-        The outer loop is wrapped with tqdm for progress tracking.
-        """
-        total_dialogues = len(dialogues)
-        logger.debug(f"Total dialogues to process: {total_dialogues}")
-
-        # Initialize tqdm progress bar
-        with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
-            for dialogue in dialogues:
-                pairs = self._extract_pairs_from_dialogue(dialogue)
-                for query, positive in pairs:
-                    # Ensure embeddings are computed, find hard negatives, etc.
-                    self._compute_embeddings([query])
-                    hard_negatives = self._find_hard_negatives_batch([query], [positive])[0]
-                    yield (query, positive, hard_negatives)
-                pbar.update(1)
-
-    def _prepare_batch(self, queries: tf.Tensor, positives: tf.Tensor, negatives: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
-        """Prepares a batch of data for training."""
-
-        # Convert EagerTensors to lists of strings
-        queries_list = [query.decode("utf-8") for query in queries.numpy()]
-        positives_list = [pos.decode("utf-8") for pos in positives.numpy()]
-
-        # Tokenize queries and positives
-        q_tokens = self.tokenizer(queries_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
-        p_tokens = self.tokenizer(positives_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
-
-        # Decode negatives and ensure they are lists of strings
-        negatives_list = []
-        for neg_list in negatives.numpy():
-            decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg]  # Filter out empty strings
-            negatives_list.append(decoded_negs)
-
-        # Flatten negatives for tokenization if there are any valid negatives
-        flattened_negatives = [neg for sublist in negatives_list for neg in sublist if neg]
-
-        # Tokenize negatives if there are any
-        n_tokens_reshaped = None
-        if flattened_negatives:
-            n_tokens = self.tokenizer(flattened_negatives, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
-
-            # Reshape n_tokens to match the expected shape based on the number of negatives per query
-            # This part may need adjustment if the number of negatives varies per query
-            n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [len(queries_list), -1, self.max_length])
-        else:
-            # Create a placeholder tensor for the case where there are no negatives
-            n_tokens_reshaped = tf.zeros([len(queries_list), 0, self.max_length], dtype=tf.int32)
-
-        # Ensure n_tokens_reshaped has a consistent shape even when there are no negatives
-        # Adjust shape to [batch_size, num_neg_samples, max_length]
-        if n_tokens_reshaped.shape[1] != self.neg_samples:
-            # Pad or truncate the second dimension to match neg_samples
-            padding = tf.zeros([len(queries_list), tf.maximum(0, self.neg_samples - n_tokens_reshaped.shape[1]), self.max_length], dtype=tf.int32)
-            n_tokens_reshaped = tf.concat([n_tokens_reshaped, padding], axis=1)
-            n_tokens_reshaped = n_tokens_reshaped[:, :self.neg_samples, :]
-
-        # Concatenate the positive and negative examples along the 'neg_samples' dimension
-        combined_p_n_tokens = tf.concat([tf.expand_dims(p_tokens['input_ids'], axis=1), n_tokens_reshaped], axis=1)
-
-        return q_tokens['input_ids'], combined_p_n_tokens
-
-    def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
-        """
-        Creates a tf.data.Dataset for streaming training that yields
-        (input_ids_query, input_ids_positive, input_ids_negatives).
-        """
-        # 1) Start with a generator dataset
-        dataset = tf.data.Dataset.from_generator(
-            lambda: self.data_generator(dialogues),
-            output_signature=(
-                tf.TensorSpec(shape=(), dtype=tf.string),       # Query (single string)
-                tf.TensorSpec(shape=(), dtype=tf.string),       # Positive (single string)
-                tf.TensorSpec(shape=(None,), dtype=tf.string)   # Hard Negatives (list of strings)
-            )
-        )
-
-        # 2) Batch the raw strings
-        dataset = dataset.batch(batch_size)
-
-        # 3) Now map them through a tokenize step (via py_function)
-        dataset = dataset.map(
-            lambda q, p, n: self._tokenize_triple(q, p, n),
-            num_parallel_calls=1  # tf.data.AUTOTUNE
-        )
-
-        dataset = dataset.prefetch(tf.data.AUTOTUNE)
-        return dataset
-
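
(Consuming the dataset built by get_tf_dataset is ordinary tf.data iteration; the shapes follow the set_shape calls in _tokenize_triple below. A hedged usage sketch, with pipeline and dialogues assumed to exist:)

    dataset = pipeline.get_tf_dataset(dialogues, batch_size=32)
    for q_ids, p_ids, n_ids in dataset.take(1):
        # q_ids: [batch, max_length], p_ids: [batch, max_length]
        # n_ids: [batch, neg_samples, max_length]
        print(q_ids.shape, p_ids.shape, n_ids.shape)

(Because raw strings are batched before the py_function tokenizer map, the HF tokenizer sees whole batches rather than single examples, keeping the Python round trips to one per batch.)
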
-    def _tokenize_triple(
-        self,
-        q: tf.Tensor,
-        p: tf.Tensor,
-        n: tf.Tensor
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
-        """
-        Wraps a Python function via tf.py_function to convert tf.Tensors of strings
-        -> Python lists of strings -> HF tokenizer -> Tensors of IDs.
-
-        q is shape [batch_size], p is shape [batch_size],
-        n is shape [batch_size, neg_samples] (i.e., each row is a list of negatives).
-        """
-        # Use tf.py_function with limited parallelism
-        q_ids, p_ids, n_ids = tf.py_function(
-            func=self._tokenize_triple_py,
-            inp=[q, p, n, tf.constant(self.max_length), tf.constant(self.neg_samples)],
-            Tout=[tf.int32, tf.int32, tf.int32]
-        )
-
-        # Manually set shape information (py_function erases static shapes)
-        q_ids.set_shape([None, self.max_length])                    # [batch_size, max_length]
-        p_ids.set_shape([None, self.max_length])                    # [batch_size, max_length]
-        n_ids.set_shape([None, self.neg_samples, self.max_length])  # [batch_size, neg_samples, max_length]
-
-        return q_ids, p_ids, n_ids
-    def _tokenize_triple_py(
-        self,
-        q: tf.Tensor,
-        p: tf.Tensor,
-        n: tf.Tensor,
-        max_len: tf.Tensor,
-        neg_samples: tf.Tensor
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """
-        Python function that:
-          - Decodes each tf.string Tensor to a Python list of strings
-          - Calls the HF tokenizer
-          - Reshapes negatives
-          - Returns np.array of int32s for (q_ids, p_ids, n_ids).
-
-        q: shape [batch_size], p: shape [batch_size]
-        n: shape [batch_size, neg_samples]
-        max_len: scalar int
-        neg_samples: scalar int
-        """
-        max_len = int(max_len.numpy())  # Convert to Python int
-        neg_samples = int(neg_samples.numpy())
-
-        # 1) Convert Tensors -> Python lists of strings
-        q_list = [q_i.decode("utf-8") for q_i in q.numpy()]  # shape [batch_size]
-        p_list = [p_i.decode("utf-8") for p_i in p.numpy()]  # shape [batch_size]
-
-        # shape [batch_size, neg_samples], decode each row
-        n_list = []
-        for row in n.numpy():
-            # row is shape [neg_samples], each is a tf.string
-            decoded = [neg.decode("utf-8") for neg in row]
-            n_list.append(decoded)
-
-        # 2) Tokenize queries & positives
-        q_enc = self.tokenizer(
-            q_list,
-            padding="max_length",
-            truncation=True,
-            max_length=max_len,
-            return_tensors="np"
-        )
-        p_enc = self.tokenizer(
-            p_list,
-            padding="max_length",
-            truncation=True,
-            max_length=max_len,
-            return_tensors="np"
-        )
-
-        # 3) Tokenize negatives
-        # Flatten [batch_size, neg_samples] -> single list
-        flattened_negatives = [neg for row in n_list for neg in row]
-        if len(flattened_negatives) == 0:
-            # No negatives at all: return a zero array
-            n_ids = np.zeros((len(q_list), neg_samples, max_len), dtype=np.int32)
-        else:
-            n_enc = self.tokenizer(
-                flattened_negatives,
-                padding="max_length",
-                truncation=True,
-                max_length=max_len,
-                return_tensors="np"
-            )
-            # shape [batch_size * neg_samples, max_len]
-            n_input_ids = n_enc["input_ids"]
-
-            # We want to reshape to [batch_size, neg_samples, max_len]
-            # Handle cases where there might be fewer negatives
-            batch_size = len(q_list)
-            n_ids_list = []
-            for i in range(batch_size):
-                start_idx = i * neg_samples
-                end_idx = start_idx + neg_samples
-                row_negs = n_input_ids[start_idx:end_idx]
-
-                # If fewer negatives, pad with zeros
-                if row_negs.shape[0] < neg_samples:
-                    deficit = neg_samples - row_negs.shape[0]
-                    pad_arr = np.zeros((deficit, max_len), dtype=np.int32)
-                    row_negs = np.concatenate([row_negs, pad_arr], axis=0)
-
-                n_ids_list.append(row_negs)
-
-            # stack them -> shape [batch_size, neg_samples, max_len]
-            n_ids = np.stack(n_ids_list, axis=0)
-
-        # 4) Return as np.int32 arrays
-        q_ids = q_enc["input_ids"].astype(np.int32)  # shape [batch_size, max_len]
-        p_ids = p_enc["input_ids"].astype(np.int32)  # shape [batch_size, max_len]
-        n_ids = n_ids.astype(np.int32)               # shape [batch_size, neg_samples, max_len]
-
-        return q_ids, p_ids, n_ids
 from transformers import TFAutoModel, AutoTokenizer
 import tensorflow as tf
 import numpy as np
+from typing import List, Tuple, Dict, Optional, Union, Any
 import math
 from dataclasses import dataclass
 import json
 import datetime
 import faiss
 import gc
+from tf_data_pipeline import TFDataPipeline
 from response_quality_checker import ResponseQualityChecker
 from cross_encoder_reranker import CrossEncoderReranker
 from conversation_summarizer import DeviceAwareModel, Summarizer

 @dataclass
 class ChatbotConfig:
     """Configuration for the RetrievalChatbot."""
     max_context_token_limit: int = 512
     embedding_dim: int = 768
     encoder_units: int = 256
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
     l2_reg_weight: float = 0.001
     learning_rate: float = 0.001
     min_text_length: int = 3
     max_context_turns: int = 5
     pretrained_model: str = 'distilbert-base-uncased'
     dtype: str = 'float32'
     freeze_embeddings: bool = False
+    embedding_batch_size: int = 64
+    search_batch_size: int = 64
+    max_batch_size: int = 64
+    neg_samples: int = 3
+    max_retries: int = 3

+    def to_dict(self) -> Dict:
         """Convert config to dictionary."""
+        return {k: (str(v) if isinstance(v, Path) else v)
                 for k, v in self.__dict__.items()}

     @classmethod
+    def from_dict(cls, config_dict: Dict) -> 'ChatbotConfig':
         """Create config from dictionary."""
         return cls(**{k: v for k, v in config_dict.items()
                       if k in cls.__dataclass_fields__})

         self,
         config: ChatbotConfig,
         name: str = "encoder",
         **kwargs
     ):
         super().__init__(name=name, **kwargs)
         self.config = config

         # Load pretrained model
         self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)

+        # Freeze layers based on config
+        self._freeze_layers()
+
         # Pooling layer (Global Average Pooling)
         self.pooler = tf.keras.layers.GlobalAveragePooling1D()

         # Dropout and normalization
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
         self.normalize = tf.keras.layers.Lambda(
+            lambda x: tf.nn.l2_normalize(x, axis=1),
+            name="l2_normalize"
         )

+    def _freeze_layers(self):
+        """Freeze layers of the pretrained model based on configuration."""
+        if self.config.freeze_embeddings:
+            self.pretrained.trainable = False
+            logger.info("All pretrained layers frozen.")
+        else:
+            # Freeze only the first 'n' transformer layers
+            for i, layer in enumerate(self.pretrained.layers):
+                if isinstance(layer, tf.keras.layers.Layer):
+                    if hasattr(layer, 'trainable'):
+                        # Freeze the first transformer block
+                        if i < 1:
+                            layer.trainable = False
+                            logger.info(f"Layer {i} frozen.")
+                        else:
+                            layer.trainable = True
+
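(A quick way to check what _freeze_layers actually froze is to compare trainable parameter counts; a hedged sketch using only names from the diff:)

    encoder = EncoderModel(config)
    trainable = sum(int(tf.size(w)) for w in encoder.pretrained.trainable_weights)
    total = sum(int(tf.size(w)) for w in encoder.pretrained.weights)
    print(f"trainable: {trainable:,} / {total:,}")

(Worth noting: on a HF TFAutoModel, .layers typically exposes the main transformer layer as a single child, so the i < 1 branch may freeze the entire stack, embeddings included, unlike the per-block loop the removed code used via pretrained.distilbert.transformer.layer.)
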
     def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
         """Forward pass."""
         # Get pretrained embeddings

         config = super().get_config()
         config.update({
             "config": self.config.to_dict(),
             "name": self.name
         })
         return config

 class RetrievalChatbot(DeviceAwareModel):
     """Retrieval-based chatbot using pretrained embeddings and FAISS for similarity search."""
+    def __init__(
+        self,
+        config: ChatbotConfig,
+        dialogues: List[dict] = [],
+        device: str = None,
+        strategy=None,
+        reranker: Optional[CrossEncoderReranker] = None,
+        summarizer: Optional[Summarizer] = None
+    ):
+        super().__init__()
         self.config = config
         self.strategy = strategy
+        self.device = device or self._setup_default_device()

+        # Initialize reranker, summarizer, tokenizer, and memory monitor
+        self.reranker = reranker or self._initialize_reranker()
+        self.summarizer = summarizer or self._initialize_summarizer()
+        self.tokenizer = self._initialize_tokenizer()
         self.memory_monitor = GPUMemoryMonitor()
+
+        # Initialize models
         self.min_batch_size = 8
         self.max_batch_size = 128
         self.current_batch_size = 32

             "train_metrics": {},
             "val_metrics": {}
         }
+
+    def _setup_default_device(self) -> str:
+        """Set up default device if none is provided."""
+        if tf.config.list_physical_devices('GPU'):
+            return 'GPU'
+        else:
+            return 'CPU'
+
+    def _initialize_reranker(self) -> CrossEncoderReranker:
+        """Initialize the CrossEncoderReranker."""
+        logger.info("Initializing default CrossEncoderReranker...")
+        return CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
+
+    def _initialize_summarizer(self) -> Summarizer:
+        """Initialize the Summarizer."""
+        logger.info("Initializing default Summarizer...")
+        return Summarizer(device=self.device)
+
+    def _initialize_tokenizer(self) -> AutoTokenizer:
+        """Initialize the tokenizer and add special tokens."""
+        logger.info("Initializing tokenizer and adding special tokens...")
+        tokenizer = AutoTokenizer.from_pretrained(self.config.pretrained_model)
+        special_tokens = {
+            "user": "<USER>",
+            "assistant": "<ASSISTANT>",
+            "context": "<CONTEXT>",
+            "sep": "<SEP>"
+        }
+        tokenizer.add_special_tokens(
+            {'additional_special_tokens': list(special_tokens.values())}
+        )
+        return tokenizer
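
(The added special tokens mark speaker roles when a conversation is flattened into one string, as the conversation_parts code at the end of the file shows. A small usage sketch with the tokenizer built above; the example text is illustrative:)

    text = "<USER> How do I reset my password? <ASSISTANT> Click 'Forgot password' on the login page."
    ids = tokenizer(text)["input_ids"]
    # The role markers survive as single token IDs instead of being split into word pieces.
    print(tokenizer.convert_ids_to_tokens(ids)[:5])

(This is also why build_models below resizes the embedding matrix with resize_token_embeddings: the four new tokens need rows in the embedding table.)
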
201
+
202
+ def _collect_responses(self, dialogues: List[dict]) -> Tuple[List[str], List[str]]:
203
+ """
204
+ Collect unique responses from dialogues.
205
+ Returns:
206
+ response_pool: List of all possible responses.
207
+ unique_responses: List of unique responses.
208
+ """
209
+ logger.info("Collecting unique responses from dialogues...")
210
+ responses = set()
211
+ for dialogue in dialogues:
212
+ turns = dialogue.get('turns', [])
213
+ for turn in turns:
214
+ if turn.get('speaker') == 'assistant' and 'text' in turn:
215
+ response = turn['text'].strip()
216
+ if len(response) >= self.config.min_text_length:
217
+ responses.add(response)
218
+ response_pool = list(responses)
219
+ unique_responses = list(responses) # Assuming uniqueness
220
+ logger.info(f"Collected {len(response_pool)} unique responses.")
221
+ return response_pool, unique_responses
222
+
223
  def build_models(self):
224
+ """Initialize the shared encoder and FAISS index."""
225
  logger.info("Building encoder model...")
226
  tf.keras.backend.clear_session()
227
 
 
229
  self.encoder = EncoderModel(
230
  self.config,
231
  name="shared_encoder",
232
+ shared_weights=True # If weight sharing is intended
233
  )
234
 
235
  # Resize token embeddings after adding special tokens
 
237
  self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
238
  logger.info(f"Token embeddings resized to: {new_vocab_size}")
239
 
240
+ # Initialize FAISS index
241
  self._initialize_faiss()
 
 
242
 
243
+ # Compute and index embeddings
244
+ self._compute_and_index_embeddings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
+ # Retrieve embedding dimension from encoder
247
+ embedding_dim = self.config.embedding_dim
248
  vocab_size = len(self.tokenizer)
249
 
250
  logger.info(f"Encoder Embedding Dimension: {embedding_dim}")
 
254
  else:
255
  logger.error("Vocabulary size is less than embedding dimension.")
256
  raise ValueError("Vocabulary size is less than embedding dimension.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  def _adjust_batch_size(self) -> None:
259
  """Dynamically adjust batch size based on GPU memory usage."""
 
302
  logger.warning(f"Using CPU due to GPU initialization error: {e}")
303
 
304
  # TODO: figure out buf with faiss-gpu
305
+ # TODO: consider IndexIVFFlat in the future (speed).
306
  try:
307
  # Create appropriate index based on dataset size
308
  if len(self.unique_responses) < 1000:
 
875
  logger.info(f"Models and tokenizer loaded from {load_dir}.")
876
  return chatbot
877
 
878
+ # @staticmethod
879
+ # def load_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
880
+ # """
881
+ # Load training data from a JSON file.
882
 
883
+ # Args:
884
+ # data_path (Union[str, Path]): Path to the JSON file containing dialogues.
885
+ # debug_samples (Optional[int]): Number of samples to load for debugging.
886
 
887
+ # Returns:
888
+ # List[dict]: List of dialogue dictionaries.
889
+ # """
890
+ # logger.info(f"Loading training data from {data_path}...")
891
+ # data_path = Path(data_path)
892
+ # if not data_path.exists():
893
+ # logger.error(f"Data file {data_path} does not exist.")
894
+ # return []
895
 
896
+ # with open(data_path, 'r', encoding='utf-8') as f:
897
+ # dialogues = json.load(f)
898
 
899
+ # if debug_samples is not None:
900
+ # dialogues = dialogues[:debug_samples]
901
+ # logger.info(f"Debug mode: Limited to {debug_samples} dialogues")
902
 
903
+ # logger.info(f"Loaded {len(dialogues)} dialogues.")
904
+ # return dialogues
905
 
906
  def train_streaming(
907
  self,
 
1351
 
1352
  conversation_parts.append(f"{self.special_tokens['user']} {query}")
1353
  return "\n".join(conversation_parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 faiss-cpu>=1.7.0     # Required for Facebook AI Similarity Search
+h5py>=3.1.0          # For saving and loading models
 ipython>=8.0.0       # For interactive Python
 loguru>=0.7.0        # Enhanced logging (optional but recommended)
 matplotlib>=3.5.0    # For validation plotting
run_data_preparer.py ADDED
@@ -0,0 +1,182 @@
+import os
+import sys
+import faiss
+import pickle
+from transformers import AutoTokenizer
+from tqdm.auto import tqdm
+from chatbot_model import ChatbotConfig, EncoderModel
+from environment_setup import EnvironmentSetup
+from tf_data_pipeline import TFDataPipeline
+from logger_config import config_logger
+
+logger = config_logger(__name__)
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+def cleanup_test_indices(faiss_dir, test_prefix='test_'):
+    test_files = [f for f in os.listdir(faiss_dir) if f.startswith(test_prefix)]
+    for file in test_files:
+        file_path = os.path.join(faiss_dir, file)
+        os.remove(file_path)
+        logger.info(f"Removed test FAISS index file: {file_path}")
+
+def main():
+    # Constants
+    MODELS_DIR = 'models'
+    PROCESSED_DATA_DIR = 'processed_outputs'
+    CACHE_DIR = 'cache'
+    TOKENIZER_DIR = os.path.join(MODELS_DIR, 'tokenizer')
+    FAISS_INDICES_DIR = os.path.join(MODELS_DIR, 'faiss_indices')
+    TF_RECORD_DIR = 'training_data'
+    FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
+    FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_test.index')
+    ENVIRONMENT = 'test'  # or 'production'
+    if ENVIRONMENT == 'test':
+        FAISS_INDEX_PATH = FAISS_INDEX_TEST_PATH
+    else:
+        FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
+    JSON_TRAINING_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'augmented_dialogues.json')
+    CACHE_FILE = os.path.join(CACHE_DIR, 'query_embeddings_cache.pkl')
+    TF_RECORD_PATH = os.path.join(TF_RECORD_DIR, 'training_data.tfrecord')
+    DEBUG_SAMPLES = None
+
+    # Ensure output directories exist
+    os.makedirs(MODELS_DIR, exist_ok=True)
+    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    os.makedirs(TOKENIZER_DIR, exist_ok=True)
+    os.makedirs(FAISS_INDICES_DIR, exist_ok=True)
+    os.makedirs(TF_RECORD_DIR, exist_ok=True)
+
+    # Initialize configuration
+    config = ChatbotConfig()
+    logger.info(f"Chatbot Configuration: {config}")
+
+    # Initialize tokenizer
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
+        logger.info(f"Tokenizer '{config.pretrained_model}' loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load tokenizer: {e}")
+        sys.exit(1)
+
+    # Add special tokens
+    try:
+        tokenizer.add_special_tokens({'additional_special_tokens': ['<EMPTY_NEGATIVE>']})
+        logger.info("Added special tokens to tokenizer.")
+    except Exception as e:
+        logger.error(f"Failed to add special tokens: {e}")
+        sys.exit(1)
+
+    # Initialize encoder model
+    try:
+        encoder = EncoderModel(config=config)
+        logger.info("EncoderModel initialized successfully.")
+    except Exception as e:
+        logger.error(f"Failed to initialize EncoderModel: {e}")
+        sys.exit(1)
+
+    # Resize token embeddings in encoder to match tokenizer
+    try:
+        encoder.pretrained.resize_token_embeddings(len(tokenizer))
+        logger.info(f"Token embeddings resized to: {len(tokenizer)}")
+    except Exception as e:
+        logger.error(f"Failed to resize token embeddings: {e}")
+        sys.exit(1)
+
+    # Load JSON dialogues
+    try:
+        dialogues = TFDataPipeline.load_json_training_data(JSON_TRAINING_DATA_PATH, DEBUG_SAMPLES)
+        logger.info(f"Loaded {len(dialogues)} dialogues from {JSON_TRAINING_DATA_PATH}.")
+    except Exception as e:
+        logger.error(f"Failed to load dialogues: {e}")
+        sys.exit(1)
+
+    # Load or initialize query_embeddings_cache
+    try:
+        if os.path.exists(CACHE_FILE):
+            with open(CACHE_FILE, 'rb') as f:
+                query_embeddings_cache = pickle.load(f)
+            logger.info(f"Loaded {len(query_embeddings_cache)} query embeddings from {CACHE_FILE}.")
+        else:
+            query_embeddings_cache = {}
+            logger.info("Initialized empty query embeddings cache.")
+    except Exception as e:
+        logger.error(f"Failed to load or initialize query embeddings cache: {e}")
+        sys.exit(1)
+
+    # Initialize TFDataPipeline
+    try:
+        data_pipeline = TFDataPipeline(
+            config=config,
+            tokenizer=tokenizer,
+            encoder=encoder,
+            index_file_path=FAISS_INDEX_PATH,
+            response_pool=[],
+            max_length=config.max_context_token_limit,
+            neg_samples=config.neg_samples,
+            query_embeddings_cache=query_embeddings_cache,
+            max_retries=config.max_retries
+        )
+        logger.info("TFDataPipeline initialized successfully.")
+    except Exception as e:
+        logger.error(f"Failed to initialize TFDataPipeline: {e}")
+        sys.exit(1)
+
+    # Collect unique assistant responses from dialogues
+    try:
+        response_pool = data_pipeline.collect_responses(dialogues)
+        data_pipeline.response_pool = response_pool
+        logger.info(f"Collected {len(response_pool)} unique assistant responses from dialogues.")
+    except Exception as e:
+        logger.error(f"Failed to collect responses: {e}")
+        sys.exit(1)
+
+    # Compute and add response embeddings to FAISS index
+    try:
+        logger.info("Computing and adding response embeddings to FAISS index...")
+        data_pipeline._compute_and_index_response_embeddings()
+        logger.info("Response embeddings computed and added to FAISS index.")
+    except Exception as e:
+        logger.error(f"Failed to compute or add response embeddings: {e}")
+        sys.exit(1)
+
+    # Save FAISS index
+    try:
+        logger.info(f"Saving FAISS index to {FAISS_INDEX_PATH}...")
+        faiss.write_index(data_pipeline.index, FAISS_INDEX_PATH)
+        logger.info("FAISS index saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save FAISS index: {e}")
+        sys.exit(1)
+
+    # Prepare and save training data as TFRecords
+    try:
+        logger.info("Starting data preparation and saving as TFRecord...")
+        data_pipeline.prepare_and_save_data(dialogues, TF_RECORD_PATH)
+        logger.info(f"Data saved as TFRecord at {TF_RECORD_PATH}.")
+    except Exception as e:
+        logger.error(f"Failed during data preparation and saving: {e}")
+        sys.exit(1)
+
+    # Save query embeddings cache
+    try:
+        with open(CACHE_FILE, 'wb') as f:
+            pickle.dump(data_pipeline.query_embeddings_cache, f)
+        logger.info(f"Saved {len(data_pipeline.query_embeddings_cache)} query embeddings to {CACHE_FILE}.")
+    except Exception as e:
+        logger.error(f"Failed to save query embeddings cache: {e}")
+        sys.exit(1)
+
+    # Save Tokenizer (including special tokens)
+    try:
+        tokenizer.save_pretrained(TOKENIZER_DIR)
+        logger.info(f"Tokenizer saved to {TOKENIZER_DIR}.")
+    except Exception as e:
+        logger.error(f"Failed to save tokenizer: {e}")
+        sys.exit(1)
+
+    logger.info("Data preparation pipeline completed successfully.")
+
+if __name__ == "__main__":
+    main()
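
(Presumably the preparer is run directly from the repository root once processed_outputs/augmented_dialogues.json exists:

    python run_data_preparer.py

It reuses existing artifacts where it can: a FAISS index already present at the configured path is reloaded and validated rather than rebuilt, and an existing query_embeddings_cache.pkl is loaded rather than recomputed, per the branches above.)
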
tf_data_pipeline.py ADDED
@@ -0,0 +1,734 @@
+import os
+import gc
+import numpy as np
+import faiss
+import tensorflow as tf
+import h5py
+from tqdm import tqdm
+import json
+from pathlib import Path
+from typing import Union, Optional, List, Tuple, Generator
+from transformers import AutoTokenizer
+from gpu_monitor import GPUMemoryMonitor
+
+from logger_config import config_logger
+logger = config_logger(__name__)
+
+class TFDataPipeline:
+    def __init__(
+        self,
+        config,
+        tokenizer,
+        encoder,
+        index_file_path: str,
+        response_pool: List[str],
+        max_length: int,
+        query_embeddings_cache: dict,
+        neg_samples: int = 3,
+        index_type: str = 'IndexFlatIP',
+        nlist: int = 100,
+        max_retries: int = 3
+    ):
+        self.config = config
+        self.tokenizer = tokenizer
+        self.encoder = encoder
+        self.index_file_path = index_file_path
+        self.response_pool = response_pool
+        self.max_length = max_length
+        self.neg_samples = neg_samples
+        self.query_embeddings_cache = query_embeddings_cache  # In-memory cache for embeddings
+        self.index_type = index_type
+        self.nlist = nlist
+        self.embedding_batch_size = 16 if len(response_pool) < 100 else 64
+        self.search_batch_size = 16 if len(response_pool) < 100 else 64
+        self.max_batch_size = 16 if len(response_pool) < 100 else 64
+        self.memory_monitor = GPUMemoryMonitor()
+        self.max_retries = max_retries
+
+        if os.path.exists(index_file_path):
+            logger.info(f"Loading existing FAISS index from {index_file_path}...")
+            self.index = faiss.read_index(index_file_path)
+            self.validate_faiss_index()
+            logger.info("FAISS index loaded and validated successfully.")
+        else:
+            # Initialize FAISS index
+            dimension = self.encoder.config.embedding_dim
+            self.index = faiss.IndexFlatIP(dimension)
+            logger.info(f"Initialized FAISS IndexFlatIP with dimension {dimension}.")
+
+        if not self.index.is_trained:
+            # Train the index if it's not trained.  # TODO: Replace 'dimension' with embedding size
+            dimension = self.query_embeddings_cache[next(iter(self.query_embeddings_cache))].shape[0]
+            self.index.train(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
+            self.index.add(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
+
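(A note on the is_trained branch above: IndexFlatIP is always trained, so that branch only fires for quantizing indexes. A hedged sketch of the IVF case that the index_type/nlist parameters appear to anticipate; sizes are illustrative:)

    import faiss
    import numpy as np

    dim, nlist = 768, 100
    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)

    train_vecs = np.random.rand(10000, dim).astype(np.float32)
    faiss.normalize_L2(train_vecs)
    index.train(train_vecs)   # required before add() for IVF indexes
    index.add(train_vecs)
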
+    def validate_faiss_index(self):
+        """Validates that the FAISS index has the correct dimensionality."""
+        expected_dim = self.encoder.config.embedding_dim
+        if self.index.d != expected_dim:
+            logger.error(f"FAISS index dimension {self.index.d} does not match encoder embedding dimension {expected_dim}.")
+            raise ValueError("FAISS index dimensionality mismatch.")
+        logger.info("FAISS index dimension validated successfully.")
+
+    def save_embeddings_cache_hdf5(self, cache_file_path: str):
+        """Save the embeddings cache to an HDF5 file."""
+        with h5py.File(cache_file_path, 'w') as hf:
+            for query, emb in self.query_embeddings_cache.items():
+                hf.create_dataset(query, data=emb)
+        logger.info(f"Embeddings cache saved to {cache_file_path}.")
+
+    def load_embeddings_cache_hdf5(self, cache_file_path: str):
+        """Load the embeddings cache from an HDF5 file."""
+        with h5py.File(cache_file_path, 'r') as hf:
+            for query in hf.keys():
+                self.query_embeddings_cache[query] = hf[query][:]
+        logger.info(f"Embeddings cache loaded from {cache_file_path}.")
+
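(One caveat with the HDF5 cache above: h5py treats '/' in a dataset name as a group separator, so a raw query string containing a slash would be stored as a nested group and not round-trip through hf.keys(). A defensive variant might sanitize keys first; a sketch, not part of this commit:)

    import hashlib
    safe_key = hashlib.sha1(query.encode("utf-8")).hexdigest()  # '/' is a path separator in HDF5
    ds = hf.create_dataset(safe_key, data=emb)
    ds.attrs["query"] = query   # keep the original string for load-time reconstruction
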
+    def save_faiss_index(self, index_file_path: str):
+        faiss.write_index(self.index, index_file_path)
+        logger.info(f"FAISS index saved to {index_file_path}")
+
+    def load_faiss_index(self, index_file_path: str):
+        self.index = faiss.read_index(index_file_path)
+        logger.info(f"FAISS index loaded from {index_file_path}")
+
+    def save_tokenizer(self, tokenizer_dir: str):
+        self.tokenizer.save_pretrained(tokenizer_dir)
+        logger.info(f"Tokenizer saved to {tokenizer_dir}")
+
+    def load_tokenizer(self, tokenizer_dir: str):
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+        logger.info(f"Tokenizer loaded from {tokenizer_dir}")
+
+    def estimate_total_pairs(self, dialogues: List[dict]) -> int:
+        """Estimate total number of training pairs including hard negatives."""
+        base_pairs = sum(
+            len([
+                1 for i in range(len(d.get('turns', [])) - 1)
+                if (d['turns'][i].get('speaker') == 'user' and
+                    d['turns'][i+1].get('speaker') == 'assistant')
+            ])
+            for d in dialogues
+        )
+        # Account for hard negatives
+        return base_pairs * (1 + self.neg_samples)
+
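(A quick sanity check of the estimate: each user→assistant pair yields one (query, positive) example plus neg_samples hard negatives, so with 1,000 base pairs and the default neg_samples = 3 the method returns 1,000 * (1 + 3) = 4,000 — the row count against which progress bars and TFRecord sizing can be budgeted.)
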
118
+ @staticmethod
119
+ def load_json_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
120
+ """
121
+ Load training data from a JSON file.
122
+
123
+ Args:
124
+ data_path (Union[str, Path]): Path to the JSON file containing dialogues.
125
+ debug_samples (Optional[int]): Number of samples to load for debugging.
126
+
127
+ Returns:
128
+ List[dict]: List of dialogue dictionaries.
129
+ """
130
+ logger.info(f"Loading training data from {data_path}...")
131
+ data_path = Path(data_path)
132
+ if not data_path.exists():
133
+ logger.error(f"Data file {data_path} does not exist.")
134
+ return []
135
+
136
+ with open(data_path, 'r', encoding='utf-8') as f:
137
+ dialogues = json.load(f)
138
+
139
+ if debug_samples is not None:
140
+ dialogues = dialogues[:debug_samples]
141
+ logger.info(f"Debug mode: Limited to {debug_samples} dialogues")
142
+
143
+ logger.info(f"Loaded {len(dialogues)} dialogues.")
144
+ return dialogues
145
+
+     def collect_responses(self, dialogues: List[dict]) -> List[str]:
+         """Extract unique assistant responses from dialogues."""
+         response_set = set()
+         for dialogue in dialogues:
+             turns = dialogue.get('turns', [])
+             for turn in turns:
+                 speaker = turn.get('speaker')
+                 text = turn.get('text', '').strip()
+                 if speaker == 'assistant' and text:
+                     # Character-length check as a cheap proxy for the token budget;
+                     # shorter responses are always kept.
+                     if len(text) <= self.max_length:
+                         response_set.add(text)
+         logger.info(f"Collected {len(response_set)} unique assistant responses from dialogues.")
+         return list(response_set)
+
+     def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
+         """Extract (query, positive) pairs from adjacent user->assistant turns."""
+         pairs = []
+         turns = dialogue.get('turns', [])
+
+         for i in range(len(turns) - 1):
+             current_turn = turns[i]
+             next_turn = turns[i + 1]
+
+             if (current_turn.get('speaker') == 'user' and
+                     next_turn.get('speaker') == 'assistant' and
+                     'text' in current_turn and
+                     'text' in next_turn):
+                 query = current_turn['text'].strip()
+                 positive = next_turn['text'].strip()
+                 pairs.append((query, positive))
+
+         return pairs
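+
+     # Example: for the dialogue sketched above, this returns
+     #     [("How do I reset my password?", "Use the 'Forgot password' link on the login page.")]
+     # Only adjacent user->assistant turns yield pairs; other transitions are skipped.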
+
+     def _compute_and_index_response_embeddings(self):
+         """Compute embeddings for the response pool and add them to the FAISS index."""
+         logger.info("Computing embeddings for the response pool...")
+
+         # Log a small sample of the response pool for sanity checking.
+         for idx, response in enumerate(self.response_pool[:5], 1):
+             logger.debug(f"Response {idx}: {response} (Type: {type(response)})")
+
+         # Ensure all responses are strings before tokenization.
+         if not all(isinstance(response, str) for response in self.response_pool):
+             logger.error("All elements in response_pool must be strings.")
+             raise ValueError("Invalid data type in response_pool.")
+
+         # Tokenize and encode in batches: this keeps memory bounded and matches
+         # how query embeddings are computed in _compute_embeddings.
+         batch_size = getattr(self, 'embedding_batch_size', 64)  # Default to 64 if not set
+         embeddings = []
+         for i in range(0, len(self.response_pool), batch_size):
+             batch_texts = self.response_pool[i:i + batch_size]
+             encoded = self.tokenizer(
+                 batch_texts,
+                 padding=True,
+                 truncation=True,
+                 max_length=self.max_length,
+                 return_tensors='tf'
+             )
+             batch_embeddings = self.encoder(encoded['input_ids'], training=False).numpy()
+             # L2-normalize so inner-product search behaves like cosine similarity.
+             faiss.normalize_L2(batch_embeddings)
+             embeddings.append(batch_embeddings)
+
+         if embeddings:
+             embeddings = np.vstack(embeddings).astype(np.float32)
+             logger.info(f"Adding {len(embeddings)} response embeddings to FAISS index...")
+             self.index.add(embeddings)
+             logger.info("Response embeddings added to FAISS index.")
+         else:
+             logger.warning("No embeddings to add to FAISS index.")
+
+         # Sanity check: verify the number of embeddings now in the FAISS index.
+         logger.info(f"Total embeddings in FAISS index after addition: {self.index.ntotal}")
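+
+     # Because embeddings are L2-normalized before indexing, inner-product search is
+     # equivalent to cosine similarity. A minimal sketch of a compatible index
+     # (assuming a flat inner-product index; the actual index type is configured
+     # elsewhere in this class):
+     #
+     #     index = faiss.IndexFlatIP(embedding_dim)  # e.g., embedding_dim = 768
+     #     faiss.normalize_L2(embeddings)            # embeddings: float32 [N, embedding_dim]
+     #     index.add(embeddings)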
+
+     def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
+         """Find hard negatives for a batch of queries, with error handling and retries."""
+         retry_count = 0
+         total_responses = len(self.response_pool)
+
+         # Retrieve one extra candidate per query so the positive itself can be skipped.
+         k = self.neg_samples + 1
+
+         while retry_count < self.max_retries:
+             try:
+                 # Gather cached query embeddings in sub-batches to manage memory.
+                 batch_size = 128  # Sub-batch size; adjust as needed.
+                 query_embeddings = []
+                 for i in range(0, len(queries), batch_size):
+                     sub_queries = queries[i:i + batch_size]
+                     sub_embeddings = np.vstack([
+                         self.query_embeddings_cache[q] for q in sub_queries
+                     ]).astype(np.float32)
+                     faiss.normalize_L2(sub_embeddings)
+                     query_embeddings.append(sub_embeddings)
+                 query_embeddings = np.vstack(query_embeddings)
+
+                 # FAISS requires a contiguous memory layout.
+                 query_embeddings = np.ascontiguousarray(query_embeddings)
+
+                 # Perform the FAISS search on CPU.
+                 distances, indices = self.index.search(query_embeddings, k)
+
+                 all_negatives = []
+                 for query_indices, query, positive in zip(indices, queries, positives):
+                     negatives = []
+                     positive_strip = positive.strip()
+                     seen = {positive_strip}
+
+                     for idx in query_indices:
+                         if 0 <= idx < total_responses:
+                             candidate = self.response_pool[idx].strip()
+                             if candidate and candidate not in seen:
+                                 seen.add(candidate)
+                                 negatives.append(candidate)
+                                 if len(negatives) >= self.neg_samples:
+                                     break
+
+                     # Pad with a special token if too few negatives were found.
+                     while len(negatives) < self.neg_samples:
+                         negatives.append("<EMPTY_NEGATIVE>")
+
+                     all_negatives.append(negatives)
+
+                 return all_negatives
+
+             except KeyError as ke:
+                 retry_count += 1
+                 logger.warning(f"Hard negative search attempt {retry_count} failed due to missing embeddings: {ke}")
+                 if retry_count == self.max_retries:
+                     logger.error("Max retries reached for hard negative search due to missing embeddings.")
+                     return [["<EMPTY_NEGATIVE>"] * self.neg_samples for _ in queries]
+                 # Recompute any missing query embeddings before retrying.
+                 self._compute_embeddings(queries)
+                 gc.collect()
+                 if tf.config.list_physical_devices('GPU'):
+                     tf.keras.backend.clear_session()
+             except Exception as e:
+                 retry_count += 1
+                 logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
+                 if retry_count == self.max_retries:
+                     logger.error("Max retries reached for hard negative search.")
+                     return [["<EMPTY_NEGATIVE>"] * self.neg_samples for _ in queries]
+                 # Perform memory cleanup before retrying.
+                 gc.collect()
+                 if tf.config.list_physical_devices('GPU'):
+                     tf.keras.backend.clear_session()
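+
+     # Contract sketch (hypothetical call): for B queries, the method returns B lists
+     # of exactly `neg_samples` strings, padded with "<EMPTY_NEGATIVE>" when needed:
+     #
+     #     negs = pipeline._find_hard_negatives_batch(
+     #         ["how do i track my order?"], ["You can track it from your account page."])
+     #     assert len(negs) == 1 and len(negs[0]) == pipeline.neg_samples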
+
+     def _tokenize_and_encode(self, queries: List[str], positives: List[str], negatives: List[List[str]]) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+         """
+         Tokenize and encode the queries, positives, and negatives.
+
+         Returns:
+             query_ids: [batch_size, max_length]
+             positive_ids: [batch_size, max_length]
+             negative_ids: [batch_size, neg_samples, max_length]
+         """
+         # Tokenize queries.
+         q_enc = self.tokenizer(
+             queries,
+             padding="max_length",
+             truncation=True,
+             max_length=self.max_length,
+             return_tensors="np"
+         )
+         # Tokenize positives.
+         p_enc = self.tokenizer(
+             positives,
+             padding="max_length",
+             truncation=True,
+             max_length=self.max_length,
+             return_tensors="np"
+         )
+         # Tokenize negatives: flatten [batch_size, neg_samples] -> single list.
+         flattened_negatives = [neg for sublist in negatives for neg in sublist]
+         if len(flattened_negatives) == 0:
+             # No negatives at all: return a zero array.
+             n_ids = np.zeros((len(queries), self.neg_samples, self.max_length), dtype=np.int32)
+         else:
+             n_enc = self.tokenizer(
+                 flattened_negatives,
+                 padding="max_length",
+                 truncation=True,
+                 max_length=self.max_length,
+                 return_tensors="np"
+             )
+             n_input_ids = n_enc["input_ids"]
+
+             # Reshape to [batch_size, neg_samples, max_length]; upstream padding
+             # guarantees exactly neg_samples entries per query.
+             batch_size = len(queries)
+             n_ids = n_input_ids.reshape(batch_size, self.neg_samples, self.max_length)
+
+         # Convert everything to int32.
+         query_ids = q_enc["input_ids"].astype(np.int32)
+         positive_ids = p_enc["input_ids"].astype(np.int32)
+         negative_ids = n_ids.astype(np.int32)
+
+         return query_ids, positive_ids, negative_ids
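+
+     # Shape example (hypothetical values): with batch_size=2, neg_samples=3, and
+     # max_length=512, _tokenize_and_encode returns arrays of shapes
+     #     query_ids:    (2, 512)
+     #     positive_ids: (2, 512)
+     #     negative_ids: (2, 3, 512)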
+
+     def prepare_and_save_data(self, dialogues: List[dict], tfrecord_file_path: str, batch_size: int = 32):
+         """Process dialogues in batches and save the results to a TFRecord file."""
+         with tf.io.TFRecordWriter(tfrecord_file_path) as writer:
+             total_dialogues = len(dialogues)
+             logger.debug(f"Total dialogues to process: {total_dialogues}")
+
+             with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
+                 for i in range(0, total_dialogues, batch_size):
+                     batch_dialogues = dialogues[i:i + batch_size]
+                     # For each dialogue: extract pairs, mine hard negatives,
+                     # tokenize, and serialize to the TFRecord.
+                     for dialogue in batch_dialogues:
+                         pairs = self._extract_pairs_from_dialogue(dialogue)
+                         queries = []
+                         positives = []
+
+                         for query, positive in pairs:
+                             queries.append(query)
+                             positives.append(positive)
+
+                         if queries:
+                             # Compute and cache query embeddings before searching.
+                             self._compute_embeddings(queries)
+
+                             # Find hard negatives.
+                             hard_negatives = self._find_hard_negatives_batch(queries, positives)
+
+                             # Log the first few examples for inspection.
+                             for idx, negatives in enumerate(hard_negatives[:5]):
+                                 logger.debug(f"Query: {queries[idx]}")
+                                 logger.debug(f"Positive: {positives[idx]}")
+                                 logger.debug(f"Hard Negatives: {negatives}")
+
+                             # Tokenize and encode.
+                             query_ids, positive_ids, negative_ids = self._tokenize_and_encode(queries, positives, hard_negatives)
+
+                             # Serialize each example and write it to the TFRecord.
+                             # (TFRecord int64_list stores the int32 token IDs as int64.)
+                             for q_id, p_id, n_id in zip(query_ids, positive_ids, negative_ids):
+                                 feature = {
+                                     'query_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=q_id)),
+                                     'positive_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=p_id)),
+                                     'negative_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=n_id.flatten())),
+                                 }
+                                 example = tf.train.Example(features=tf.train.Features(feature=feature))
+                                 writer.write(example.SerializeToString())
+
+                     pbar.update(len(batch_dialogues))
+         logger.info(f"Data preparation complete. TFRecord saved at {tfrecord_file_path}")
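+
+     # Sketch of a matching parse function for the records written above
+     # (illustrative; assumes the same `max_length` and `neg_samples` used at write time):
+     #
+     #     def parse_example(serialized, max_length, neg_samples):
+     #         feature_spec = {
+     #             'query_ids': tf.io.FixedLenFeature([max_length], tf.int64),
+     #             'positive_ids': tf.io.FixedLenFeature([max_length], tf.int64),
+     #             'negative_ids': tf.io.FixedLenFeature([neg_samples * max_length], tf.int64),
+     #         }
+     #         parsed = tf.io.parse_single_example(serialized, feature_spec)
+     #         q = tf.cast(parsed['query_ids'], tf.int32)
+     #         p = tf.cast(parsed['positive_ids'], tf.int32)
+     #         n = tf.reshape(tf.cast(parsed['negative_ids'], tf.int32), [neg_samples, max_length])
+     #         return q, p, n
+     #
+     #     dataset = tf.data.TFRecordDataset("train.tfrecord").map(
+     #         lambda s: parse_example(s, pipeline.max_length, pipeline.neg_samples))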
+
+     def _tokenize_negatives_tf(self, negatives):
+         """Tokenize negatives; intended to be called inside tf.py_function (relies on .numpy())."""
+         # Handle the case where negatives is an empty tensor.
+         if tf.size(negatives) == 0:
+             return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)
+
+         # Convert the EagerTensor to a list of lists of strings.
+         negatives_list = []
+         for neg_list in negatives.numpy():
+             decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg]  # Filter out empty strings.
+             negatives_list.append(decoded_negs)
+
+         # Flatten the list of lists.
+         flattened_negatives = [neg for sublist in negatives_list for neg in sublist]
+
+         # Tokenize the flattened negatives, if any.
+         if flattened_negatives:
+             n_tokens = self.tokenizer(
+                 flattened_negatives,
+                 padding='max_length',
+                 truncation=True,
+                 max_length=self.max_length,
+                 return_tensors='tf'
+             )
+             # Reshape to [batch_size, neg_samples, max_length].
+             n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [-1, self.neg_samples, self.max_length])
+             return n_tokens_reshaped
+         else:
+             return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)
+
+     def _compute_embeddings(self, queries: List[str]) -> None:
+         """Compute and cache embeddings for any queries not already in the cache."""
+         new_queries = [q for q in queries if q not in self.query_embeddings_cache]
+         if not new_queries:
+             return  # All queries already cached.
+
+         # Compute embeddings for the new queries in batches.
+         new_embeddings = []
+         for i in range(0, len(new_queries), self.embedding_batch_size):
+             batch_queries = new_queries[i:i + self.embedding_batch_size]
+             encoded = self.tokenizer(
+                 batch_queries,
+                 padding=True,
+                 truncation=True,
+                 max_length=self.max_length,
+                 return_tensors='tf'
+             )
+             batch_embeddings = self.encoder(encoded['input_ids'], training=False).numpy()
+             faiss.normalize_L2(batch_embeddings)
+             new_embeddings.extend(batch_embeddings)
+
+         # Update the cache.
+         for query, emb in zip(new_queries, new_embeddings):
+             self.query_embeddings_cache[query] = emb
+
+     def data_generator(self, dialogues: List[dict]) -> Generator[Tuple[str, str, List[str]], None, None]:
+         """
+         Generate training examples as (query, positive, hard_negatives) triples.
+         The outer loop is wrapped with tqdm for progress tracking.
+         """
+         total_dialogues = len(dialogues)
+         logger.debug(f"Total dialogues to process: {total_dialogues}")
+
+         with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
+             for dialogue in dialogues:
+                 pairs = self._extract_pairs_from_dialogue(dialogue)
+                 for query, positive in pairs:
+                     # Ensure the query embedding is cached, then mine hard negatives.
+                     self._compute_embeddings([query])
+                     hard_negatives = self._find_hard_negatives_batch([query], [positive])[0]
+                     yield (query, positive, hard_negatives)
+                 pbar.update(1)
+
+     def _prepare_batch(self, queries: tf.Tensor, positives: tf.Tensor, negatives: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+         """Prepare a batch of data for training (eager-mode helper)."""
+         # Convert EagerTensors to lists of strings.
+         queries_list = [query.decode("utf-8") for query in queries.numpy()]
+         positives_list = [pos.decode("utf-8") for pos in positives.numpy()]
+
+         # Tokenize queries and positives.
+         q_tokens = self.tokenizer(queries_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
+         p_tokens = self.tokenizer(positives_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
+
+         # Decode negatives into lists of strings, dropping empty entries.
+         negatives_list = []
+         for neg_list in negatives.numpy():
+             decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg]
+             negatives_list.append(decoded_negs)
+
+         # Flatten negatives for tokenization.
+         flattened_negatives = [neg for sublist in negatives_list for neg in sublist if neg]
+
+         # Tokenize negatives, if any.
+         if flattened_negatives:
+             n_tokens = self.tokenizer(flattened_negatives, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
+             # Reshape to [batch_size, num_negatives, max_length]. This assumes the
+             # same number of negatives per query; adjust if that ever varies.
+             n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [len(queries_list), -1, self.max_length])
+         else:
+             # Placeholder tensor for the no-negatives case.
+             n_tokens_reshaped = tf.zeros([len(queries_list), 0, self.max_length], dtype=tf.int32)
+
+         # Pad or truncate the negatives dimension to exactly neg_samples,
+         # giving a consistent shape of [batch_size, neg_samples, max_length].
+         if n_tokens_reshaped.shape[1] != self.neg_samples:
+             padding = tf.zeros([len(queries_list), tf.maximum(0, self.neg_samples - n_tokens_reshaped.shape[1]), self.max_length], dtype=tf.int32)
+             n_tokens_reshaped = tf.concat([n_tokens_reshaped, padding], axis=1)
+             n_tokens_reshaped = n_tokens_reshaped[:, :self.neg_samples, :]
+
+         # Concatenate positives and negatives along the candidates dimension:
+         # [batch_size, 1 + neg_samples, max_length].
+         combined_p_n_tokens = tf.concat([tf.expand_dims(p_tokens['input_ids'], axis=1), n_tokens_reshaped], axis=1)
+
+         return q_tokens['input_ids'], combined_p_n_tokens
+
+     def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
+         """
+         Create a tf.data.Dataset for streaming training that yields
+         (input_ids_query, input_ids_positive, input_ids_negatives).
+         """
+         # 1) Start with a generator dataset.
+         dataset = tf.data.Dataset.from_generator(
+             lambda: self.data_generator(dialogues),
+             output_signature=(
+                 tf.TensorSpec(shape=(), dtype=tf.string),                   # Query (single string)
+                 tf.TensorSpec(shape=(), dtype=tf.string),                   # Positive (single string)
+                 tf.TensorSpec(shape=(self.neg_samples,), dtype=tf.string)   # Hard negatives (list of strings)
+             )
+         )
+
+         # 2) Batch the raw strings.
+         dataset = dataset.batch(batch_size, drop_remainder=True)
+
+         # 3) Tokenize each batch via tf.py_function. Parallelism is kept at 1
+         #    because the Python-level tokenizer call is not safely parallelizable here.
+         dataset = dataset.map(
+             lambda q, p, n: self._tokenize_triple(q, p, n),
+             num_parallel_calls=1
+         )
+
+         dataset = dataset.prefetch(tf.data.AUTOTUNE)
+         return dataset
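+
+     # Hypothetical usage (assumes `pipeline` and `dialogues` are already prepared):
+     #
+     #     dataset = pipeline.get_tf_dataset(dialogues, batch_size=32)
+     #     for q_ids, p_ids, n_ids in dataset.take(1):
+     #         print(q_ids.shape, p_ids.shape, n_ids.shape)
+     #         # -> (32, max_length) (32, max_length) (32, neg_samples, max_length)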
+
+     def _tokenize_triple(
+         self,
+         q: tf.Tensor,
+         p: tf.Tensor,
+         n: tf.Tensor
+     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+         """
+         Wrap a Python function via tf.py_function to convert tf.Tensors of strings
+         -> Python lists of strings -> HF tokenizer -> Tensors of IDs.
+
+         q and p have shape [batch_size]; n has shape [batch_size, neg_samples]
+         (each row is a list of negatives).
+         """
+         # Use tf.py_function with limited parallelism.
+         q_ids, p_ids, n_ids = tf.py_function(
+             func=self._tokenize_triple_py,
+             inp=[q, p, n, tf.constant(self.max_length), tf.constant(self.neg_samples)],
+             Tout=[tf.int32, tf.int32, tf.int32]
+         )
+
+         # tf.py_function loses static shape information, so set it manually.
+         q_ids.set_shape([None, self.max_length])                    # [batch_size, max_length]
+         p_ids.set_shape([None, self.max_length])                    # [batch_size, max_length]
+         n_ids.set_shape([None, self.neg_samples, self.max_length])  # [batch_size, neg_samples, max_length]
+
+         return q_ids, p_ids, n_ids
+
+     def _tokenize_triple_py(
+         self,
+         q: tf.Tensor,
+         p: tf.Tensor,
+         n: tf.Tensor,
+         max_len: tf.Tensor,
+         neg_samples: tf.Tensor
+     ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+         """
+         Python-side tokenization helper that:
+           - Decodes each tf.string Tensor to a Python list of strings
+           - Calls the HF tokenizer
+           - Reshapes negatives
+           - Returns np.int32 arrays for (q_ids, p_ids, n_ids)
+
+         q: shape [batch_size], p: shape [batch_size]
+         n: shape [batch_size, neg_samples]
+         max_len, neg_samples: scalar ints
+         """
+         max_len = int(max_len.numpy())  # Convert to Python int.
+         neg_samples = int(neg_samples.numpy())
+
+         # 1) Convert Tensors -> Python lists of strings.
+         q_list = [q_i.decode("utf-8") for q_i in q.numpy()]  # shape [batch_size]
+         p_list = [p_i.decode("utf-8") for p_i in p.numpy()]  # shape [batch_size]
+
+         # Decode each row of negatives (shape [batch_size, neg_samples]).
+         n_list = []
+         for row in n.numpy():
+             decoded = [neg.decode("utf-8") for neg in row]
+             n_list.append(decoded)
+
+         # 2) Tokenize queries & positives.
+         q_enc = self.tokenizer(
+             q_list,
+             padding="max_length",
+             truncation=True,
+             max_length=max_len,
+             return_tensors="np"
+         )
+         p_enc = self.tokenizer(
+             p_list,
+             padding="max_length",
+             truncation=True,
+             max_length=max_len,
+             return_tensors="np"
+         )
+
+         # 3) Tokenize negatives: flatten [batch_size, neg_samples] -> single list.
+         flattened_negatives = [neg for row in n_list for neg in row]
+         if len(flattened_negatives) == 0:
+             # No negatives at all: return a zero array.
+             n_ids = np.zeros((len(q_list), neg_samples, max_len), dtype=np.int32)
+         else:
+             n_enc = self.tokenizer(
+                 flattened_negatives,
+                 padding="max_length",
+                 truncation=True,
+                 max_length=max_len,
+                 return_tensors="np"
+             )
+             # Shape [batch_size * neg_samples, max_len].
+             n_input_ids = n_enc["input_ids"]
+
+             # Reshape to [batch_size, neg_samples, max_len], zero-padding any
+             # row that came back with fewer than neg_samples negatives.
+             batch_size = len(q_list)
+             n_ids_list = []
+             for i in range(batch_size):
+                 start_idx = i * neg_samples
+                 end_idx = start_idx + neg_samples
+                 row_negs = n_input_ids[start_idx:end_idx]
+
+                 if row_negs.shape[0] < neg_samples:
+                     deficit = neg_samples - row_negs.shape[0]
+                     pad_arr = np.zeros((deficit, max_len), dtype=np.int32)
+                     row_negs = np.concatenate([row_negs, pad_arr], axis=0)
+
+                 n_ids_list.append(row_negs)
+
+             # Stack -> shape [batch_size, neg_samples, max_len].
+             n_ids = np.stack(n_ids_list, axis=0)
+
+         # 4) Return int32 arrays.
+         q_ids = q_enc["input_ids"].astype(np.int32)  # [batch_size, max_len]
+         p_ids = p_enc["input_ids"].astype(np.int32)  # [batch_size, max_len]
+         n_ids = n_ids.astype(np.int32)               # [batch_size, neg_samples, max_len]
+
+         return q_ids, p_ids, n_ids
+