JoeArmani committed on
Commit cc2577d · 1 Parent(s): 3ea7670

style updates

chatbot_model.py CHANGED
@@ -372,7 +372,7 @@ class RetrievalChatbot(DeviceAwareModel):
372
  reranker: CrossEncoderReranker for refined scoring, if available.
373
  summarizer: Summarizer for long queries, if desired.
374
  summarize_threshold: Summarize if query wordcount > threshold.
375
-
376
  Returns:
377
  List of (response_text, final_score).
378
  """
@@ -383,11 +383,13 @@ class RetrievalChatbot(DeviceAwareModel):
383
  logger.info(f"Summarized Query: {query}")
384
 
385
  detected_domain = self.detect_domain_from_query(query)
386
- #logger.debug(f"Detected domain '{detected_domain}' for query: {query}")
387
-
388
  # Retrieve initial candidates from FAISS
389
  initial_k = min(top_k * 10, len(self.data_pipeline.response_pool))
390
- faiss_candidates = self.retrieve_responses_faiss(query, domain=detected_domain, top_k=initial_k)
 
 
 
391
 
392
  texts = [item[0] for item in faiss_candidates]
393
 
@@ -395,23 +397,18 @@ class RetrievalChatbot(DeviceAwareModel):
395
  if not reranker:
396
  reranker = CrossEncoderReranker(model_name=self.config.cross_encoder_model)
397
 
398
- ce_scores = reranker.rerank(query, texts, max_length=256)
399
-
400
  # Combine cross-encoder score with the base FAISS score (simple multiplicative approach)
401
  final_candidates = []
402
- for (resp_text, faiss_score), ce_score in zip(faiss_candidates, ce_scores):
403
- # TODO: dial this in.
404
- ce_prob = self.sigmoid(ce_score) # ~ relevance in [0..1]
405
- faiss_norm = (faiss_score + 1)/2.0
406
- combined_score = 0.9 * ce_prob + 0.1 * faiss_norm
407
- # alpha = 0.9
408
- # print(f'CE SCORE: {ce_score} FAISS SCORE: {faiss_score}')
409
- # combined_score = alpha * ce_score + (1 - alpha) * faiss_score
410
  length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
411
- #combined_score = ce_score * faiss_score
412
- #final_candidates.append((resp_text, combined_score))
413
  final_candidates.append((resp_text, length_adjusted_score))
414
-
415
  # Sort descending by combined score
416
  final_candidates.sort(key=lambda x: x[1], reverse=True)
417
 
@@ -441,20 +438,18 @@ class RetrievalChatbot(DeviceAwareModel):
441
 
442
  def length_adjust_score(self, text: str, base_score: float) -> float:
443
  """
444
- Penalize very short lines or numeric lines; mildly reward longer lines.
445
- Adjust carefully so you don't overshadow cross-encoder signals.
446
  """
447
  words = text.split()
448
  wcount = len(words)
449
 
450
- # Penalty if under 3 words
451
  if wcount < 4:
452
  return base_score * 0.8
453
 
454
- # Bonus for lines > 12 words
455
- if wcount > 12:
456
- extra = min(wcount - 12, 8)
457
- bonus = 0.0005 * extra
458
  base_score += bonus
459
 
460
  return base_score
@@ -487,7 +482,7 @@ class RetrievalChatbot(DeviceAwareModel):
487
  pattern = r'^[\s]*[\d]+([\s.,\d]+)*[\s]*$'
488
  return bool(re.match(pattern, text.strip()))
489
 
490
- def retrieve_responses_faiss(
491
  self,
492
  query: str,
493
  domain: str = 'other',
@@ -518,9 +513,9 @@ class RetrievalChatbot(DeviceAwareModel):
518
  for rank, idx in enumerate(indices[0]):
519
  if idx < 0:
520
  continue
521
- response = self.data_pipeline.response_pool[idx]
522
- text = response.get('text', '').strip()
523
- cand_domain = response.get('domain', 'other')
524
  score = distances[0][rank]
525
 
526
  # Skip purely numeric or extremely short text (fewer than 3 words):
@@ -554,21 +549,19 @@ class RetrievalChatbot(DeviceAwareModel):
554
  # shares any query keywords, apply a small boost
555
  if query_keywords and any(kw in resp_text.lower() for kw in query_keywords):
556
  new_score *= boost_factor
557
- #logger.debug(f"Boosting response: '{resp_text}' by factor {boost_factor}")
558
-
559
  # Apply length penalty/bonus
560
  new_score = self.length_adjust_score(resp_text, new_score)
561
-
562
  boosted.append((resp_text, new_score))
563
-
564
  # Sort boosted responses
565
  boosted.sort(key=lambda x: x[1], reverse=True)
566
 
567
- # Print top 10
568
- # for resp, score in boosted[:150]:
569
  # logger.debug(f"Candidate: '{resp}' with score {score}")
570
-
571
- # 8) Return top_k
572
  return boosted[:top_k]
573
 
574
  def chat(
@@ -584,10 +577,10 @@ class RetrievalChatbot(DeviceAwareModel):
584
  """
585
  @self.run_on_device
586
  def get_response(self_arg, query_arg):
587
- # 1) Build conversation context string
588
  conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
589
 
590
- # 2) Retrieve + cross-encoder re-rank
591
  results = self_arg.retrieve_responses_cross_encoder(
592
  query=conversation_str,
593
  top_k=top_k,
@@ -595,26 +588,15 @@ class RetrievalChatbot(DeviceAwareModel):
595
  summarizer=self_arg.summarizer,
596
  summarize_threshold=512
597
  )
598
-
599
- # 3) Handle empty or confidence
600
  if not results:
601
- return (
602
- "I'm sorry, but I couldn't find a relevant response.",
603
- [],
604
- {}
605
- )
606
-
607
- if quality_checker:
608
- metrics = quality_checker.check_response_quality(query_arg, results)
609
- if not metrics.get('is_confident', False):
610
- return (
611
- "I need more information to provide a good answer. Could you please clarify?",
612
- results,
613
- metrics
614
- )
615
- return results[0][0], results, metrics
616
 
617
- return results[0][0], results, {}
 
 
 
618
 
619
  return get_response(self, query)
620
 
 
372
  reranker: CrossEncoderReranker for refined scoring, if available.
373
  summarizer: Summarizer for long queries, if desired.
374
  summarize_threshold: Summarize if query wordcount > threshold.
375
+
376
  Returns:
377
  List of (response_text, final_score).
378
  """
 
383
  logger.info(f"Summarized Query: {query}")
384
 
385
  detected_domain = self.detect_domain_from_query(query)
386
+
 
387
  # Retrieve initial candidates from FAISS
388
  initial_k = min(top_k * 10, len(self.data_pipeline.response_pool))
389
+ faiss_candidates = self.faiss_search(query, domain=detected_domain, top_k=initial_k)
390
+
391
+ if not faiss_candidates:
392
+ return []
393
 
394
  texts = [item[0] for item in faiss_candidates]
395
 
 
397
  if not reranker:
398
  reranker = CrossEncoderReranker(model_name=self.config.cross_encoder_model)
399
 
400
+ ce_logits = reranker.rerank(query, texts, max_length=256)
401
+
402
  # Combine cross-encoder score with the base FAISS score (simple multiplicative approach)
403
  final_candidates = []
404
+ for (resp_text, faiss_score), logit in zip(faiss_candidates, ce_logits):
405
+ ce_prob = self.sigmoid(logit) # [0...1]
406
+ faiss_norm = (faiss_score + 1)/2.0 # [0...1]
407
+ combined_score = 0.85 * ce_prob + 0.15 * faiss_norm
 
 
 
 
408
  length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
409
+
 
410
  final_candidates.append((resp_text, length_adjusted_score))
411
+
412
  # Sort descending by combined score
413
  final_candidates.sort(key=lambda x: x[1], reverse=True)
414
 
 
438
 
439
  def length_adjust_score(self, text: str, base_score: float) -> float:
440
  """
441
+ Penalize very short lines, reward longer lines.
 
442
  """
443
  words = text.split()
444
  wcount = len(words)
445
 
446
+ # Penalty if under 4 words
447
  if wcount < 4:
448
  return base_score * 0.8
449
 
450
+ # Bonus for lines > 15 words
451
+ if wcount > 15:
452
+ bonus = min(0.03, 0.001 * (wcount - 15))
 
453
  base_score += bonus
454
 
455
  return base_score
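A self-contained sketch of the length adjustment added above (penalty under 4 words, small capped bonus past 15 words; the 0.8 / 0.001 / 0.03 constants follow the diff, and the final += is assumed to sit inside the bonus branch):

def length_adjust_score(text: str, base_score: float) -> float:
    """Penalize very short candidates; give long ones a small, capped bonus."""
    wcount = len(text.split())
    if wcount < 4:                       # very short / low-signal responses
        return base_score * 0.8
    if wcount > 15:                      # mild reward for longer responses
        bonus = min(0.03, 0.001 * (wcount - 15))
        base_score += bonus
    return base_score

print(length_adjust_score("ok", 0.9))                     # 0.72
print(length_adjust_score(" ".join(["word"] * 30), 0.9))  # 0.915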
 
482
  pattern = r'^[\s]*[\d]+([\s.,\d]+)*[\s]*$'
483
  return bool(re.match(pattern, text.strip()))
484
 
485
+ def faiss_search(
486
  self,
487
  query: str,
488
  domain: str = 'other',
 
513
  for rank, idx in enumerate(indices[0]):
514
  if idx < 0:
515
  continue
516
+ text_dict = self.data_pipeline.response_pool[idx]
517
+ text = text_dict.get('text', '').strip()
518
+ cand_domain = text_dict.get('domain', 'other')
519
  score = distances[0][rank]
520
 
521
  # Skip purely numeric or extremely short text (fewer than 3 words):
 
549
  # shares any query keywords, apply a small boost
550
  if query_keywords and any(kw in resp_text.lower() for kw in query_keywords):
551
  new_score *= boost_factor
552
+
 
553
  # Apply length penalty/bonus
554
  new_score = self.length_adjust_score(resp_text, new_score)
555
+
556
  boosted.append((resp_text, new_score))
557
+
558
  # Sort boosted responses
559
  boosted.sort(key=lambda x: x[1], reverse=True)
560
 
561
+ # Debug
562
+ # for resp, score in boosted[:100]:
563
  # logger.debug(f"Candidate: '{resp}' with score {score}")
564
+
 
565
  return boosted[:top_k]
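Outside the class, the same retrieve-then-boost flow can be sketched roughly as follows. The index, response_pool, and pre-computed query embedding stand in for the pipeline objects, and the domain/keyword boosts mirror the logic above; this is an illustration, not the exact implementation:

import numpy as np
import faiss  # assumes faiss-cpu or faiss-gpu is installed

def faiss_search_sketch(query_emb: np.ndarray, index: faiss.Index, response_pool: list,
                        domain: str = 'other', top_k: int = 10,
                        boost_factor: float = 1.05, query_keywords: set = frozenset()) -> list:
    """Retrieve candidates from FAISS, prefer same-domain hits, boost keyword overlap."""
    distances, indices = index.search(query_emb.reshape(1, -1).astype(np.float32), top_k * 10)
    boosted = []
    for rank, idx in enumerate(indices[0]):
        if idx < 0:
            continue
        cand = response_pool[idx]                      # {"text": ..., "domain": ...}
        text, cand_domain = cand["text"].strip(), cand.get("domain", "other")
        score = float(distances[0][rank])
        if len(text.split()) < 3:                      # skip very short / numeric candidates
            continue
        if cand_domain == domain:                      # light same-domain preference
            score *= boost_factor
        if query_keywords and any(kw in text.lower() for kw in query_keywords):
            score *= boost_factor                      # keyword-overlap boost
        boosted.append((text, score))
    boosted.sort(key=lambda x: x[1], reverse=True)
    return boosted[:top_k]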
566
 
567
  def chat(
 
577
  """
578
  @self.run_on_device
579
  def get_response(self_arg, query_arg):
580
+ # Build conversation context string
581
  conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
582
 
583
+ # Retrieve and re-rank
584
  results = self_arg.retrieve_responses_cross_encoder(
585
  query=conversation_str,
586
  top_k=top_k,
 
588
  summarizer=self_arg.summarizer,
589
  summarize_threshold=512
590
  )
591
+
592
+ # Handle low confidence or empty responses
593
  if not results:
594
+ return ("I'm sorry, but I couldn't find a relevant response.", [], {})
595
 
596
+ metrics = quality_checker.check_response_quality(query_arg, results)
597
+ if not metrics.get('is_confident', False):
598
+ return ("I need more information to provide a good answer. Could you please clarify?", results, metrics)
599
+ return results[0][0], results, metrics
600
 
601
  return get_response(self, query)
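A compact sketch of the retrieve -> confidence-gate -> respond flow in chat(). The retrieve and quality_checker callables are placeholders, and the explicit None check on quality_checker is an extra guard added here for illustration, not necessarily present in the code above:

from typing import Callable, Dict, List, Optional, Tuple

def respond(query: str,
            retrieve: Callable[[str], List[Tuple[str, float]]],
            quality_checker: Optional[object] = None) -> Tuple[str, list, dict]:
    """Retrieve candidates, optionally gate on a confidence check, return (reply, candidates, metrics)."""
    results = retrieve(query)
    if not results:
        return ("I'm sorry, but I couldn't find a relevant response.", [], {})
    metrics: Dict = {}
    if quality_checker is not None:
        metrics = quality_checker.check_response_quality(query, results)
        if not metrics.get('is_confident', False):
            return ("I need more information to provide a good answer. Could you please clarify?",
                    results, metrics)
    return results[0][0], results, metrics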
602
 
conversation_summarizer.py CHANGED
@@ -13,9 +13,11 @@ class ChatConfig:
13
  chunk_size: int = 512
14
  chunk_overlap: int = 256
15
  min_confidence_score: float = 0.7
16
-
17
  class DeviceAwareModel:
18
- """Mixin to handle device placement and mixed precision training."""
 
 
19
 
20
  def setup_device(self, device: str = None):
21
  if device is None:
@@ -24,31 +26,33 @@ class DeviceAwareModel:
24
  self.device = device.upper()
25
  self.strategy = None
26
 
 
 
27
  if self.device == 'GPU':
28
  # # Enable mixed precision for better performance
29
  # policy = tf.keras.mixed_precision.Policy('mixed_float16')
30
  # tf.keras.mixed_precision.set_global_policy(policy)
31
 
32
- # Setup distribution strategy for multi-GPU if available
33
  gpus = tf.config.list_physical_devices('GPU')
34
  if len(gpus) > 1:
35
  self.strategy = tf.distribute.MirroredStrategy()
36
 
37
  return self.device
38
-
39
  def run_on_device(self, func):
40
  """Decorator to ensure ops run on the correct device."""
41
  def wrapper(*args, **kwargs):
42
  with tf.device(f'/{self.device}:0'):
43
  return func(*args, **kwargs)
44
  return wrapper
45
-
46
  class Summarizer(DeviceAwareModel):
47
  """
48
- Enhanced T5-based summarizer with better chunking and device management.
49
- Handles long conversations by intelligent chunking and progressive summarization.
50
  """
51
-
52
  def __init__(
53
  self,
54
  tokenizer: AutoTokenizer,
@@ -57,10 +61,10 @@ class Summarizer(DeviceAwareModel):
57
  device=None,
58
  max_summary_rounds=2
59
  ):
60
- self.tokenizer = tokenizer # Injected tokenizer
61
  self.setup_device(device)
62
 
63
- # Initialize model within strategy scope if using distribution
64
  if self.strategy:
65
  with self.strategy.scope():
66
  self._setup_model(model_name)
@@ -69,11 +73,11 @@ class Summarizer(DeviceAwareModel):
69
 
70
  self.max_summary_length = max_summary_length
71
  self.max_summary_rounds = max_summary_rounds
72
-
73
  def _setup_model(self, model_name):
74
  self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
75
 
76
- # Optimize model for inference
77
  self.model.generate = tf.function(
78
  self.model.generate,
79
  input_signature=[
@@ -83,7 +87,7 @@ class Summarizer(DeviceAwareModel):
83
  }
84
  ]
85
  )
86
-
87
  @tf.function
88
  def _generate_summary(self, inputs):
89
  return self.model.generate(
@@ -94,9 +98,9 @@ class Summarizer(DeviceAwareModel):
94
  early_stopping=True,
95
  no_repeat_ngram_size=3
96
  )
97
-
98
  def chunk_text(self, text: str, chunk_size: int = 512, overlap: int = 256) -> List[str]:
99
- """Split text into overlapping chunks for better context preservation."""
100
  tokens = self.tokenizer.encode(text)
101
  chunks = []
102
 
@@ -105,7 +109,7 @@ class Summarizer(DeviceAwareModel):
105
  chunks.append(self.tokenizer.decode(chunk, skip_special_tokens=True))
106
 
107
  return chunks
108
-
109
  def summarize_text(
110
  self,
111
  text: str,
@@ -113,8 +117,7 @@ class Summarizer(DeviceAwareModel):
113
  round_idx: int = 0
114
  ) -> str:
115
  """
116
- Summarize text with optional progressive summarization
117
- and limit the maximum number of re-summarization rounds.
118
  """
119
  @self.run_on_device
120
  def _summarize_chunk(chunk: str) -> str:
@@ -127,28 +130,27 @@ class Summarizer(DeviceAwareModel):
127
  )
128
  summary_ids = self._generate_summary(inputs)
129
  return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
130
-
131
- # If we've hit our max allowed summarization rounds, just do a single pass
132
  if round_idx >= self.max_summary_rounds:
133
  return _summarize_chunk(text)
134
-
135
- # If text is longer than threshold and progressive summarization is on
136
  if len(text.split()) > 512 and progressive:
137
  chunks = self.chunk_text(text)
138
  chunk_summaries = [_summarize_chunk(chunk) for chunk in chunks]
139
-
140
  # Combine chunk-level summaries
141
  combined_summary = " ".join(chunk_summaries)
142
-
143
- # If still too long, do another summarization pass but increment round_idx
144
  if len(combined_summary.split()) > 512:
145
  return self.summarize_text(
146
  combined_summary,
147
  progressive=True,
148
  round_idx=round_idx + 1
149
  )
150
-
151
  return combined_summary
152
  else:
153
- # If text is not too long, just summarize once and return
154
- return _summarize_chunk(text)
 
13
  chunk_size: int = 512
14
  chunk_overlap: int = 256
15
  min_confidence_score: float = 0.7
16
+
17
  class DeviceAwareModel:
18
+ """
19
+ Mixin: Handle device placement and mixed precision training.
20
+ """
21
 
22
  def setup_device(self, device: str = None):
23
  if device is None:
 
26
  self.device = device.upper()
27
  self.strategy = None
28
 
29
+ # NOTE: Needs more testing. Earlier training issues may have come from other bugs fixed since this was last tested.
30
+ # Reminder: Test model saving/loading alongside mixed precision settings
31
  if self.device == 'GPU':
32
  # # Enable mixed precision for better performance
33
  # policy = tf.keras.mixed_precision.Policy('mixed_float16')
34
  # tf.keras.mixed_precision.set_global_policy(policy)
35
 
36
+ # Setup multi-GPU if available
37
  gpus = tf.config.list_physical_devices('GPU')
38
  if len(gpus) > 1:
39
  self.strategy = tf.distribute.MirroredStrategy()
40
 
41
  return self.device
42
+
43
  def run_on_device(self, func):
44
  """Decorator to ensure ops run on the correct device."""
45
  def wrapper(*args, **kwargs):
46
  with tf.device(f'/{self.device}:0'):
47
  return func(*args, **kwargs)
48
  return wrapper
49
+
50
  class Summarizer(DeviceAwareModel):
51
  """
52
+ T5-based summarizer with device management.
53
+ Uses chunking and progressive summarization for long conversations.
54
  """
55
+
56
  def __init__(
57
  self,
58
  tokenizer: AutoTokenizer,
 
61
  device=None,
62
  max_summary_rounds=2
63
  ):
64
+ self.tokenizer = tokenizer
65
  self.setup_device(device)
66
 
67
+ # Strategy scope if using distribution
68
  if self.strategy:
69
  with self.strategy.scope():
70
  self._setup_model(model_name)
 
73
 
74
  self.max_summary_length = max_summary_length
75
  self.max_summary_rounds = max_summary_rounds
76
+
77
  def _setup_model(self, model_name):
78
  self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
79
 
80
+ # Optimize for inference
81
  self.model.generate = tf.function(
82
  self.model.generate,
83
  input_signature=[
 
87
  }
88
  ]
89
  )
90
+
91
  @tf.function
92
  def _generate_summary(self, inputs):
93
  return self.model.generate(
 
98
  early_stopping=True,
99
  no_repeat_ngram_size=3
100
  )
101
+
102
  def chunk_text(self, text: str, chunk_size: int = 512, overlap: int = 256) -> List[str]:
103
+ """Split text into overlapping chunks for context preservation."""
104
  tokens = self.tokenizer.encode(text)
105
  chunks = []
106
 
 
109
  chunks.append(self.tokenizer.decode(chunk, skip_special_tokens=True))
110
 
111
  return chunks
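The overlapping-chunk step can be illustrated standalone: the stride is chunk_size minus overlap, so consecutive windows share `overlap` tokens. A plain list of token strings stands in for the HF tokenizer output:

from typing import List

def chunk_tokens(tokens: List[str], chunk_size: int = 512, overlap: int = 256) -> List[List[str]]:
    """Split a token list into overlapping windows for progressive summarization."""
    stride = chunk_size - overlap
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), stride)]

toks = [f"t{i}" for i in range(10)]
print(chunk_tokens(toks, chunk_size=4, overlap=2))
# [['t0','t1','t2','t3'], ['t2','t3','t4','t5'], ['t4','t5','t6','t7'], ['t6','t7','t8','t9'], ['t8','t9']]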
112
+
113
  def summarize_text(
114
  self,
115
  text: str,
 
117
  round_idx: int = 0
118
  ) -> str:
119
  """
120
+ Progressive summarization with a limited number of re-summarization rounds.
 
121
  """
122
  @self.run_on_device
123
  def _summarize_chunk(chunk: str) -> str:
 
130
  )
131
  summary_ids = self._generate_summary(inputs)
132
  return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
133
+
134
+ # Do a single summarization pass once max_summary_rounds is reached
135
  if round_idx >= self.max_summary_rounds:
136
  return _summarize_chunk(text)
137
+
138
+ # Chunk and summarize
139
  if len(text.split()) > 512 and progressive:
140
  chunks = self.chunk_text(text)
141
  chunk_summaries = [_summarize_chunk(chunk) for chunk in chunks]
142
+
143
  # Combine chunk-level summaries
144
  combined_summary = " ".join(chunk_summaries)
145
+
 
146
  if len(combined_summary.split()) > 512:
147
  return self.summarize_text(
148
  combined_summary,
149
  progressive=True,
150
  round_idx=round_idx + 1
151
  )
152
+
153
  return combined_summary
154
  else:
155
+ # Summarize once and return
156
+ return _summarize_chunk(text)
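A minimal sketch of the progressive-summarization control flow above. A stub summarize_chunk replaces the T5 call, the 512-word threshold and round limit mirror the code, and the progressive flag is omitted for brevity:

def summarize_text(text: str, summarize_chunk, chunk_fn,
                   max_rounds: int = 2, word_limit: int = 512, round_idx: int = 0) -> str:
    """Chunk, summarize, and re-summarize until short enough or rounds are exhausted."""
    if round_idx >= max_rounds:
        return summarize_chunk(text)                  # final single pass
    if len(text.split()) > word_limit:
        chunks = chunk_fn(text)
        combined = " ".join(summarize_chunk(c) for c in chunks)
        if len(combined.split()) > word_limit:        # still too long: recurse with round_idx + 1
            return summarize_text(combined, summarize_chunk, chunk_fn,
                                  max_rounds, word_limit, round_idx + 1)
        return combined
    return summarize_chunk(text)

# Example with stub functions:
print(summarize_text("short text", summarize_chunk=lambda t: t[:20], chunk_fn=lambda t: [t]))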
cross_encoder_reranker.py CHANGED
@@ -1,23 +1,19 @@
1
  from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
2
  import tensorflow as tf
3
  from typing import List
4
- import numpy as np
5
 
6
  from logger_config import config_logger
7
  logger = config_logger(__name__)
8
 
9
  class CrossEncoderReranker:
10
  """
11
- Cross-Encoder Re-Ranker that takes (query, candidate) pairs,
12
- outputs a single relevance score in [0,1].
13
  """
14
  def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
15
  """
16
- Initialize the cross-encoder with a pretrained model.
17
-
18
  Args:
19
- model_name: Name of a HF cross-encoder model. Must be
20
- compatible with TFAutoModelForSequenceClassification.
21
  """
22
  logger.info(f"Initializing CrossEncoderReranker with {model_name}...")
23
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -31,21 +27,16 @@ class CrossEncoderReranker:
31
  max_length: int = 256
32
  ) -> List[float]:
33
  """
34
- Compute relevance scores for each candidate w.r.t. the query.
35
-
36
  Args:
37
  query: User's query text.
38
  candidates: List of candidate response texts.
39
  max_length: Max token length for each (query, candidate) pair.
40
-
41
  Returns:
42
- A list of float scores in [0,1], one per candidate,
43
- indicating model's predicted relevance.
44
  """
45
- # 1) Build (query, candidate) pairs
46
  pair_texts = [(query, candidate) for candidate in candidates]
47
-
48
- # 2) Tokenize the entire batch
49
  encodings = self.tokenizer(
50
  pair_texts,
51
  padding=True,
@@ -54,24 +45,20 @@ class CrossEncoderReranker:
54
  return_tensors="tf"
55
  )
56
 
57
- # 3) Forward pass -> logits shape [batch_size, 1]
 
 
58
  outputs = self.model(
59
  input_ids=encodings["input_ids"],
60
  attention_mask=encodings["attention_mask"],
61
- token_type_ids=encodings.get("token_type_ids") # Some models need token_type_ids
62
  )
63
 
64
  logits = outputs.logits # shape [batch_size, 1]
65
- # 4) Convert logits -> [0,1] range via sigmoid
66
- # If the cross-encoder is a single-logit regression to [0,1],
67
- # this is a typical interpretation.
68
  scores = tf.nn.sigmoid(logits) # shape [batch_size, 1]
69
 
70
- # 5) Flatten to a 1D NumPy array of floats
71
  scores = tf.reshape(scores, [-1])
72
  scores = scores.numpy().astype(float)
73
 
74
- # logger.debug(f"Cross-Encoder raw logits: {logits.numpy().flatten().tolist()}")
75
- # logger.debug(f"Cross-Encoder sigmoid scores: {scores.tolist()}")
76
-
77
  return scores.tolist()
 
1
  from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
2
  import tensorflow as tf
3
  from typing import List
 
4
 
5
  from logger_config import config_logger
6
  logger = config_logger(__name__)
7
 
8
  class CrossEncoderReranker:
9
  """
10
+ Cross-Encoder Re-Ranker. Takes (query, candidate) pairs and outputs a relevance score in [0, 1].
 
11
  """
12
  def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
13
  """
14
+ Init the cross-encoder with a pretrained model.
 
15
  Args:
16
+ model_name: Name of a HF cross-encoder model. Must be compatible with TFAutoModelForSequenceClassification.
 
17
  """
18
  logger.info(f"Initializing CrossEncoderReranker with {model_name}...")
19
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 
27
  max_length: int = 256
28
  ) -> List[float]:
29
  """
30
+ Compute relevance scores for each candidate w.r.t. query.
 
31
  Args:
32
  query: User's query text.
33
  candidates: List of candidate response texts.
34
  max_length: Max token length for each (query, candidate) pair.
 
35
  Returns:
36
+ A list of float scores in [0, 1], one per candidate, indicating the model's predicted relevance.
 
37
  """
38
+ # Build (query, candidate) pairs, then tokenize
39
  pair_texts = [(query, candidate) for candidate in candidates]
 
 
40
  encodings = self.tokenizer(
41
  pair_texts,
42
  padding=True,
 
45
  return_tensors="tf"
46
  )
47
 
48
+ # Forward pass, logits shape [batch_size, 1]
49
+ # Then convert logits to [0...1] range with sigmoid
50
+ # Note: token_type_ids are optional. .get() avoids KeyError
51
  outputs = self.model(
52
  input_ids=encodings["input_ids"],
53
  attention_mask=encodings["attention_mask"],
54
+ token_type_ids=encodings.get("token_type_ids")
55
  )
56
 
57
  logits = outputs.logits # shape [batch_size, 1]
 
 
 
58
  scores = tf.nn.sigmoid(logits) # shape [batch_size, 1]
59
 
60
+ # Flatten to 1D NumPy array, ensure float type
61
  scores = tf.reshape(scores, [-1])
62
  scores = scores.numpy().astype(float)
63
 
 
 
 
64
  return scores.tolist()
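Typical usage of the re-ranker defined in this file, as a sketch (the model is downloaded on first use; the query and candidates are made-up examples):

from typing import List

reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
query = "What time do you open tomorrow?"
candidates: List[str] = ["We open at 9 am on weekdays.", "I like pizza."]
scores = reranker.rerank(query, candidates, max_length=256)
best_response = max(zip(candidates, scores), key=lambda pair: pair[1])[0]
print(best_response)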
tf_data_pipeline.py CHANGED
@@ -4,6 +4,8 @@ import faiss
4
  import tensorflow as tf
5
  import h5py
6
  import math
 
 
7
  from tqdm import tqdm
8
  import json
9
  from pathlib import Path
@@ -46,47 +48,47 @@ class TFDataPipeline:
46
  self.max_batch_size = 16 if len(response_pool) < 100 else 64
47
  self.max_retries = max_retries
48
 
49
- # Build a quick text->domain map for O(1) domain lookups
50
  self._text_domain_map = {}
51
  self.build_text_to_domain_map()
52
-
 
53
  if os.path.exists(index_file_path):
54
  logger.info(f"Loading existing FAISS index from {index_file_path}...")
55
  self.index = faiss.read_index(index_file_path)
56
  self.validate_faiss_index()
57
  logger.info("FAISS index loaded and validated successfully.")
58
  else:
59
- # Initialize FAISS index
60
  dimension = self.encoder.config.embedding_dim
61
  self.index = faiss.IndexFlatIP(dimension)
62
  logger.info(f"Initialized FAISS IndexFlatIP with dimension {dimension}.")
63
 
64
  if not self.index.is_trained:
65
- # Train the index if it's not trained. # TODO: Replace 'dimension' with embedding size
66
  dimension = self.query_embeddings_cache[next(iter(self.query_embeddings_cache))].shape[0]
67
  self.index.train(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
68
  self.index.add(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
69
-
70
  def save_embeddings_cache_hdf5(self, cache_file_path: str):
71
- """Save the embeddings cache to an HDF5 file."""
72
  with h5py.File(cache_file_path, 'w') as hf:
73
  for query, emb in self.query_embeddings_cache.items():
74
  hf.create_dataset(query, data=emb)
75
  logger.info(f"Embeddings cache saved to {cache_file_path}.")
76
-
77
  def load_embeddings_cache_hdf5(self, cache_file_path: str):
78
- """Load the embeddings cache from an HDF5 file."""
79
  with h5py.File(cache_file_path, 'r') as hf:
80
  for query in hf.keys():
81
  self.query_embeddings_cache[query] = hf[query][:]
82
  logger.info(f"Embeddings cache loaded from {cache_file_path}.")
83
-
84
  def save_faiss_index(self, index_file_path: str):
85
  faiss.write_index(self.index, index_file_path)
86
  logger.info(f"FAISS index saved to {index_file_path}")
87
 
88
  def load_faiss_index(self, index_file_path: str):
89
- """Load the FAISS index from the specified file path."""
90
  if os.path.exists(index_file_path):
91
  self.index = faiss.read_index(index_file_path)
92
  logger.info(f"FAISS index loaded from {index_file_path}.")
@@ -95,7 +97,7 @@ class TFDataPipeline:
95
  raise FileNotFoundError(f"FAISS index file not found at {index_file_path}.")
96
 
97
  def validate_faiss_index(self):
98
- """Validates that the FAISS index has the correct dimensionality."""
99
  expected_dim = self.encoder.config.embedding_dim
100
  if self.index.d != expected_dim:
101
  logger.error(f"FAISS index dimension {self.index.d} does not match encoder embedding dimension {expected_dim}.")
@@ -114,7 +116,6 @@ class TFDataPipeline:
114
  def load_json_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
115
  """
116
  Load training data from a JSON file.
117
-
118
  Args:
119
  data_path (Union[str, Path]): Path to the JSON file containing dialogues.
120
  debug_samples (Optional[int]): Number of samples to load for debugging.
@@ -137,17 +138,16 @@ class TFDataPipeline:
137
 
138
  logger.info(f"Loaded {len(dialogues)} dialogues.")
139
  return dialogues
140
-
141
  def collect_responses_with_domain(self, dialogues: List[dict]) -> List[Dict[str, str]]:
142
  """
143
- Extract unique assistant responses from dialogues, along with the domain.
144
- Returns a list of dicts: [{'domain': str, 'text': str}, ...]
145
  """
146
- response_set = set() # We'll store (domain, text) tuples to keep them unique
147
  results = []
148
-
149
  for dialogue in tqdm(dialogues, desc="Processing Dialogues", unit="dialogue"):
150
- # domain is stored at the top level in your new JSON format
151
  domain = dialogue.get('domain', 'other')
152
  turns = dialogue.get('turns', [])
153
  for turn in turns:
@@ -155,7 +155,7 @@ class TFDataPipeline:
155
  text = turn.get('text', '').strip()
156
  if speaker == 'assistant' and text:
157
  if len(text) <= self.max_length:
158
- # Use a tuple as a "set" key to ensure uniqueness
159
  key = (domain, text)
160
  if key not in response_set:
161
  response_set.add(key)
@@ -163,23 +163,9 @@ class TFDataPipeline:
163
  "domain": domain,
164
  "text": text
165
  })
166
-
167
  logger.info(f"Collected {len(results)} unique assistant responses from dialogues.")
168
  return results
169
- # def collect_responses(self, dialogues: List[dict]) -> List[str]:
170
- # """Extract unique assistant responses from dialogues."""
171
- # response_set = set()
172
- # for dialogue in tqdm(dialogues, desc="Processing Dialogues", unit="dialogue"):
173
- # turns = dialogue.get('turns', [])
174
- # for turn in turns:
175
- # speaker = turn.get('speaker')
176
- # text = turn.get('text', '').strip()
177
- # if speaker == 'assistant' and text:
178
- # # Ensure we don't exclude valid shorter responses
179
- # if len(text) <= self.max_length:
180
- # response_set.add(text)
181
- # logger.info(f"Collected {len(response_set)} unique assistant responses from dialogues.")
182
- # return list(response_set)
183
 
184
  def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
185
  """Extract query-response pairs from a dialogue."""
@@ -203,18 +189,18 @@ class TFDataPipeline:
203
 
204
  def compute_and_index_response_embeddings(self):
205
  """
206
- Computes embeddings for the response pool and adds them to the FAISS index.
207
- self.response_pool is now List[Dict[str, str]] with keys "domain" and "text".
208
  """
209
  logger.info("Computing embeddings for the response pool...")
210
-
211
- # Extract just the assistant text
212
  texts = [resp["text"] for resp in self.response_pool]
213
  logger.debug(f"Total texts to embed: {len(texts)}")
214
 
215
  batch_size = getattr(self, 'embedding_batch_size', 64)
216
  embeddings = []
217
-
218
  with tqdm(total=len(texts), desc="Computing Embeddings", unit="response") as pbar:
219
  for i in range(0, len(texts), batch_size):
220
  batch_texts = texts[i:i+batch_size]
@@ -226,36 +212,30 @@ class TFDataPipeline:
226
  return_tensors='tf'
227
  )
228
  batch_embeds = self.encoder(encodings['input_ids'], training=False).numpy()
229
-
230
  embeddings.append(batch_embeds)
231
  pbar.update(len(batch_texts))
232
-
233
  # Combine embeddings and add to FAISS
234
  all_embeddings = np.vstack(embeddings).astype(np.float32)
235
  logger.info(f"Adding {len(all_embeddings)} response embeddings to FAISS index...")
236
  self.index.add(all_embeddings)
237
 
238
- # For debugging or repeated usage, you might store them:
239
  self.response_embeddings = all_embeddings
240
  logger.info(f"FAISS index now has {self.index.ntotal} vectors.")
241
 
242
- def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
243
  """
244
  Find hard negatives for a batch of queries using FAISS search.
245
- Falls back to random negatives if we run out of tries or can't find enough.
246
- Uses domain-based fallback if possible.
247
  """
248
- import random
249
- import gc
250
-
251
  retry_count = 0
252
  total_responses = len(self.response_pool)
253
- k = self.neg_samples # Number of negatives to retrieve from FAISS
254
- batch_size = 128
255
-
256
  while retry_count < self.max_retries:
257
  try:
258
- # 1) Build query embeddings from the cache
259
  query_embeddings = []
260
  for i in range(0, len(queries), batch_size):
261
  sub_queries = queries[i : i + batch_size]
@@ -263,23 +243,24 @@ class TFDataPipeline:
263
  sub_embeds = np.vstack(sub_embeds).astype(np.float32)
264
  faiss.normalize_L2(sub_embeds) # If not already normalized
265
  query_embeddings.append(sub_embeds)
266
-
267
  query_embeddings = np.vstack(query_embeddings)
268
  query_embeddings = np.ascontiguousarray(query_embeddings)
269
-
270
- # 2) Perform FAISS search
271
- distances, indices = self.index.search(query_embeddings, k)
272
-
273
  all_negatives = []
274
- # For each query, find domain from the corresponding positive if possible
275
  for query_indices, query_text, pos_text in zip(indices, queries, positives):
276
  negative_list = []
 
 
277
  seen = {pos_text.strip()}
278
-
279
- # Attempt to detect the domain of the positive text
280
  domain_of_positive = self._detect_domain_for_text(pos_text)
281
-
282
- # Collect hard negatives from FAISS
283
  for idx in query_indices:
284
  if 0 <= idx < total_responses:
285
  candidate_dict = self.response_pool[idx] # e.g. {domain, text}
@@ -289,18 +270,18 @@ class TFDataPipeline:
289
  negative_list.append(candidate_text)
290
  if len(negative_list) >= self.neg_samples:
291
  break
292
-
293
- # If not enough negatives, fallback to random domain-based
294
  if len(negative_list) < self.neg_samples:
295
  needed = self.neg_samples - len(negative_list)
296
- # Pass in domain_of_positive to your updated `_get_random_negatives(...)`
297
  random_negatives = self._get_random_negatives(needed, seen, domain=domain_of_positive)
298
  negative_list.extend(random_negatives)
299
-
300
  all_negatives.append(negative_list)
301
-
302
  return all_negatives
303
-
304
  except KeyError as ke:
305
  retry_count += 1
306
  logger.warning(f"Hard negative search attempt {retry_count} failed due to missing embeddings: {ke}")
@@ -310,7 +291,7 @@ class TFDataPipeline:
310
  gc.collect()
311
  if tf.config.list_physical_devices('GPU'):
312
  tf.keras.backend.clear_session()
313
-
314
  except Exception as e:
315
  retry_count += 1
316
  logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
@@ -320,29 +301,27 @@ class TFDataPipeline:
320
  gc.collect()
321
  if tf.config.list_physical_devices('GPU'):
322
  tf.keras.backend.clear_session()
323
-
324
  def _detect_domain_for_text(self, text: str) -> Optional[str]:
325
  """
326
- O(1) domain detection by looking up text in our dictionary.
327
- Returns the domain if found, else None.
328
  """
329
  stripped_text = text.strip()
330
  return self._text_domain_map.get(stripped_text, None)
331
 
332
  def _get_random_negatives(self, needed: int, seen: set, domain: Optional[str] = None) -> List[str]:
333
  """
334
- Return a list of 'needed' random negative texts from the same domain if possible,
335
- otherwise fallback to all-domain.
336
  """
337
- # 1) Filter response_pool for domain if provided
338
  if domain:
339
  domain_texts = [r["text"] for r in self.response_pool if r["domain"] == domain]
340
  # fallback to entire set if insufficient domain_texts
341
- if len(domain_texts) < needed * 2: # pick some threshold
342
  domain_texts = [r["text"] for r in self.response_pool]
343
  else:
344
  domain_texts = [r["text"] for r in self.response_pool]
345
-
346
  negatives = []
347
  tries = 0
348
  max_tries = needed * 10
@@ -352,8 +331,7 @@ class TFDataPipeline:
352
  if candidate and candidate not in seen:
353
  negatives.append(candidate)
354
  seen.add(candidate)
355
-
356
- # If still not enough, we do the best we can
357
  if len(negatives) < needed:
358
  logger.warning(f"Could not find enough domain-based random negatives; needed {needed}, got {len(negatives)}.")
359
 
@@ -369,47 +347,44 @@ class TFDataPipeline:
369
  all_negatives = []
370
 
371
  for pos_text in positives:
372
- # Build a 'seen' set with the positive
373
  seen = {pos_text.strip()}
374
-
375
- # Attempt to detect the domain of the positive text
376
  domain_of_positive = self._detect_domain_for_text(pos_text)
377
-
378
- # Use domain-based random negatives if available
379
  negs = self._get_random_negatives(self.neg_samples, seen, domain=domain_of_positive)
380
  all_negatives.append(negs)
381
-
382
  return all_negatives
383
 
384
  def build_text_to_domain_map(self):
385
  """
386
- Build an O(1) lookup dict: text -> domain,
387
- so we don't have to scan the entire self.response_pool each time.
388
  """
389
  self._text_domain_map = {}
390
-
391
  for item in self.response_pool:
392
- # e.g., item = {"domain": "restaurant", "text": "some text..."}
393
  stripped_text = item["text"].strip()
394
  domain = item["domain"]
395
-
396
- # If the same text appears multiple times with the same domain, no big deal.
397
- # If it appears with a different domain, you can decide how to handle collisions.
398
  if stripped_text in self._text_domain_map:
399
- existing_domain = self._text_domain_map[stripped_text]
400
- if existing_domain != domain:
401
- # Log a warning or decide on a policy:
402
- logger.warning(
403
- f"Collision detected: text '{stripped_text}' found with domains "
404
- f"'{existing_domain}' and '{domain}'. Keeping the first."
405
- )
 
406
  # By default, keep the first domain or overwrite. We'll skip overwriting:
407
  continue
408
  else:
409
  # Insert into the dict
410
  self._text_domain_map[stripped_text] = domain
411
-
412
- logger.info(f"Built text->domain map with {len(self._text_domain_map)} unique text entries.")
413
 
414
  def encode_query(
415
  self,
@@ -422,11 +397,10 @@ class TFDataPipeline:
422
  Args:
423
  query: The user query.
424
  context: Optional conversation history as a list of (user_text, assistant_text).
425
-
426
  Returns:
427
  np.ndarray of shape [embedding_dim], typically L2-normalized already.
428
  """
429
- # 1) Prepare context (if any) by concatenating user/assistant pairs
430
  if context:
431
  # Take the last N turns
432
  relevant_history = context[-self.config.max_context_turns:]
@@ -438,18 +412,18 @@ class TFDataPipeline:
438
  )
439
  context_str = " ".join(context_str_parts)
440
 
441
- # Append the user's new query
442
  full_query = (
443
  f"{context_str} "
444
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
445
  )
446
  else:
447
- # Just a single user turn
448
  full_query = (
449
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
450
  )
451
 
452
- # 2) Tokenize
453
  encodings = self.tokenizer(
454
  [full_query],
455
  padding='max_length',
@@ -459,20 +433,18 @@ class TFDataPipeline:
459
  )
460
  input_ids = encodings['input_ids']
461
 
462
- # 3) Check for out-of-vocab IDs
463
  max_id = np.max(input_ids)
464
  vocab_size = len(self.tokenizer)
465
  if max_id >= vocab_size:
466
  logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
467
  raise ValueError("Token ID exceeds vocabulary size.")
468
 
469
- # 4) Get embeddings from the model
470
  embeddings = self.encoder(input_ids, training=False).numpy()
471
- # Typically your custom model already L2-normalizes the final embeddings.
472
-
473
- # 5) Return the single embedding as 1D array
474
  return embeddings[0]
475
-
476
  def encode_responses(
477
  self,
478
  responses: List[str],
@@ -480,16 +452,13 @@ class TFDataPipeline:
480
  ) -> np.ndarray:
481
  """
482
  Encode multiple response texts into embedding vectors.
483
-
484
  Args:
485
- responses: List of raw assistant responses.
486
  context: Optional conversation context (last N turns).
487
-
488
  Returns:
489
  np.ndarray of shape [num_responses, embedding_dim].
490
  """
491
- # 1) If you want to incorporate context into response encoding
492
- # Usually for retrieval we might skip this. But if you want it:
493
  if context:
494
  relevant_history = context[-self.config.max_context_turns:]
495
  prepared = []
@@ -501,21 +470,21 @@ class TFDataPipeline:
501
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {a_text}"
502
  )
503
  context_str = " ".join(context_str_parts)
504
-
505
- # Now treat resp as an assistant turn
506
  full_resp = (
507
  f"{context_str} "
508
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {resp}"
509
  )
510
  prepared.append(full_resp)
511
  else:
512
- # By default, just mark each response as from the assistant
513
  prepared = [
514
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
515
  for r in responses
516
  ]
517
-
518
- # 2) Tokenize
519
  encodings = self.tokenizer(
520
  prepared,
521
  padding='max_length',
@@ -524,28 +493,22 @@ class TFDataPipeline:
524
  return_tensors='np'
525
  )
526
  input_ids = encodings['input_ids']
527
-
528
- # 3) Check for out-of-vocab
529
  max_id = np.max(input_ids)
530
  vocab_size = len(self.tokenizer)
531
  if max_id >= vocab_size:
532
  logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
533
  raise ValueError("Token ID exceeds vocabulary size.")
534
-
535
- # 4) Model forward
536
  embeddings = self.encoder(input_ids, training=False).numpy()
537
- # Typically already L2-normalized if your final layer is normalized.
538
-
539
  return embeddings.astype('float32')
540
 
541
  def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
542
  """
543
- Processes dialogues in batches and saves to a TFRecord file using optimized batch tokenization and encoding.
544
-
545
- Args:
546
- dialogues (List[dict]): List of dialogue dictionaries.
547
- tf_record_path (str): Path to save the TFRecord file.
548
- batch_size (int): Number of dialogues to process per batch.
549
  """
550
  logger.info(f"Preparing and saving data to {tf_record_path}...")
551
 
@@ -553,14 +516,13 @@ class TFDataPipeline:
553
  num_batches = math.ceil(num_dialogues / batch_size)
554
 
555
  with tf.io.TFRecordWriter(tf_record_path) as writer:
556
- # Initialize progress bar
557
  with tqdm(total=num_batches, desc="Preparing Data Batches", unit="batch") as pbar:
558
  for i in range(num_batches):
559
  start_idx = i * batch_size
560
  end_idx = min(start_idx + batch_size, num_dialogues)
561
  batch_dialogues = dialogues[start_idx:end_idx]
562
 
563
- # Extract all query-positive pairs in the batch
564
  queries = []
565
  positives = []
566
  for dialogue in batch_dialogues:
@@ -572,7 +534,7 @@ class TFDataPipeline:
572
 
573
  if not queries:
574
  pbar.update(1)
575
- continue # Skip if no valid queries
576
 
577
  # Compute and cache query embeddings
578
  try:
@@ -580,11 +542,11 @@ class TFDataPipeline:
580
  except Exception as e:
581
  logger.error(f"Error computing embeddings: {e}")
582
  pbar.update(1)
583
- continue # Skip to the next batch
584
 
585
- # Find hard negatives for the batch
586
  try:
587
- hard_negatives = self._find_hard_negatives_batch(queries, positives)
588
  except Exception as e:
589
  logger.error(f"Error finding hard negatives: {e}")
590
  pbar.update(1)
@@ -611,8 +573,8 @@ class TFDataPipeline:
611
  pbar.update(1)
612
  continue # Skip to the next batch
613
 
614
- # Flatten hard_negatives while maintaining alignment
615
- # Assuming hard_negatives is a list of lists, where each sublist corresponds to a query
616
  try:
617
  flattened_negatives = [neg for sublist in hard_negatives for neg in sublist]
618
  encoded_negatives = self.tokenizer.batch_encode_plus(
@@ -623,15 +585,15 @@ class TFDataPipeline:
623
  return_tensors='tf'
624
  )
625
 
626
- # Reshape encoded_negatives['input_ids'] to [num_queries, num_negatives, max_length]
627
  num_negatives = self.config.neg_samples
628
  reshaped_negatives = encoded_negatives['input_ids'].numpy().reshape(-1, num_negatives, self.config.max_context_token_limit)
629
  except Exception as e:
630
  logger.error(f"Error during negatives tokenization: {e}")
631
  pbar.update(1)
632
- continue # Skip to the next batch
633
 
634
- # Serialize each example and write to TFRecord
635
  for j in range(len(queries)):
636
  try:
637
  q_id = encoded_queries['input_ids'][j].numpy()
@@ -655,11 +617,14 @@ class TFDataPipeline:
655
  logger.info(f"Data preparation complete. TFRecord saved.")
656
 
657
  def _compute_embeddings(self, queries: List[str]) -> None:
 
 
 
658
  new_queries = [q for q in queries if q not in self.query_embeddings_cache]
659
  if not new_queries:
660
- return # All queries already cached
661
-
662
- # Compute embeddings for new queries
663
  new_embeddings = []
664
  for i in range(0, len(new_queries), self.embedding_batch_size):
665
  batch_queries = new_queries[i:i + self.embedding_batch_size]
@@ -673,49 +638,46 @@ class TFDataPipeline:
673
  batch_embeddings = self.encoder(encoded['input_ids'], training=False).numpy()
674
  faiss.normalize_L2(batch_embeddings)
675
  new_embeddings.extend(batch_embeddings)
676
-
677
  # Update the cache
678
  for query, emb in zip(new_queries, new_embeddings):
679
  self.query_embeddings_cache[query] = emb
680
-
681
  def data_generator(self, dialogues: List[dict]) -> Generator[Tuple[str, str, List[str]], None, None]:
682
  """
683
- Generates training examples: (query, positive, hard_negatives).
684
- Wrapped the outer loop with tqdm for progress tracking.
685
  """
686
  total_dialogues = len(dialogues)
687
  logger.debug(f"Total dialogues to process: {total_dialogues}")
688
-
689
- # Initialize tqdm progress bar
690
  with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
691
  for dialogue in dialogues:
692
  pairs = self._extract_pairs_from_dialogue(dialogue)
693
  for query, positive in pairs:
694
  # Ensure embeddings are computed, find hard negatives, etc.
695
  self._compute_embeddings([query])
696
- hard_negatives = self._find_hard_negatives_batch([query], [positive])[0]
697
  yield (query, positive, hard_negatives)
698
  pbar.update(1)
699
 
700
  def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
701
  """
702
- Creates a tf.data.Dataset for streaming training that yields
703
- (input_ids_query, input_ids_positive, input_ids_negatives).
704
  """
705
  # 1) Start with a generator dataset
706
  dataset = tf.data.Dataset.from_generator(
707
  lambda: self.data_generator(dialogues),
708
  output_signature=(
709
- tf.TensorSpec(shape=(), dtype=tf.string), # Query (single string)
710
- tf.TensorSpec(shape=(), dtype=tf.string), # Positive (single string)
711
- tf.TensorSpec(shape=(self.neg_samples,), dtype=tf.string) # Hard Negatives (list of strings)
712
  )
713
  )
714
 
715
- # 2) Batch the raw strings
 
716
  dataset = dataset.batch(batch_size, drop_remainder=True)
717
-
718
- # 3) Map them through a tokenize step using `tf.py_function`
719
  dataset = dataset.map(
720
  lambda q, p, n: self._tokenize_triple(q, p, n),
721
  num_parallel_calls=1 #tf.data.AUTOTUNE
@@ -731,22 +693,19 @@ class TFDataPipeline:
731
  n: tf.Tensor
732
  ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
733
  """
734
- Wraps a Python function via tf.py_function to convert tf.Tensors of strings
735
- -> Python lists of strings -> HF tokenizer -> Tensors of IDs.
736
-
737
- q is shape [batch_size], p is shape [batch_size],
738
- n is shape [batch_size, neg_samples] (i.e., each row is a list of negatives).
739
  """
740
- # Use tf.py_function with limited parallelism
741
  q_ids, p_ids, n_ids = tf.py_function(
742
  func=self._tokenize_triple_py,
743
  inp=[q, p, n, tf.constant(self.max_length), tf.constant(self.neg_samples)],
744
  Tout=[tf.int32, tf.int32, tf.int32]
745
  )
746
 
747
- # Manually set shape information
748
- q_ids.set_shape([None, self.max_length]) # [batch_size, max_length]
749
- p_ids.set_shape([None, self.max_length]) # [batch_size, max_length]
750
  n_ids.set_shape([None, self.neg_samples, self.max_length]) # [batch_size, neg_samples, max_length]
751
 
752
  return q_ids, p_ids, n_ids
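The tf.py_function pattern used here can be shown in isolation. A toy string-length "tokenizer" stands in for the HF tokenizer; the two key points are that the wrapped function runs eagerly inside the graph, and that static shapes must be restored with set_shape afterwards:

import tensorflow as tf

def _lengths_py(q, p):
    # Runs eagerly: decode tf.string tensors to Python strings, return int32 tensors.
    q_len = [len(s.decode("utf-8")) for s in q.numpy()]
    p_len = [len(s.decode("utf-8")) for s in p.numpy()]
    return tf.constant(q_len, tf.int32), tf.constant(p_len, tf.int32)

def tokenize_pair(q, p):
    q_ids, p_ids = tf.py_function(func=_lengths_py, inp=[q, p], Tout=[tf.int32, tf.int32])
    q_ids.set_shape([None])   # py_function loses shape info; restore it manually
    p_ids.set_shape([None])
    return q_ids, p_ids

ds = tf.data.Dataset.from_tensor_slices((["hi there", "hello"], ["yes", "no thanks"])).batch(2)
ds = ds.map(tokenize_pair)
for q_ids, p_ids in ds:
    print(q_ids.numpy(), p_ids.numpy())   # [8 5] [3 9]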
@@ -760,32 +719,30 @@ class TFDataPipeline:
760
  neg_samples: tf.Tensor
761
  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
762
  """
763
- Python function that:
764
- - Decodes each tf.string Tensor to a Python list of strings
765
- - Calls the HF tokenizer
766
- - Reshapes negatives
767
- - Returns np.array of int32s for (q_ids, p_ids, n_ids).
768
 
769
  q: shape [batch_size], p: shape [batch_size]
770
  n: shape [batch_size, neg_samples]
771
- max_len: scalar int
772
- neg_samples: scalar int
773
  """
774
- max_len = int(max_len.numpy()) # Convert to Python int
775
  neg_samples = int(neg_samples.numpy())
776
 
777
- # 1) Convert Tensors -> Python lists of strings
778
  q_list = [q_i.decode("utf-8") for q_i in q.numpy()] # shape [batch_size]
779
  p_list = [p_i.decode("utf-8") for p_i in p.numpy()] # shape [batch_size]
780
 
781
- # shape [batch_size, neg_samples], decode each row
782
  n_list = []
783
  for row in n.numpy():
784
  # row is shape [neg_samples], each is a tf.string
785
  decoded = [neg.decode("utf-8") for neg in row]
786
  n_list.append(decoded)
787
 
788
- # 2) Tokenize queries & positives
789
  q_enc = self.tokenizer(
790
  q_list,
791
  padding="max_length",
@@ -801,11 +758,11 @@ class TFDataPipeline:
801
  return_tensors="np"
802
  )
803
 
804
- # 3) Tokenize negatives
805
- # Flatten [batch_size, neg_samples] -> single list
806
  flattened_negatives = [neg for row in n_list for neg in row]
807
  if len(flattened_negatives) == 0:
808
- # No negatives at all: return a zero array
809
  n_ids = np.zeros((len(q_list), neg_samples, max_len), dtype=np.int32)
810
  else:
811
  n_enc = self.tokenizer(
@@ -815,11 +772,10 @@ class TFDataPipeline:
815
  max_length=max_len,
816
  return_tensors="np"
817
  )
818
- # shape [batch_size * neg_samples, max_len]
819
  n_input_ids = n_enc["input_ids"]
820
 
821
- # We want to reshape to [batch_size, neg_samples, max_len]
822
- # Handle cases where there might be fewer negatives
823
  batch_size = len(q_list)
824
  n_ids_list = []
825
  for i in range(batch_size):
@@ -827,7 +783,7 @@ class TFDataPipeline:
827
  end_idx = start_idx + neg_samples
828
  row_negs = n_input_ids[start_idx:end_idx]
829
 
830
- # If fewer negatives, pad with zeros
831
  if row_negs.shape[0] < neg_samples:
832
  deficit = neg_samples - row_negs.shape[0]
833
  pad_arr = np.zeros((deficit, max_len), dtype=np.int32)
@@ -835,10 +791,10 @@ class TFDataPipeline:
835
 
836
  n_ids_list.append(row_negs)
837
 
838
- # stack them -> shape [batch_size, neg_samples, max_len]
839
  n_ids = np.stack(n_ids_list, axis=0)
840
 
841
- # 4) Return as np.int32 arrays
842
  q_ids = q_enc["input_ids"].astype(np.int32) # shape [batch_size, max_len]
843
  p_ids = p_enc["input_ids"].astype(np.int32) # shape [batch_size, max_len]
844
  n_ids = n_ids.astype(np.int32) # shape [batch_size, neg_samples, max_len]
 
4
  import tensorflow as tf
5
  import h5py
6
  import math
7
+ import random
8
+ import gc
9
  from tqdm import tqdm
10
  import json
11
  from pathlib import Path
 
48
  self.max_batch_size = 16 if len(response_pool) < 100 else 64
49
  self.max_retries = max_retries
50
 
51
+ # Build text -> domain map for O(1) domain lookups (hard negative sampling)
52
  self._text_domain_map = {}
53
  self.build_text_to_domain_map()
54
+
55
+ # Initialize FAISS index
56
  if os.path.exists(index_file_path):
57
  logger.info(f"Loading existing FAISS index from {index_file_path}...")
58
  self.index = faiss.read_index(index_file_path)
59
  self.validate_faiss_index()
60
  logger.info("FAISS index loaded and validated successfully.")
61
  else:
 
62
  dimension = self.encoder.config.embedding_dim
63
  self.index = faiss.IndexFlatIP(dimension)
64
  logger.info(f"Initialized FAISS IndexFlatIP with dimension {dimension}.")
65
 
66
  if not self.index.is_trained:
67
+ # Train the index if it isn't trained yet. IndexFlatIP needs no training, but other index types do (future switch to IndexIVFFlat).
68
  dimension = self.query_embeddings_cache[next(iter(self.query_embeddings_cache))].shape[0]
69
  self.index.train(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
70
  self.index.add(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
71
+
72
  def save_embeddings_cache_hdf5(self, cache_file_path: str):
73
+ """Save embeddings cache to HDF5 file."""
74
  with h5py.File(cache_file_path, 'w') as hf:
75
  for query, emb in self.query_embeddings_cache.items():
76
  hf.create_dataset(query, data=emb)
77
  logger.info(f"Embeddings cache saved to {cache_file_path}.")
78
+
79
  def load_embeddings_cache_hdf5(self, cache_file_path: str):
80
+ """Load embeddings cache from HDF5 file."""
81
  with h5py.File(cache_file_path, 'r') as hf:
82
  for query in hf.keys():
83
  self.query_embeddings_cache[query] = hf[query][:]
84
  logger.info(f"Embeddings cache loaded from {cache_file_path}.")
85
+
86
  def save_faiss_index(self, index_file_path: str):
87
  faiss.write_index(self.index, index_file_path)
88
  logger.info(f"FAISS index saved to {index_file_path}")
89
 
90
  def load_faiss_index(self, index_file_path: str):
91
+ """Load FAISS index from specified file path."""
92
  if os.path.exists(index_file_path):
93
  self.index = faiss.read_index(index_file_path)
94
  logger.info(f"FAISS index loaded from {index_file_path}.")
 
97
  raise FileNotFoundError(f"FAISS index file not found at {index_file_path}.")
98
 
99
  def validate_faiss_index(self):
100
+ """Validates FAISS index dimensionality."""
101
  expected_dim = self.encoder.config.embedding_dim
102
  if self.index.d != expected_dim:
103
  logger.error(f"FAISS index dimension {self.index.d} does not match encoder embedding dimension {expected_dim}.")
 
116
  def load_json_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
117
  """
118
  Load training data from a JSON file.
 
119
  Args:
120
  data_path (Union[str, Path]): Path to the JSON file containing dialogues.
121
  debug_samples (Optional[int]): Number of samples to load for debugging.
 
138
 
139
  logger.info(f"Loaded {len(dialogues)} dialogues.")
140
  return dialogues
141
+
142
  def collect_responses_with_domain(self, dialogues: List[dict]) -> List[Dict[str, str]]:
143
  """
144
+ Extract unique assistant responses and their domains from dialogues.
145
+ Returns a list of dicts: [{"domain": str, "text": str}, ...]
146
  """
147
+ response_set = set() # Store (domain, text) unique tuples
148
  results = []
149
+
150
  for dialogue in tqdm(dialogues, desc="Processing Dialogues", unit="dialogue"):
 
151
  domain = dialogue.get('domain', 'other')
152
  turns = dialogue.get('turns', [])
153
  for turn in turns:
 
155
  text = turn.get('text', '').strip()
156
  if speaker == 'assistant' and text:
157
  if len(text) <= self.max_length:
158
+ # Use tuple as set key to ensure uniqueness
159
  key = (domain, text)
160
  if key not in response_set:
161
  response_set.add(key)
 
163
  "domain": domain,
164
  "text": text
165
  })
166
+
167
  logger.info(f"Collected {len(results)} unique assistant responses from dialogues.")
168
  return results
 
169
 
170
  def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
171
  """Extract query-response pairs from a dialogue."""
 
189
 
190
  def compute_and_index_response_embeddings(self):
191
  """
192
+ Compute embeddings for the response pool and add them to the FAISS index.
193
+ self.response_pool: List[Dict[str, str]] with keys "domain" and "text".
194
  """
195
  logger.info("Computing embeddings for the response pool...")
196
+
197
+ # Extract the assistant text
198
  texts = [resp["text"] for resp in self.response_pool]
199
  logger.debug(f"Total texts to embed: {len(texts)}")
200
 
201
  batch_size = getattr(self, 'embedding_batch_size', 64)
202
  embeddings = []
203
+
204
  with tqdm(total=len(texts), desc="Computing Embeddings", unit="response") as pbar:
205
  for i in range(0, len(texts), batch_size):
206
  batch_texts = texts[i:i+batch_size]
 
212
  return_tensors='tf'
213
  )
214
  batch_embeds = self.encoder(encodings['input_ids'], training=False).numpy()
215
+
216
  embeddings.append(batch_embeds)
217
  pbar.update(len(batch_texts))
218
+
219
  # Combine embeddings and add to FAISS
220
  all_embeddings = np.vstack(embeddings).astype(np.float32)
221
  logger.info(f"Adding {len(all_embeddings)} response embeddings to FAISS index...")
222
  self.index.add(all_embeddings)
223
 
224
+ # Store in memory
225
  self.response_embeddings = all_embeddings
226
  logger.info(f"FAISS index now has {self.index.ntotal} vectors.")
227
 
228
+ def _find_hard_negatives(self, queries: List[str], positives: List[str], batch_size: int = 128) -> List[List[str]]:
229
  """
230
  Find hard negatives for a batch of queries using FAISS search.
231
+ Falls back to in-domain random negatives, then any-domain random negatives when needed.
 
232
  """
 
 
 
233
  retry_count = 0
234
  total_responses = len(self.response_pool)
235
+
 
 
236
  while retry_count < self.max_retries:
237
  try:
238
+ # Build query embeddings from the cache
239
  query_embeddings = []
240
  for i in range(0, len(queries), batch_size):
241
  sub_queries = queries[i : i + batch_size]
 
243
  sub_embeds = np.vstack(sub_embeds).astype(np.float32)
244
  faiss.normalize_L2(sub_embeds) # If not already normalized
245
  query_embeddings.append(sub_embeds)
246
+
247
  query_embeddings = np.vstack(query_embeddings)
248
  query_embeddings = np.ascontiguousarray(query_embeddings)
249
+
250
+ # FAISS search for nearest neighbors (hard negatives)
251
+ distances, indices = self.index.search(query_embeddings, self.neg_samples)
252
+
253
  all_negatives = []
254
+ # Extract domain from the positive assistant response
255
  for query_indices, query_text, pos_text in zip(indices, queries, positives):
256
  negative_list = []
257
+
258
+ # Build a 'seen' set with the positive
259
  seen = {pos_text.strip()}
260
+
 
261
  domain_of_positive = self._detect_domain_for_text(pos_text)
262
+
263
+ # Collect hard negatives (from config self.neg_samples)
264
  for idx in query_indices:
265
  if 0 <= idx < total_responses:
266
  candidate_dict = self.response_pool[idx] # e.g. {domain, text}
 
270
  negative_list.append(candidate_text)
271
  if len(negative_list) >= self.neg_samples:
272
  break
273
+
274
+ # Fall back to random domain-based
275
  if len(negative_list) < self.neg_samples:
276
  needed = self.neg_samples - len(negative_list)
277
+
278
  random_negatives = self._get_random_negatives(needed, seen, domain=domain_of_positive)
279
  negative_list.extend(random_negatives)
280
+
281
  all_negatives.append(negative_list)
282
+
283
  return all_negatives
284
+
285
  except KeyError as ke:
286
  retry_count += 1
287
  logger.warning(f"Hard negative search attempt {retry_count} failed due to missing embeddings: {ke}")
 
291
  gc.collect()
292
  if tf.config.list_physical_devices('GPU'):
293
  tf.keras.backend.clear_session()
294
+
295
  except Exception as e:
296
  retry_count += 1
297
  logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
 
301
  gc.collect()
302
  if tf.config.list_physical_devices('GPU'):
303
  tf.keras.backend.clear_session()
304
+
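# Minimal sketch of the hard-negative filtering idea above (assumptions: `index` is
# a FAISS index over `pool`, `query_vec` is a normalized float32 vector, and the
# positive response must never be returned as a negative).
def pick_hard_negatives(index, pool, query_vec, positive, neg_samples=3):
    _, idxs = index.search(query_vec.reshape(1, -1), neg_samples * 5)
    seen = {positive.strip()}
    negatives = []
    for idx in idxs[0]:
        if idx < 0:
            continue
        text = pool[idx]["text"].strip()
        if text and text not in seen:
            negatives.append(text)
            seen.add(text)
        if len(negatives) >= neg_samples:
            break
    return negatives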
305
  def _detect_domain_for_text(self, text: str) -> Optional[str]:
306
  """
307
+ Look up the domain of a response text, used for domain-aware negative sampling.
 
308
  """
309
  stripped_text = text.strip()
310
  return self._text_domain_map.get(stripped_text, None)
311
 
312
  def _get_random_negatives(self, needed: int, seen: set, domain: Optional[str] = None) -> List[str]:
313
  """
314
+ Return random negative texts, preferring the given domain and falling back to any domain when too few are available.
 
315
  """
316
+ # Filter response_pool for domain
317
  if domain:
318
  domain_texts = [r["text"] for r in self.response_pool if r["domain"] == domain]
319
  # fallback to entire set if insufficient domain_texts
320
+ if len(domain_texts) < needed * 2:
321
  domain_texts = [r["text"] for r in self.response_pool]
322
  else:
323
  domain_texts = [r["text"] for r in self.response_pool]
324
+
325
  negatives = []
326
  tries = 0
327
  max_tries = needed * 10
 
331
  if candidate and candidate not in seen:
332
  negatives.append(candidate)
333
  seen.add(candidate)
334
+
 
335
  if len(negatives) < needed:
336
  logger.warning(f"Could not find enough domain-based random negatives; needed {needed}, got {len(negatives)}.")
337
 
 
347
  all_negatives = []
348
 
349
  for pos_text in positives:
350
+ # Build a 'seen' set with the positive assistant response
351
  seen = {pos_text.strip()}
352
+
353
+ # Detect domain of the positive
354
  domain_of_positive = self._detect_domain_for_text(pos_text)
355
+
356
+ # Use domain-based negatives when available
357
  negs = self._get_random_negatives(self.neg_samples, seen, domain=domain_of_positive)
358
  all_negatives.append(negs)
359
+
360
  return all_negatives
361
 
362
  def build_text_to_domain_map(self):
363
  """
364
+ Build O(1) lookup dict: text -> domain for hard negative sampling.
 
365
  """
366
  self._text_domain_map = {}
367
+
368
  for item in self.response_pool:
 
369
  stripped_text = item["text"].strip()
370
  domain = item["domain"]
371
+
 
 
372
  if stripped_text in self._text_domain_map:
373
+ #existing_domain = self._text_domain_map[stripped_text]
374
+ #if existing_domain != domain:
375
+ # Collision detected. Using first found domain for now.
376
+ # This happens often with low-signal responses such as "ok" or "yes".
377
+ # logger.warning(
378
+ # f"Collision detected: text '{stripped_text}' found with domains "
379
+ # f"'{existing_domain}' and '{domain}'. Keeping the first."
380
+ # )
381
  # By default, keep the first domain or overwrite. We'll skip overwriting:
382
  continue
383
  else:
384
  # Insert into the dict
385
  self._text_domain_map[stripped_text] = domain
386
+
387
+ logger.info(f"Built text -> domain map with {len(self._text_domain_map)} unique text entries.")
388
 
389
  def encode_query(
390
  self,
 
397
  Args:
398
  query: The user query.
399
  context: Optional conversation history as a list of (user_text, assistant_text).
 
400
  Returns:
401
  np.ndarray of shape [embedding_dim], typically L2-normalized already.
402
  """
403
+ # Prepare context: concatenate user/assistant turns
404
  if context:
405
  # Take the last N turns
406
  relevant_history = context[-self.config.max_context_turns:]
 
412
  )
413
  context_str = " ".join(context_str_parts)
414
 
415
+ # Append the new query
416
  full_query = (
417
  f"{context_str} "
418
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
419
  )
420
  else:
421
+ # Single user turn
422
  full_query = (
423
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
424
  )
425
 
426
+ # Tokenize
427
  encodings = self.tokenizer(
428
  [full_query],
429
  padding='max_length',
 
433
  )
434
  input_ids = encodings['input_ids']
435
 
436
+ # Debug out-of-vocab IDs
437
  max_id = np.max(input_ids)
438
  vocab_size = len(self.tokenizer)
439
  if max_id >= vocab_size:
440
  logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
441
  raise ValueError("Token ID exceeds vocabulary size.")
442
 
443
+ # Get embeddings from the model. These are already L2-normalized by the model's final layer.
444
  embeddings = self.encoder(input_ids, training=False).numpy()
445
+
 
 
446
  return embeddings[0]
447
+
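# Sketch of the context-assembly pattern above (assumes the tokenizer was built with
# the additional special tokens "<USER>" and "<ASSISTANT>"; `history` is a
# hypothetical list of (user_text, assistant_text) tuples).
def build_query_string(history, query, max_turns=3):
    parts = []
    for user_text, assistant_text in history[-max_turns:]:
        parts.append(f"<USER> {user_text}")
        parts.append(f"<ASSISTANT> {assistant_text}")
    parts.append(f"<USER> {query}")
    return " ".join(parts)

# build_query_string([("hi", "hello!")], "book a flight")
# -> '<USER> hi <ASSISTANT> hello! <USER> book a flight'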
448
  def encode_responses(
449
  self,
450
  responses: List[str],
 
452
  ) -> np.ndarray:
453
  """
454
  Encode multiple response texts into embedding vectors.
 
455
  Args:
456
+ responses: List of assistant responses.
457
  context: Optional conversation context (last N turns).
 
458
  Returns:
459
  np.ndarray of shape [num_responses, embedding_dim].
460
  """
461
+ # Optionally incorporate conversation context into the response encoding (benefit of this is still undecided)
 
462
  if context:
463
  relevant_history = context[-self.config.max_context_turns:]
464
  prepared = []
 
470
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {a_text}"
471
  )
472
  context_str = " ".join(context_str_parts)
473
+
474
+ # Treat resp as an assistant turn
475
  full_resp = (
476
  f"{context_str} "
477
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {resp}"
478
  )
479
  prepared.append(full_resp)
480
  else:
481
+ # Single response from the assistant
482
  prepared = [
483
  f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
484
  for r in responses
485
  ]
486
+
487
+ # Tokenize
488
  encodings = self.tokenizer(
489
  prepared,
490
  padding='max_length',
 
493
  return_tensors='np'
494
  )
495
  input_ids = encodings['input_ids']
496
+
497
+ # Debug for out-of-vocab
498
  max_id = np.max(input_ids)
499
  vocab_size = len(self.tokenizer)
500
  if max_id >= vocab_size:
501
  logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
502
  raise ValueError("Token ID exceeds vocabulary size.")
503
+
504
+ # Get embeddings from the model. These are already L2-normalized by the model's final layer.
505
  embeddings = self.encoder(input_ids, training=False).numpy()
506
+
 
507
  return embeddings.astype('float32')
508
 
509
  def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
510
  """
511
+ Batch-process dialogues and save them to a TFRecord file.
 
 
 
 
 
512
  """
513
  logger.info(f"Preparing and saving data to {tf_record_path}...")
514
 
 
516
  num_batches = math.ceil(num_dialogues / batch_size)
517
 
518
  with tf.io.TFRecordWriter(tf_record_path) as writer:
 
519
  with tqdm(total=num_batches, desc="Preparing Data Batches", unit="batch") as pbar:
520
  for i in range(num_batches):
521
  start_idx = i * batch_size
522
  end_idx = min(start_idx + batch_size, num_dialogues)
523
  batch_dialogues = dialogues[start_idx:end_idx]
524
 
525
+ # Extract query-positive pairs for the batch
526
  queries = []
527
  positives = []
528
  for dialogue in batch_dialogues:
 
534
 
535
  if not queries:
536
  pbar.update(1)
537
+ continue
538
 
539
  # Compute and cache query embeddings
540
  try:
 
542
  except Exception as e:
543
  logger.error(f"Error computing embeddings: {e}")
544
  pbar.update(1)
545
+ continue
546
 
547
+ # Find hard negatives
548
  try:
549
+ hard_negatives = self._find_hard_negatives(queries, positives)
550
  except Exception as e:
551
  logger.error(f"Error finding hard negatives: {e}")
552
  pbar.update(1)
 
573
  pbar.update(1)
574
  continue # Skip to the next batch
575
 
576
+ # Flatten hard_negatives while maintaining query alignment.
577
+ # hard_negatives is a list of lists; each sublist corresponds to one query.
578
  try:
579
  flattened_negatives = [neg for sublist in hard_negatives for neg in sublist]
580
  encoded_negatives = self.tokenizer.batch_encode_plus(
 
585
  return_tensors='tf'
586
  )
587
 
588
+ # Reshape to [num_queries, num_negatives, max_length]
589
  num_negatives = self.config.neg_samples
590
  reshaped_negatives = encoded_negatives['input_ids'].numpy().reshape(-1, num_negatives, self.config.max_context_token_limit)
591
  except Exception as e:
592
  logger.error(f"Error during negatives tokenization: {e}")
593
  pbar.update(1)
594
+ continue
595
 
596
+ # Serialize and write to TFRecord
597
  for j in range(len(queries)):
598
  try:
599
  q_id = encoded_queries['input_ids'][j].numpy()
 
617
  logger.info(f"Data preparation complete. TFRecord saved.")
618
 
619
  def _compute_embeddings(self, queries: List[str]) -> None:
620
+ """
621
+ Compute embeddings for new queries and update the cache.
622
+ """
623
  new_queries = [q for q in queries if q not in self.query_embeddings_cache]
624
  if not new_queries:
625
+ return
626
+
627
+ # Compute embeddings
628
  new_embeddings = []
629
  for i in range(0, len(new_queries), self.embedding_batch_size):
630
  batch_queries = new_queries[i:i + self.embedding_batch_size]
 
638
  batch_embeddings = self.encoder(encoded['input_ids'], training=False).numpy()
639
  faiss.normalize_L2(batch_embeddings)
640
  new_embeddings.extend(batch_embeddings)
641
+
642
  # Update the cache
643
  for query, emb in zip(new_queries, new_embeddings):
644
  self.query_embeddings_cache[query] = emb
645
+
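# Sketch of the query-embedding cache idea above; `encode_fn` is a hypothetical
# callable that maps a list of strings to a list of vectors.
query_embeddings_cache = {}

def cached_embed(queries, encode_fn):
    new = [q for q in queries if q not in query_embeddings_cache]
    if new:
        for q, vec in zip(new, encode_fn(new)):
            query_embeddings_cache[q] = vec
    return [query_embeddings_cache[q] for q in queries]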
646
  def data_generator(self, dialogues: List[dict]) -> Generator[Tuple[str, str, List[str]], None, None]:
647
  """
648
+ Generate training examples: (query, positive, [hard_negatives]).
 
649
  """
650
  total_dialogues = len(dialogues)
651
  logger.debug(f"Total dialogues to process: {total_dialogues}")
652
+
 
653
  with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
654
  for dialogue in dialogues:
655
  pairs = self._extract_pairs_from_dialogue(dialogue)
656
  for query, positive in pairs:
657
  # Ensure embeddings are computed, find hard negatives, etc.
658
  self._compute_embeddings([query])
659
+ hard_negatives = self._find_hard_negatives([query], [positive])[0]
660
  yield (query, positive, hard_negatives)
661
  pbar.update(1)
662
 
663
  def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
664
  """
665
+ Creates a tf.data.Dataset for streaming training.
666
+ Yields (input_ids_query, input_ids_positive, input_ids_negatives).
667
  """
668
  # 1) Start with a generator dataset
669
  dataset = tf.data.Dataset.from_generator(
670
  lambda: self.data_generator(dialogues),
671
  output_signature=(
672
+ tf.TensorSpec(shape=(), dtype=tf.string), # Query (single string)
673
+ tf.TensorSpec(shape=(), dtype=tf.string), # Positive (single string)
674
+ tf.TensorSpec(shape=(self.neg_samples,), dtype=tf.string) # Hard Negatives (list of strings)
675
  )
676
  )
677
 
678
+ # Batch the raw strings, then map through a tokenize step
679
+ # Note: the DistilBERT tokenizer threw an error when using tf.data.AUTOTUNE.
680
  dataset = dataset.batch(batch_size, drop_remainder=True)
 
 
681
  dataset = dataset.map(
682
  lambda q, p, n: self._tokenize_triple(q, p, n),
683
  num_parallel_calls=1 #tf.data.AUTOTUNE
 
693
  n: tf.Tensor
694
  ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
695
  """
696
+ Wraps a Python function: converts tf.Tensors of strings -> Python lists of strings -> HF tokenizer -> Tensors of IDs.
697
+ q is shape [batch_size], p is shape [batch_size], n is shape [batch_size, neg_samples] (list of negatives).
 
 
 
698
  """
699
+ # Use tf.py_function, limit parallelism
700
  q_ids, p_ids, n_ids = tf.py_function(
701
  func=self._tokenize_triple_py,
702
  inp=[q, p, n, tf.constant(self.max_length), tf.constant(self.neg_samples)],
703
  Tout=[tf.int32, tf.int32, tf.int32]
704
  )
705
 
706
+ # Set shape info for the output tensors
707
+ q_ids.set_shape([None, self.max_length]) # [batch_size, max_length]
708
+ p_ids.set_shape([None, self.max_length]) # [batch_size, max_length]
709
  n_ids.set_shape([None, self.neg_samples, self.max_length]) # [batch_size, neg_samples, max_length]
710
 
711
  return q_ids, p_ids, n_ids
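# Toy illustration of the tf.py_function + set_shape pattern used above: py_function
# erases static shape information, so the shape is reattached afterwards.
import tensorflow as tf

def _double_py(x):
    return x.numpy() * 2

def double(x):
    y = tf.py_function(func=_double_py, inp=[x], Tout=tf.int32)
    y.set_shape(x.shape)  # restore the shape lost by py_function
    return y

# double(tf.constant([1, 2, 3], dtype=tf.int32)) -> [2, 4, 6]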
 
719
  neg_samples: tf.Tensor
720
  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
721
  """
722
+ Decodes tf.string Tensors to Python List[str], then tokenizes.
723
+ Reshapes negatives to [batch_size, neg_samples, max_length].
724
+ Returns np.array(int32) for (q_ids, p_ids, n_ids).
 
 
725
 
726
  q: shape [batch_size], p: shape [batch_size]
727
  n: shape [batch_size, neg_samples]
728
+ max_len: int
729
+ neg_samples: int
730
  """
731
+ max_len = int(max_len.numpy())
732
  neg_samples = int(neg_samples.numpy())
733
 
734
+ # Convert Tensors -> Python List[str]
735
  q_list = [q_i.decode("utf-8") for q_i in q.numpy()] # shape [batch_size]
736
  p_list = [p_i.decode("utf-8") for p_i in p.numpy()] # shape [batch_size]
737
 
738
+ # Shape [batch_size, neg_samples], decode each row
739
  n_list = []
740
  for row in n.numpy():
741
  # row is shape [neg_samples], each is a tf.string
742
  decoded = [neg.decode("utf-8") for neg in row]
743
  n_list.append(decoded)
744
 
745
+ # Tokenize queries & positives
746
  q_enc = self.tokenizer(
747
  q_list,
748
  padding="max_length",
 
758
  return_tensors="np"
759
  )
760
 
761
+ # Tokenize negatives
762
+ # Flatten [batch_size, neg_samples] -> List
763
  flattened_negatives = [neg for row in n_list for neg in row]
764
  if len(flattened_negatives) == 0:
765
+ # No negatives: return a zero array
766
  n_ids = np.zeros((len(q_list), neg_samples, max_len), dtype=np.int32)
767
  else:
768
  n_enc = self.tokenizer(
 
772
  max_length=max_len,
773
  return_tensors="np"
774
  )
775
+ # Shape [batch_size * neg_samples, max_len]
776
  n_input_ids = n_enc["input_ids"]
777
 
778
+ # Reshape to [batch_size, neg_samples, max_len]
 
779
  batch_size = len(q_list)
780
  n_ids_list = []
781
  for i in range(batch_size):
 
783
  end_idx = start_idx + neg_samples
784
  row_negs = n_input_ids[start_idx:end_idx]
785
 
786
+ # Pad with zeros if not enough negatives
787
  if row_negs.shape[0] < neg_samples:
788
  deficit = neg_samples - row_negs.shape[0]
789
  pad_arr = np.zeros((deficit, max_len), dtype=np.int32)
 
791
 
792
  n_ids_list.append(row_negs)
793
 
794
+ # Stack shape [batch_size, neg_samples, max_len]
795
  n_ids = np.stack(n_ids_list, axis=0)
796
 
797
+ # Return np.int32 arrays
798
  q_ids = q_enc["input_ids"].astype(np.int32) # shape [batch_size, max_len]
799
  p_ids = p_enc["input_ids"].astype(np.int32) # shape [batch_size, max_len]
800
  n_ids = n_ids.astype(np.int32) # shape [batch_size, neg_samples, max_len]
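# Quick numeric check of the reshape logic above (standalone, illustrative):
# a flat [batch_size * neg_samples, max_len] block regains its per-query grouping.
import numpy as np

flat = np.arange(2 * 3 * 4, dtype=np.int32).reshape(6, 4)  # batch=2, negs=3, len=4
grouped = flat.reshape(2, 3, 4)
assert (grouped[1, 0] == flat[3]).all()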
train_model.py CHANGED
@@ -14,10 +14,10 @@ def inspect_tfrecord(tfrecord_file_path, num_examples=3):
14
  'negative_ids': tf.io.FixedLenFeature([3 * 512], tf.int64), # Adjust neg_samples if different
15
  }
16
  return tf.io.parse_single_example(example_proto, feature_description)
17
-
18
  dataset = tf.data.TFRecordDataset(tfrecord_file_path)
19
  dataset = dataset.map(parse_example)
20
-
21
  for i, example in enumerate(dataset.take(num_examples)):
22
  print(f"Example {i+1}:")
23
  print(f"Query IDs: {example['query_ids'].numpy()}")
@@ -26,29 +26,27 @@ def inspect_tfrecord(tfrecord_file_path, num_examples=3):
26
  print("-" * 50)
27
 
28
  def main():
 
29
 
30
- # Quick test to inspect TFRecord
31
  # inspect_tfrecord('training_data/training_data.tfrecord', num_examples=3)
32
 
33
- # Initialize environment
34
- tf.keras.backend.clear_session()
35
  env = EnvironmentSetup()
36
  env.initialize()
37
 
38
- # Training configuration
39
  EPOCHS = 20
40
  TF_RECORD_FILE_PATH = 'training_data/training_data.tfrecord'
41
  CHECKPOINT_DIR = 'checkpoints/'
42
- # Optimize batch size for Colab
43
- batch_size = 32 # env.optimize_batch_size(base_batch_size=16)
44
 
45
- # Initialize config
46
- config = ChatbotConfig()
47
 
48
- # Initialize chatbot
 
49
  chatbot = RetrievalChatbot(config, mode='training')
50
 
51
- # Check for existing checkpoint and get initial epoch
52
  latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
53
  initial_epoch = 0
54
  if latest_checkpoint:
@@ -60,7 +58,7 @@ def main():
60
  logger.error(f"Failed to parse checkpoint number from {latest_checkpoint}")
61
  initial_epoch = 0
62
 
63
- # Train the model
64
  chatbot.train_model(
65
  tfrecord_file_path=TF_RECORD_FILE_PATH,
66
  epochs=EPOCHS,
@@ -71,13 +69,13 @@ def main():
71
  initial_epoch=initial_epoch
72
  )
73
 
74
- # Save final model
75
  model_save_path = env.training_dirs['base'] / 'final_model'
76
  chatbot.save_models(model_save_path)
77
 
78
- # Plot and save training history
79
  plotter = Plotter(save_dir=env.training_dirs['plots'])
80
  plotter.plot_training_history(chatbot.history)
81
-
82
  if __name__ == "__main__":
83
  main()
 
14
  'negative_ids': tf.io.FixedLenFeature([3 * 512], tf.int64), # Adjust neg_samples if different
15
  }
16
  return tf.io.parse_single_example(example_proto, feature_description)
17
+
18
  dataset = tf.data.TFRecordDataset(tfrecord_file_path)
19
  dataset = dataset.map(parse_example)
20
+
21
  for i, example in enumerate(dataset.take(num_examples)):
22
  print(f"Example {i+1}:")
23
  print(f"Query IDs: {example['query_ids'].numpy()}")
 
26
  print("-" * 50)
27
 
28
  def main():
29
+ tf.keras.backend.clear_session()
30
 
31
+ # Optional: inspect the TFRecord
32
  # inspect_tfrecord('training_data/training_data.tfrecord', num_examples=3)
33
 
34
+ # Init env
 
35
  env = EnvironmentSetup()
36
  env.initialize()
37
 
38
+ # Training config
39
  EPOCHS = 20
40
  TF_RECORD_FILE_PATH = 'training_data/training_data.tfrecord'
41
  CHECKPOINT_DIR = 'checkpoints/'
 
 
42
 
43
+ batch_size = 32
 
44
 
45
+ # Initialize config and chatbot model
46
+ config = ChatbotConfig()
47
  chatbot = RetrievalChatbot(config, mode='training')
48
 
49
+ # Check for existing checkpoint
50
  latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
51
  initial_epoch = 0
52
  if latest_checkpoint:
 
58
  logger.error(f"Failed to parse checkpoint number from {latest_checkpoint}")
59
  initial_epoch = 0
60
 
61
+ # Train
62
  chatbot.train_model(
63
  tfrecord_file_path=TF_RECORD_FILE_PATH,
64
  epochs=EPOCHS,
 
69
  initial_epoch=initial_epoch
70
  )
71
 
72
+ # Save
73
  model_save_path = env.training_dirs['base'] / 'final_model'
74
  chatbot.save_models(model_save_path)
75
 
76
+ # Plot
77
  plotter = Plotter(save_dir=env.training_dirs['plots'])
78
  plotter.plot_training_history(chatbot.history)
79
+
80
  if __name__ == "__main__":
81
  main()
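# Hypothetical helper for resuming training (assumption: checkpoints are named like
# "ckpt-<number>"; the real naming scheme is not visible in this diff).
import re

def epoch_from_checkpoint(path):
    match = re.search(r"ckpt-(\d+)$", path or "")
    return int(match.group(1)) if match else 0

# epoch_from_checkpoint("checkpoints/ckpt-7") -> 7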
validate_model.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
  import json
3
-
4
  from chatbot_model import ChatbotConfig, RetrievalChatbot
5
  from response_quality_checker import ResponseQualityChecker
6
  from chatbot_validator import ChatbotValidator
@@ -18,20 +17,20 @@ def run_interactive_chat(chatbot, quality_checker):
18
  except (KeyboardInterrupt, EOFError):
19
  print("\nAssistant: Goodbye!")
20
  break
21
-
22
  if user_input.lower() in ["quit", "exit", "bye"]:
23
  print("Assistant: Goodbye!")
24
  break
25
-
26
  response, candidates, metrics = chatbot.chat(
27
  query=user_input,
28
  conversation_history=None,
29
  quality_checker=quality_checker,
30
  top_k=10
31
  )
32
-
33
  print(f"Assistant: {response}")
34
-
35
  # Show alternative responses if confident
36
  if metrics.get("is_confident", False):
37
  print("\nAlternative responses:")
@@ -39,17 +38,17 @@ def run_interactive_chat(chatbot, quality_checker):
39
  print(f"Score: {score:.4f} - {resp}")
40
  else:
41
  print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")
42
-
43
  def validate_chatbot():
44
  # Initialize environment
45
  env = EnvironmentSetup()
46
  env.initialize()
47
-
48
  MODEL_DIR = "new_iteration/data_prep_iterative_models"
49
  FAISS_INDICES_DIR = os.path.join(MODEL_DIR, "faiss_indices")
50
  FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_production.index")
51
  FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_test.index")
52
-
53
  # Toggle 'production' or 'test' env
54
  ENVIRONMENT = "production"
55
  if ENVIRONMENT == "test":
@@ -58,7 +57,7 @@ def validate_chatbot():
58
  else:
59
  FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
60
  RESPONSE_POOL_PATH = FAISS_INDEX_PRODUCTION_PATH.replace(".index", "_responses.json")
61
-
62
  # Load the config
63
  config_path = os.path.join(MODEL_DIR, "config.json")
64
  if os.path.exists(config_path):
@@ -69,50 +68,47 @@ def validate_chatbot():
69
  else:
70
  config = ChatbotConfig()
71
  logger.warning("No config.json found. Using default ChatbotConfig.")
72
-
73
- # Load RetrievalChatbot in 'inference' mode using the classmethod
74
  try:
75
  chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
76
  logger.info("RetrievalChatbot loaded in 'inference' mode successfully.")
77
  except Exception as e:
78
  logger.error(f"Failed to load RetrievalChatbot: {e}")
79
  return
80
-
81
  # Confirm FAISS index & response pool exist
82
  if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
83
  logger.error("FAISS index or response pool file is missing.")
84
  return
85
-
86
- # Load specific FAISS index and response pool
87
  try:
88
- # Even though load_model might auto-load an index, we override here with the specific file
89
  chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
90
  logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
91
-
92
- print("FAISS dimensions:", chatbot.data_pipeline.index.d)
93
- print("FAISS index type:", type(chatbot.data_pipeline.index))
94
- print("FAISS index total vectors:", chatbot.data_pipeline.index.ntotal)
95
- print("FAISS is_trained:", chatbot.data_pipeline.index.is_trained)
96
-
97
  with open(RESPONSE_POOL_PATH, "r", encoding="utf-8") as f:
98
  chatbot.data_pipeline.response_pool = json.load(f)
99
- logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
100
-
101
- print("\nTotal responses in pool:", len(chatbot.data_pipeline.response_pool))
102
-
103
  # Validate dimension consistency
104
  chatbot.data_pipeline.validate_faiss_index()
105
  logger.info("FAISS index and response pool validated successfully.")
106
-
107
  except Exception as e:
108
  logger.error(f"Failed to load or validate FAISS index: {e}")
109
  return
110
-
111
  # Init QualityChecker and Validator
112
  quality_checker = ResponseQualityChecker(data_pipeline=chatbot.data_pipeline)
113
  validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
114
  logger.info("ResponseQualityChecker and ChatbotValidator initialized.")
115
-
116
  # Run validation
117
  try:
118
  validation_metrics = validator.run_validation(num_examples=5)
@@ -120,7 +116,7 @@ def validate_chatbot():
120
  except Exception as e:
121
  logger.error(f"Validation process failed: {e}")
122
  return
123
-
124
  # Plot metrics
125
  # try:
126
  # plotter = Plotter(save_dir=env.training_dirs["plots"])
@@ -128,10 +124,10 @@ def validate_chatbot():
128
  # logger.info("Validation metrics plotted successfully.")
129
  # except Exception as e:
130
  # logger.error(f"Failed to plot validation metrics: {e}")
131
-
132
  # Run interactive chat loop
133
- # logger.info("\nStarting interactive chat session...")
134
- # run_interactive_chat(chatbot, quality_checker)
135
 
136
  if __name__ == "__main__":
137
  validate_chatbot()
 
1
  import os
2
  import json
 
3
  from chatbot_model import ChatbotConfig, RetrievalChatbot
4
  from response_quality_checker import ResponseQualityChecker
5
  from chatbot_validator import ChatbotValidator
 
17
  except (KeyboardInterrupt, EOFError):
18
  print("\nAssistant: Goodbye!")
19
  break
20
+
21
  if user_input.lower() in ["quit", "exit", "bye"]:
22
  print("Assistant: Goodbye!")
23
  break
24
+
25
  response, candidates, metrics = chatbot.chat(
26
  query=user_input,
27
  conversation_history=None,
28
  quality_checker=quality_checker,
29
  top_k=10
30
  )
31
+
32
  print(f"Assistant: {response}")
33
+
34
  # Show alternative responses if confident
35
  if metrics.get("is_confident", False):
36
  print("\nAlternative responses:")
 
38
  print(f"Score: {score:.4f} - {resp}")
39
  else:
40
  print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")
41
+
42
  def validate_chatbot():
43
  # Initialize environment
44
  env = EnvironmentSetup()
45
  env.initialize()
46
+
47
  MODEL_DIR = "new_iteration/data_prep_iterative_models"
48
  FAISS_INDICES_DIR = os.path.join(MODEL_DIR, "faiss_indices")
49
  FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_production.index")
50
  FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_test.index")
51
+
52
  # Toggle 'production' or 'test' env
53
  ENVIRONMENT = "production"
54
  if ENVIRONMENT == "test":
 
57
  else:
58
  FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
59
  RESPONSE_POOL_PATH = FAISS_INDEX_PRODUCTION_PATH.replace(".index", "_responses.json")
60
+
61
  # Load the config
62
  config_path = os.path.join(MODEL_DIR, "config.json")
63
  if os.path.exists(config_path):
 
68
  else:
69
  config = ChatbotConfig()
70
  logger.warning("No config.json found. Using default ChatbotConfig.")
71
+
72
+ # Load RetrievalChatbot in 'inference' mode
73
  try:
74
  chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
75
  logger.info("RetrievalChatbot loaded in 'inference' mode successfully.")
76
  except Exception as e:
77
  logger.error(f"Failed to load RetrievalChatbot: {e}")
78
  return
79
+
80
  # Confirm FAISS index & response pool exist
81
  if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
82
  logger.error("FAISS index or response pool file is missing.")
83
  return
84
+
85
+ # Load FAISS index and response pool
86
  try:
 
87
  chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
88
  logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
89
+ logger.info("FAISS dimensions:", chatbot.data_pipeline.index.d)
90
+ logger.info("FAISS index type:", type(chatbot.data_pipeline.index))
91
+ logger.info("FAISS index total vectors:", chatbot.data_pipeline.index.ntotal)
92
+ logger.info("FAISS is_trained:", chatbot.data_pipeline.index.is_trained)
93
+
 
94
  with open(RESPONSE_POOL_PATH, "r", encoding="utf-8") as f:
95
  chatbot.data_pipeline.response_pool = json.load(f)
96
+ logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
97
+ logger.info("\nTotal responses in pool:", len(chatbot.data_pipeline.response_pool))
98
+
 
99
  # Validate dimension consistency
100
  chatbot.data_pipeline.validate_faiss_index()
101
  logger.info("FAISS index and response pool validated successfully.")
102
+
103
  except Exception as e:
104
  logger.error(f"Failed to load or validate FAISS index: {e}")
105
  return
106
+
107
  # Init QualityChecker and Validator
108
  quality_checker = ResponseQualityChecker(data_pipeline=chatbot.data_pipeline)
109
  validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
110
  logger.info("ResponseQualityChecker and ChatbotValidator initialized.")
111
+
112
  # Run validation
113
  try:
114
  validation_metrics = validator.run_validation(num_examples=5)
 
116
  except Exception as e:
117
  logger.error(f"Validation process failed: {e}")
118
  return
119
+
120
  # Plot metrics
121
  # try:
122
  # plotter = Plotter(save_dir=env.training_dirs["plots"])
 
124
  # logger.info("Validation metrics plotted successfully.")
125
  # except Exception as e:
126
  # logger.error(f"Failed to plot validation metrics: {e}")
127
+
128
  # Run interactive chat loop
129
+ logger.info("\nStarting interactive chat session...")
130
+ run_interactive_chat(chatbot, quality_checker)
131
 
132
  if __name__ == "__main__":
133
  validate_chatbot()
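# Standalone sketch of the index/response-pool pairing used above; paths follow the
# "<name>.index" / "<name>_responses.json" convention from this script, and the
# ntotal check is an assumption about how the two files are kept in sync.
import json
import faiss

def load_retrieval_assets(index_path):
    index = faiss.read_index(index_path)
    pool_path = index_path.replace(".index", "_responses.json")
    with open(pool_path, "r", encoding="utf-8") as f:
        responses = json.load(f)
    assert index.ntotal == len(responses), "FAISS index and response pool are out of sync"
    return index, responses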