JoeArmani committed on
Commit 7a0020b · 1 Parent(s): d53c64b

updates - new iteration with type token

.gitignore CHANGED
@@ -180,4 +180,10 @@ cache/*
180
  !cache/.gitkeep
181
  training_data/*
182
  !training_data/.gitkeep
183
- augmented_dialogues.json
180
  !cache/.gitkeep
181
  training_data/*
182
  !training_data/.gitkeep
183
+ augmented_dialogues.json
184
+
185
+ checkpoints_old_REMOVE/*
186
+ new_iteration/cache/*
187
+ new_iteration/data_prep_iterative_models/*
188
+ new_iteration/training_data/*
189
+ new_iteration/processed_outputs/*
build_faiss_index.py ADDED
@@ -0,0 +1,161 @@
1
+ import os
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import faiss
6
+ import numpy as np
7
+ import tensorflow as tf
8
+ from transformers import AutoTokenizer, TFAutoModel
9
+ from tqdm.auto import tqdm
10
+
11
+ from chatbot_model import ChatbotConfig, EncoderModel
12
+ from tf_data_pipeline import TFDataPipeline
13
+ from logger_config import config_logger
14
+
15
+ logger = config_logger(__name__)
16
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
17
+
18
+ def sanity_check(encoder: EncoderModel, tokenizer: AutoTokenizer, config: ChatbotConfig):
19
+ """
20
+ Perform a quick sanity check to ensure the model is loaded correctly.
21
+ """
22
+ sample_response = "This is a test response."
23
+ encoded_sample = tokenizer(
24
+ [sample_response],
25
+ padding=True,
26
+ truncation=True,
27
+ max_length=config.max_context_token_limit,
28
+ return_tensors='tf'
29
+ )
30
+
31
+ # Get embedding
32
+ sample_embedding = encoder(encoded_sample['input_ids'], training=False).numpy()
33
+
34
+ # Check shape
35
+ if sample_embedding.shape[1] != config.embedding_dim:
36
+ logger.error(
37
+ f"Embedding dimension mismatch: Expected {config.embedding_dim}, "
38
+ f"got {sample_embedding.shape[1]}"
39
+ )
40
+ raise ValueError("Embedding dimension mismatch.")
41
+ else:
42
+ logger.info("Embedding dimension matches the configuration.")
43
+
44
+ # Check normalization
45
+ embedding_norm = np.linalg.norm(sample_embedding, axis=1)
46
+ if not np.allclose(embedding_norm, 1.0, atol=1e-5):
47
+ logger.error("Embeddings are not properly normalized.")
48
+ raise ValueError("Embeddings are not normalized.")
49
+ else:
50
+ logger.info("Embeddings are properly normalized.")
51
+
52
+ logger.info("Sanity check passed: Model loaded correctly and outputs are as expected.")
53
+
54
+ def build_faiss_index():
55
+ """
56
+ Rebuild the FAISS index by:
57
+ 1) Loading your config.json
58
+ 2) Initializing encoder + loading submodule & custom weights
59
+ 3) Loading tokenizer from disk
60
+ 4) Creating a TFDataPipeline
61
+ 5) Setting the pipeline's response_pool from a JSON file
62
+ 6) Using pipeline.compute_and_index_response_embeddings()
63
+ 7) Saving the FAISS index
64
+ """
65
+ # Directories
66
+ MODELS_DIR = Path("models")
67
+ FAISS_DIR = MODELS_DIR / "faiss_indices"
68
+ FAISS_INDEX_PATH = FAISS_DIR / "faiss_index_production.index"
69
+ RESPONSES_PATH = FAISS_DIR / "faiss_index_production_responses.json"
70
+ TOKENIZER_DIR = MODELS_DIR / "tokenizer"
71
+ SHARED_ENCODER_DIR = MODELS_DIR / "shared_encoder"
72
+ CUSTOM_WEIGHTS_PATH = MODELS_DIR / "encoder_custom_weights.weights.h5"
73
+
74
+ # 1) Load ChatbotConfig
75
+ config_path = MODELS_DIR / "config.json"
76
+ if config_path.exists():
77
+ with open(config_path, "r", encoding="utf-8") as f:
78
+ config_dict = json.load(f)
79
+ config = ChatbotConfig.from_dict(config_dict)
80
+ logger.info(f"Loaded ChatbotConfig from {config_path}")
81
+ else:
82
+ config = ChatbotConfig()
83
+ logger.warning(f"No config.json found at {config_path}. Using default ChatbotConfig.")
84
+
85
+ # 2) Initialize the EncoderModel
86
+ encoder = EncoderModel(config=config)
87
+ logger.info("EncoderModel instantiated (empty).")
88
+
89
+ # Overwrite the submodule from 'shared_encoder' directory
90
+ if SHARED_ENCODER_DIR.exists():
91
+ logger.info(f"Loading DistilBERT submodule from {SHARED_ENCODER_DIR}...")
92
+ encoder.pretrained = TFAutoModel.from_pretrained(str(SHARED_ENCODER_DIR))
93
+ logger.info("Loaded HF submodule into encoder.pretrained.")
94
+ else:
95
+ logger.warning(f"No shared_encoder directory at {SHARED_ENCODER_DIR}. Using default pretrained model.")
96
+
97
+ # Build model once, then load custom weights (projection, etc.)
98
+ dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
99
+ _ = encoder(dummy_input, training=False) # builds the layers
100
+
101
+ if CUSTOM_WEIGHTS_PATH.exists():
102
+ logger.info(f"Loading custom top-level weights from {CUSTOM_WEIGHTS_PATH}")
103
+ encoder.load_weights(str(CUSTOM_WEIGHTS_PATH))
104
+ logger.info("Custom top-level weights loaded successfully.")
105
+ else:
106
+ logger.warning(f"Custom weights file not found at {CUSTOM_WEIGHTS_PATH}.")
107
+
108
+ # 3) Load tokenizer
109
+ if TOKENIZER_DIR.exists():
110
+ logger.info(f"Loading tokenizer from {TOKENIZER_DIR}")
111
+ tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_DIR))
112
+ else:
113
+ logger.warning(f"No tokenizer dir at {TOKENIZER_DIR}, falling back to default HF tokenizer.")
114
+ tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
115
+ #tokenizer.add_special_tokens({'additional_special_tokens': ['<EMPTY_NEGATIVE>']})
116
+
117
+ # 4) Quick sanity check
118
+ sanity_check(encoder, tokenizer, config)
119
+
120
+ # 5) Prepare a TFDataPipeline
121
+ pipeline = TFDataPipeline(
122
+ config=config,
123
+ tokenizer=tokenizer,
124
+ encoder=encoder,
125
+ index_file_path=str(FAISS_INDEX_PATH),
126
+ response_pool=[],
127
+ max_length=config.max_context_token_limit,
128
+ query_embeddings_cache={},
129
+ neg_samples=config.neg_samples,
130
+ index_type='IndexFlatIP',
131
+ nlist=100,
132
+ max_retries=config.max_retries
133
+ )
134
+
135
+ # 6) Load the existing response pool
136
+ if not RESPONSES_PATH.exists():
137
+ logger.error(f"Response pool JSON file not found at {RESPONSES_PATH}")
138
+ raise FileNotFoundError(f"No response pool JSON at {RESPONSES_PATH}")
139
+
140
+ with open(RESPONSES_PATH, "r", encoding="utf-8") as f:
141
+ response_pool = json.load(f)
142
+ logger.info(f"Loaded {len(response_pool)} responses from {RESPONSES_PATH}")
143
+
144
+ pipeline.response_pool = response_pool # assign to pipeline
145
+
146
+ # 7) Build (or rebuild) the FAISS index from pipeline method
147
+ # This does all the compute-embeddings + index.add in one place
148
+ logger.info("Starting to compute and index response embeddings via TFDataPipeline...")
149
+ pipeline.compute_and_index_response_embeddings()
150
+
151
+ # 8) Save the rebuilt FAISS index
152
+ pipeline.save_faiss_index(str(FAISS_INDEX_PATH))
153
+
154
+ # Verify
155
+ loaded_index = faiss.read_index(str(FAISS_INDEX_PATH))
156
+ logger.info(f"Verified the rebuilt FAISS index has {loaded_index.ntotal} vectors.")
157
+
158
+ return loaded_index, pipeline.response_pool
159
+
160
+ if __name__ == "__main__":
161
+ build_faiss_index()
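
A quick smoke test for the rebuilt index might look like the sketch below. It assumes the models/faiss_indices paths used above, a working faiss install, and that the stored embeddings are L2-normalized; the random query vector stands in for a real encoder output.

import json
import faiss
import numpy as np

# Load the index and its response pool (paths assumed from build_faiss_index.py above).
index = faiss.read_index("models/faiss_indices/faiss_index_production.index")
with open("models/faiss_indices/faiss_index_production_responses.json", "r", encoding="utf-8") as f:
    responses = json.load(f)

# One vector per response, or the index and pool are out of sync.
assert index.ntotal == len(responses)

# IndexFlatIP returns cosine similarity only when vectors are unit-norm.
query_vec = np.random.rand(1, index.d).astype("float32")
faiss.normalize_L2(query_vec)
scores, ids = index.search(query_vec, 5)
for score, idx in zip(scores[0], ids[0]):
    print(f"{score:.3f}  {responses[idx]}")
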
chatbot_model.py CHANGED
@@ -10,6 +10,8 @@ from pathlib import Path
10
  import datetime
11
  import faiss
12
  import gc
 
 
13
  from tf_data_pipeline import TFDataPipeline
14
  from response_quality_checker import ResponseQualityChecker
15
  from cross_encoder_reranker import CrossEncoderReranker
@@ -31,7 +33,7 @@ class ChatbotConfig:
31
  num_attention_heads: int = 8
32
  dropout_rate: float = 0.2
33
  l2_reg_weight: float = 0.001
34
- learning_rate: float = 0.0005
35
  min_text_length: int = 3
36
  max_context_turns: int = 5
37
  warmup_steps: int = 200
@@ -41,7 +43,7 @@ class ChatbotConfig:
41
  embedding_batch_size: int = 64
42
  search_batch_size: int = 64
43
  max_batch_size: int = 64
44
- neg_samples: int = 3
45
  max_retries: int = 3
46
 
47
  def to_dict(self) -> Dict:
@@ -54,7 +56,7 @@ class ChatbotConfig:
54
  """Create config from dictionary."""
55
  return cls(**{k: v for k, v in config_dict.items()
56
  if k in cls.__dataclass_fields__})
57
-
58
  class EncoderModel(tf.keras.Model):
59
  """Dual encoder model with pretrained embeddings."""
60
  def __init__(
@@ -154,7 +156,7 @@ class RetrievalChatbot(DeviceAwareModel):
154
  config=self.config,
155
  tokenizer=self.tokenizer,
156
  encoder=self.encoder,
157
- index_file_path='path/to/index', # Update as needed # TODO: Update this path
158
  response_pool=[],
159
  max_length=self.config.max_context_token_limit,
160
  query_embeddings_cache={},
@@ -260,32 +262,49 @@ class RetrievalChatbot(DeviceAwareModel):
260
  def load_model(cls, load_dir: Union[str, Path], mode: str = 'training') -> 'RetrievalChatbot':
261
  """
262
  Load saved models and configuration.
263
-
264
- Args:
265
- load_dir (Union[str, Path]): Directory containing saved model files
266
- mode (str): Either 'training' or 'inference'. In inference mode,
267
- also loads FAISS index and response pool.
268
  """
269
  load_dir = Path(load_dir)
270
 
271
- # Load config
272
  with open(load_dir / "config.json", "r") as f:
273
  config = ChatbotConfig.from_dict(json.load(f))
274
 
275
- # Initialize chatbot with appropriate mode
276
  chatbot = cls(config, mode=mode)
277
 
278
- # Load models
279
  chatbot.encoder.pretrained = TFAutoModel.from_pretrained(
280
  load_dir / "shared_encoder",
281
  config=config
282
  )
283
 
284
- # Load tokenizer
285
  chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
286
  logger.info(f"Models and tokenizer loaded from {load_dir}")
287
 
288
- # If in inference mode, load additional components
289
  if mode == 'inference':
290
  cls._prepare_model_for_inference(chatbot, load_dir)
291
 
@@ -296,7 +315,7 @@ class RetrievalChatbot(DeviceAwareModel):
296
  """Internal method to load inference components."""
297
  try:
298
  # Load FAISS index
299
- faiss_path = load_dir / 'faiss_index.bin'
300
  if faiss_path.exists():
301
  chatbot.index = faiss.read_index(str(faiss_path))
302
  logger.info("FAISS index loaded successfully")
@@ -304,7 +323,7 @@ class RetrievalChatbot(DeviceAwareModel):
304
  raise FileNotFoundError(f"FAISS index not found at {faiss_path}")
305
 
306
  # Load response pool
307
- response_pool_path = load_dir / 'response_pool.json'
308
  if response_pool_path.exists():
309
  with open(response_pool_path, 'r') as f:
310
  chatbot.response_pool = json.load(f)
@@ -332,9 +351,12 @@ class RetrievalChatbot(DeviceAwareModel):
332
  with open(save_dir / "config.json", "w") as f:
333
  json.dump(self.config.to_dict(), f, indent=2)
334
 
335
- # Save models
336
  self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
337
 
338
  # Save tokenizer
339
  self.tokenizer.save_pretrained(save_dir / "tokenizer")
340
 
@@ -343,139 +365,270 @@ class RetrievalChatbot(DeviceAwareModel):
343
  def retrieve_responses_cross_encoder(
344
  self,
345
  query: str,
346
- top_k: int,
347
  reranker: Optional[CrossEncoderReranker] = None,
348
  summarizer: Optional[Summarizer] = None,
349
- summarize_threshold: int = 512 # Summarize over 512 tokens
350
  ) -> List[Tuple[str, float]]:
351
  """
352
- Retrieve top-k from FAISS, then re-rank them with a cross-encoder.
353
- Optionally summarize the user query if it's too long.
  """
355
- if reranker is None:
356
- reranker = self.reranker
357
- if summarizer is None:
358
- summarizer = self.summarizer
359
-
360
- # Optional summarization
361
  if summarizer and len(query.split()) > summarize_threshold:
362
- logger.info(f"Query is long. Summarizing before cross-encoder. Original length: {len(query.split())}")
363
  query = summarizer.summarize_text(query)
364
- logger.info(f"Summarized query: {query}")
365
-
366
- # 2) Dense retrieval
367
- dense_topk = self.retrieve_responses_faiss(query, top_k=top_k) # [(resp, dense_score), ...]
368
 
369
- if not dense_topk:
370
- return []
371
 
372
- # 3) Cross-encoder rerank
373
- candidate_texts = [pair[0] for pair in dense_topk]
374
- cross_scores = reranker.rerank(query, candidate_texts, max_length=256)
375
 
376
- # Combine
377
- combined = [(text, score) for (text, _), score in zip(dense_topk, cross_scores)]
378
- # Sort descending by cross-encoder score
379
- combined.sort(key=lambda x: x[1], reverse=True)
380
 
381
- return combined
382
- # def retrieve_responses_cross_encoder(
383
- # self,
384
- # query: str,
385
- # top_k: int,
386
- # reranker: Optional[CrossEncoderReranker] = None,
387
- # summarizer: Optional[Summarizer] = None,
388
- # summarize_threshold: int = 512 # Summarize over 512 tokens
389
- # ) -> List[Tuple[str, float]]:
390
- # """
391
- # Retrieve top-k from FAISS, then re-rank them with a cross-encoder.
392
- # Optionally summarize the user query if it's too long.
393
- # """
394
- # if reranker is None:
395
- # reranker = self.reranker
396
- # if summarizer is None:
397
- # summarizer = self.summarizer
398
-
399
- # # Optional summarization
400
- # if summarizer and len(query.split()) > summarize_threshold:
401
- # logger.info(f"Query is long. Summarizing before cross-encoder. Original length: {len(query.split())}")
402
- # query = summarizer.summarize_text(query)
403
- # logger.info(f"Summarized query: {query}")
404
 
405
- # # 2) Dense retrieval
406
- # dense_topk = self.retrieve_responses_faiss(query, top_k=top_k) # [(resp, dense_score), ...]
407
 
408
- # if not dense_topk:
409
- # return []
410
 
411
- # # 3) Cross-encoder rerank
412
- # candidate_texts = [pair[0] for pair in dense_topk]
413
- # cross_scores = reranker.rerank(query, candidate_texts, max_length=256)
414
 
415
- # # Combine
416
- # combined = [(text, score) for (text, _), score in zip(dense_topk, cross_scores)]
417
- # # Sort descending by cross-encoder score
418
- # combined.sort(key=lambda x: x[1], reverse=True)
419
 
420
- # return combined
421
-
422
- def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
423
- """Retrieve top-k responses using FAISS."""
424
- if not hasattr(self.data_pipeline, 'index') or self.data_pipeline.index is None:
425
- logger.warning("FAISS index not initialized. Cannot retrieve responses.")
426
- return []
427
-
428
- # Encode the query using TFDataPipeline's method
429
- q_emb = self.data_pipeline.encode_query(query) # Ensure encode_query is within TFDataPipeline
430
- q_emb_np = q_emb.numpy().astype('float32') # Ensure type match
431
-
432
- # Normalize the query embedding for cosine similarity
433
- faiss.normalize_L2(q_emb_np)
434
-
435
- # Search the FAISS index
436
- distances, indices = self.data_pipeline.index.search(q_emb_np, top_k)
437
-
438
- # Map indices to responses and distances to similarities
439
- top_responses = []
440
- for i, idx in enumerate(indices[0]):
441
- if idx < len(self.data_pipeline.response_pool):
442
- top_responses.append((self.data_pipeline.response_pool[idx], float(distances[0][i])))
443
  else:
444
- logger.warning(f"FAISS returned invalid index {idx}. Skipping.")
445
-
446
  return top_responses
447
- # def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
448
- # """Retrieve top-k responses using FAISS."""
449
- # if not hasattr(self, 'index') or self.index is None:
450
- # logger.warning("FAISS index not initialized. Cannot retrieve responses.")
451
- # return []
452
-
453
  # # Encode the query
454
- # q_emb = self.encode_query(query) # Shape: [1, embedding_dim]
455
- # q_emb_np = q_emb.numpy().astype('float32') # Ensure type match
456
-
457
- # # Normalize the query embedding for cosine similarity
458
- # faiss.normalize_L2(q_emb_np)
459
-
460
- # # Search the FAISS index
461
- # distances, indices = self.index.search(q_emb_np, top_k)
462
 
463
- # # Map indices to responses and distances to similarities
464
- # top_responses = []
465
- # for i, idx in enumerate(indices[0]):
466
- # if idx < len(self.response_pool):
467
- # top_responses.append((self.response_pool[idx], float(distances[0][i])))
468
  # else:
469
- # logger.warning(f"FAISS returned invalid index {idx}. Skipping.")
470
-
471
- # return top_responses
472
 
473
  def chat(
474
  self,
475
  query: str,
476
  conversation_history: Optional[List[Tuple[str, str]]] = None,
477
  quality_checker: Optional['ResponseQualityChecker'] = None,
478
- top_k: int = 5,
479
  ) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
480
  """
481
  Example chat method that always uses cross-encoder re-ranking
@@ -516,52 +669,6 @@ class RetrievalChatbot(DeviceAwareModel):
516
  return results[0][0], results, {}
517
 
518
  return get_response(self, query)
519
- # def chat(
520
- # self,
521
- # query: str,
522
- # conversation_history: Optional[List[Tuple[str, str]]] = None,
523
- # quality_checker: Optional['ResponseQualityChecker'] = None,
524
- # top_k: int = 5,
525
- # ) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
526
- # """
527
- # Example chat method that always uses cross-encoder re-ranking
528
- # if self.reranker is available.
529
- # """
530
- # @self.run_on_device
531
- # def get_response(self_arg, query_arg): # Add parameters that match decorator's expectations
532
- # # 1) Build conversation context string
533
- # conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
534
-
535
- # # 2) Retrieve + cross-encoder re-rank
536
- # results = self_arg.retrieve_responses_cross_encoder(
537
- # query=conversation_str,
538
- # top_k=top_k,
539
- # reranker=self_arg.reranker,
540
- # summarizer=self_arg.summarizer,
541
- # summarize_threshold=512
542
- # )
543
-
544
- # # 3) Handle empty or confidence
545
- # if not results:
546
- # return (
547
- # "I'm sorry, but I couldn't find a relevant response.",
548
- # [],
549
- # {}
550
- # )
551
-
552
- # if quality_checker:
553
- # metrics = quality_checker.check_response_quality(query_arg, results)
554
- # if not metrics.get('is_confident', False):
555
- # return (
556
- # "I need more information to provide a good answer. Could you please clarify?",
557
- # results,
558
- # metrics
559
- # )
560
- # return results[0][0], results, metrics
561
-
562
- # return results[0][0], results, {}
563
-
564
- # return get_response(self, query)
565
 
566
  def _build_conversation_context(
567
  self,
@@ -581,24 +688,6 @@ class RetrievalChatbot(DeviceAwareModel):
581
 
582
  conversation_parts.append(f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}")
583
  return "\n".join(conversation_parts)
584
- # def _build_conversation_context(
585
- # self,
586
- # query: str,
587
- # conversation_history: Optional[List[Tuple[str, str]]]
588
- # ) -> str:
589
- # """Build conversation context with better memory management."""
590
- # if not conversation_history:
591
- # return f"{self.special_tokens['user']} {query}"
592
-
593
- # conversation_parts = []
594
- # for user_txt, assistant_txt in conversation_history:
595
- # conversation_parts.extend([
596
- # f"{self.special_tokens['user']} {user_txt}",
597
- # f"{self.special_tokens['assistant']} {assistant_txt}"
598
- # ])
599
-
600
- # conversation_parts.append(f"{self.special_tokens['user']} {query}")
601
- # return "\n".join(conversation_parts)
602
 
603
  def train_model(
604
  self,
@@ -707,23 +796,14 @@ class RetrievalChatbot(DeviceAwareModel):
707
  self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
708
 
709
  if latest_checkpoint and not test_mode:
710
- # Debug info before restore
711
- logger.info("\nEncoder Variables:")
712
- for var in self.encoder.variables:
713
- logger.info(f"{var.name}: {var.dtype} - Shape: {var.shape}")
714
-
715
- logger.info("\nOptimizer Variables:")
716
- for var in self.optimizer.variables:
717
- logger.info(f"{var.name}: {var.dtype} - Shape: {var.shape}")
718
-
719
  # Add checkpoint inspection
720
- logger.info("\nTrying to load checkpoint from: ", latest_checkpoint)
721
  reader = tf.train.load_checkpoint(latest_checkpoint)
722
- shape_from_key = reader.get_variable_to_shape_map()
723
- dtype_from_key = reader.get_variable_to_dtype_map()
724
- logger.info("\nCheckpoint Variables:")
725
- for key in shape_from_key:
726
- logger.info(f"{key}: dtype={dtype_from_key[key]} - Shape: {shape_from_key[key]}")
727
 
728
  status = checkpoint.restore(latest_checkpoint)
729
  status.assert_consumed()
@@ -754,6 +834,10 @@ class RetrievalChatbot(DeviceAwareModel):
754
  logger.info(f"Loaded previous training history from {history_path}")
755
  except Exception as e:
756
  logger.warning(f"Could not load history, starting fresh: {e}")
757
  else:
758
  logger.info("Starting training from scratch")
759
  checkpoint.epoch.assign(tf.cast(0, tf.int32))
 
10
  import datetime
11
  import faiss
12
  import gc
13
+
14
+ import re
15
  from tf_data_pipeline import TFDataPipeline
16
  from response_quality_checker import ResponseQualityChecker
17
  from cross_encoder_reranker import CrossEncoderReranker
 
33
  num_attention_heads: int = 8
34
  dropout_rate: float = 0.2
35
  l2_reg_weight: float = 0.001
36
+ learning_rate: float = 0.001
37
  min_text_length: int = 3
38
  max_context_turns: int = 5
39
  warmup_steps: int = 200
 
43
  embedding_batch_size: int = 64
44
  search_batch_size: int = 64
45
  max_batch_size: int = 64
46
+ neg_samples: int = 10
47
  max_retries: int = 3
48
 
49
  def to_dict(self) -> Dict:
 
56
  """Create config from dictionary."""
57
  return cls(**{k: v for k, v in config_dict.items()
58
  if k in cls.__dataclass_fields__})
59
+
60
  class EncoderModel(tf.keras.Model):
61
  """Dual encoder model with pretrained embeddings."""
62
  def __init__(
 
156
  config=self.config,
157
  tokenizer=self.tokenizer,
158
  encoder=self.encoder,
159
+ index_file_path='new_iteration/data_prep_iterative_models/faiss_indices/faiss_index_production.index',
160
  response_pool=[],
161
  max_length=self.config.max_context_token_limit,
162
  query_embeddings_cache={},
 
262
  def load_model(cls, load_dir: Union[str, Path], mode: str = 'training') -> 'RetrievalChatbot':
263
  """
264
  Load saved models and configuration.
265
  """
266
  load_dir = Path(load_dir)
267
 
268
+ # 1) Load config
269
  with open(load_dir / "config.json", "r") as f:
270
  config = ChatbotConfig.from_dict(json.load(f))
271
 
272
+ # 2) Initialize chatbot
273
  chatbot = cls(config, mode=mode)
274
 
275
+ # 3) Load DistilBERT from huggingface folder
276
  chatbot.encoder.pretrained = TFAutoModel.from_pretrained(
277
  load_dir / "shared_encoder",
278
  config=config
279
  )
280
 
281
+ dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
282
+ _ = chatbot.encoder(dummy_input, training=False)
283
+
284
+ # # Then load your custom weights
285
+ # custom_weights_path = load_dir / "encoder_custom_weights.weights.h5"
286
+ # if custom_weights_path.exists():
287
+ # logger.info(f"Loading custom top-level weights from {custom_weights_path}")
288
+ # chatbot.encoder.load_weights(str(custom_weights_path))
289
+ # logger.info("Custom top-level weights loaded successfully.")
290
+ # else:
291
+ # logger.warning(f"Custom weights file not found at {custom_weights_path}.")
292
+
293
+ # 4) Load tokenizer
294
  chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
295
  logger.info(f"Models and tokenizer loaded from {load_dir}")
296
 
297
+
298
+
299
+ # 5) Load the custom top layers' weights
300
+ custom_weights_path = load_dir / "encoder_custom_weights.weights.h5"
301
+ if custom_weights_path.exists():
302
+ chatbot.encoder.load_weights(str(custom_weights_path))
303
+ logger.info("Loaded custom encoder weights for projection/dropout/etc.")
304
+ else:
305
+ logger.warning(f"No custom encoder weights found at {custom_weights_path}. The top-level projection layer won't have learned parameters.")
306
+
307
+ # 6) If in inference mode, load FAISS, etc.
308
  if mode == 'inference':
309
  cls._prepare_model_for_inference(chatbot, load_dir)
310
 
 
315
  """Internal method to load inference components."""
316
  try:
317
  # Load FAISS index
318
+ faiss_path = load_dir / 'faiss_indices/faiss_index_production.index'
319
  if faiss_path.exists():
320
  chatbot.index = faiss.read_index(str(faiss_path))
321
  logger.info("FAISS index loaded successfully")
 
323
  raise FileNotFoundError(f"FAISS index not found at {faiss_path}")
324
 
325
  # Load response pool
326
+ response_pool_path = load_dir / 'faiss_indices/faiss_index_production_responses.json'
327
  if response_pool_path.exists():
328
  with open(response_pool_path, 'r') as f:
329
  chatbot.response_pool = json.load(f)
 
351
  with open(save_dir / "config.json", "w") as f:
352
  json.dump(self.config.to_dict(), f, indent=2)
353
 
354
+ # Save the HF DistilBERT submodule:
355
  self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
356
 
357
+ # ALSO save custom top-level layers' weights
358
+ self.encoder.save_weights(save_dir / "encoder_custom_weights.weights.h5")
359
+
360
  # Save tokenizer
361
  self.tokenizer.save_pretrained(save_dir / "tokenizer")
362
 
 
365
  def retrieve_responses_cross_encoder(
366
  self,
367
  query: str,
368
+ top_k: int = 10,
369
  reranker: Optional[CrossEncoderReranker] = None,
370
  summarizer: Optional[Summarizer] = None,
371
+ summarize_threshold: int = 512
372
  ) -> List[Tuple[str, float]]:
373
  """
374
+ Retrieve top-k responses with optional domain-based boosting
375
+ and cross-encoder re-ranking.
376
+
377
+ Args:
378
+ query: The user's input text.
379
+ top_k: Number of final results to return.
380
+ reranker: CrossEncoderReranker for refined scoring, if available.
381
+ summarizer: Summarizer for long queries, if desired.
382
+ summarize_threshold: Summarize if query wordcount > threshold.
383
+
384
+ Returns:
385
+ List of (response_text, final_score).
386
  """
387
+ # 1) Optional query summarization
388
  if summarizer and len(query.split()) > summarize_threshold:
389
+ logger.info(f"Query is long ({len(query.split())} words). Summarizing.")
390
  query = summarizer.summarize_text(query)
391
+ logger.info(f"Summarized Query: {query}")
392
 
393
+ detected_domain = self.detect_domain_from_query(query)
394
+ logger.debug(f"Detected domain '{detected_domain}' for query: {query}")
395
 
396
+ # 2) Retrieve more initial candidates from FAISS
397
+ initial_k = min(top_k * 10, len(self.data_pipeline.response_pool))
398
+ dense_candidates = self.retrieve_responses_faiss(query, domain=detected_domain, top_k=initial_k)
399
 
400
+ boosted_candidates = dense_candidates
401
+
402
+ # 4) If we have a cross-encoder, re-rank these boosted candidates
403
+ if not reranker:
404
+ logger.warning("No CrossEncoderReranker provided; creating a default one.")
405
+ reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
406
+
407
+ texts = [item[0] for item in boosted_candidates]
408
+ ce_scores = reranker.rerank(query, texts, max_length=256)
409
+
410
+ # Combine cross-encoder score with the base FAISS score (simple multiplicative approach)
411
+ final_candidates = []
412
+ for (resp_text, faiss_score), ce_score in zip(boosted_candidates, ce_scores):
413
+ # TODO: dial this in.
414
+ alpha = 0.8
415
+ combined_score = alpha * ce_score + (1 - alpha) * faiss_score
416
+ length_adjusted_score = self.length_adjust_score(resp_text, combined_score)
417
+ #combined_score = ce_score * faiss_score
418
+ final_candidates.append((resp_text, combined_score))
419
+
420
+ # Sort descending by combined score
421
+ final_candidates.sort(key=lambda x: x[1], reverse=True)
422
+
423
+ # Return top_k
424
+ return final_candidates[:top_k]
425
+
426
+ DOMAIN_KEYWORDS = {
427
+ 'restaurant': ['restaurant', 'dining', 'food', 'dine', 'reservation', 'table', 'menu', 'cuisine', 'eat', 'place to eat', 'hungry', 'chef', 'dish', 'meal', 'brunch', 'bistro', 'buffet', 'catering', 'gourmet', 'fast food', 'fine dining', 'takeaway', 'delivery', 'restaurant booking'],
428
+ 'movie': ['movie', 'cinema', 'film', 'ticket', 'showtime', 'showing', 'theater', 'flick', 'screening', 'film ticket', 'film show', 'blockbuster', 'premiere', 'trailer', 'director', 'actor', 'actress', 'plot', 'genre', 'screen', 'sequel', 'animation', 'documentary'],
429
+ 'ride_share': ['ride', 'taxi', 'uber', 'lyft', 'car service', 'pickup', 'dropoff', 'driver', 'cab', 'hailing', 'rideshare', 'ride hailing', 'carpool', 'chauffeur', 'transit', 'transportation', 'hail ride'],
430
+ 'coffee': ['coffee', 'café', 'cafe', 'starbucks', 'espresso', 'latte', 'mocha', 'americano', 'barista', 'brew', 'cappuccino', 'macchiato', 'iced coffee', 'cold brew', 'espresso machine', 'coffee shop', 'tea', 'chai', 'java', 'bean', 'roast', 'decaf'],
431
+ 'pizza': ['pizza', 'delivery', 'order food', 'pepperoni', 'topping', 'pizzeria', 'slice', 'pie', 'margherita', 'deep dish', 'thin crust', 'cheese', 'oven', 'tossed', 'sauce', 'garlic bread', 'calzone'],
432
+ 'auto': ['car', 'vehicle', 'repair', 'maintenance', 'mechanic', 'oil change', 'garage', 'auto shop', 'tire', 'check engine', 'battery', 'transmission', 'brake', 'engine diagnostics', 'carwash', 'detail', 'alignment', 'exhaust', 'spark plug', 'dashboard'],
433
+ }
434
+
435
+ def extract_keywords(self, query: str) -> List[str]:
436
+ """
437
+ Extract keywords from the query based on DOMAIN_KEYWORDS.
438
+ """
439
+ query_lower = query.lower()
440
+ keywords = set()
441
+ for domain, kws in self.DOMAIN_KEYWORDS.items():
442
+ for kw in kws:
443
+ if kw in query_lower:
444
+ keywords.add(kw)
445
+ return list(keywords)
446
+
447
+ def length_adjust_score(self, resp_text: str, base_score: float) -> float:
448
+ # Apply a short penalty
449
+ words = len(resp_text.split())
450
+ if words < 3:
451
+ # big penalty or skip entirely
452
+ return base_score * 0.1 # or base_score - 0.01
453
+
454
+ # Add a mild bonus for lines that exceed 12 words:
455
+ if words > 12:
456
+ # e.g. +0.002 * (words - 12)
457
+ bonus = 0.002 * (words - 12)
458
+ base_score += bonus
459
+
460
+ return base_score
461
+
462
+ def detect_domain_from_query(self, query: str) -> str:
463
+ """
464
+ Detect the domain of the query based on keywords.
465
+ """
466
+ domain_patterns = {
467
+ 'restaurant': r'\b(restaurant|dining|food|dine|reservation|table|menu|cuisine|eat|place\s?to\s?eat|hungry|chef|dish|meal|fork|knife|spoon|brunch|bistro|buffet|catering|gourmet|fast\s?food|fine\s?dining|takeaway|delivery|restaurant\s?booking)\b',
468
+ 'movie': r'\b(movie|cinema|film|ticket|showtime|showing|theater|flick|screening|film\s?ticket|film\s?show|blockbuster|premiere|trailer|director|actor|actress|plot|genre|screen|sequel|animation|documentary)\b',
469
+ 'ride_share': r'\b(ride|taxi|uber|lyft|car\s?service|pickup|dropoff|driver|cab|hailing|rideshare|ride\s?hailing|carpool|chauffeur|transit|transportation|hail\s?ride)\b',
470
+ 'coffee': r'\b(coffee|café|cafe|starbucks|espresso|latte|mocha|americano|barista|brew|cappuccino|macchiato|iced\s?coffee|cold\s?brew|espresso\s?machine|coffee\s?shop|tea|chai|java|bean|roast|decaf)\b',
471
+ 'pizza': r'\b(pizza|delivery|order\s?food|pepperoni|topping|pizzeria|slice|pie|margherita|deep\s?dish|thin\s?crust|cheese|oven|tossed|sauce|garlic\s?bread|calzone)\b',
472
+ 'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change|garage|auto\s?shop|tire|check\s?engine|battery|transmission|brake|engine\s?diagnostics|carwash|detail|alignment|exhaust|spark\s?plug|dashboard)\b',
473
+ }
474
 
475
+ # Check for matches
476
+ for domain, pattern in domain_patterns.items():
477
+ if re.search(pattern, query.lower()):
478
+ return domain
479
 
480
+ return 'other'
481
+
482
+ def is_numeric_response(self, text: str) -> bool:
483
+ """
484
+ Return True if `text` is purely digits (and/or spaces).
485
+ e.g.: "4 3 13" -> True, " 42 " -> True, "hello 42" -> False
486
+ """
487
+ pattern = r'^\s*[0-9]+(\s+[0-9]+)*\s*$'
488
+ return bool(re.match(pattern, text))
489
+
490
+ def retrieve_responses_faiss(
491
+ self,
492
+ query: str,
493
+ domain: str = 'other',
494
+ top_k: int = 5,
495
+ boost_factor: float = 1.3
496
+ ) -> List[Tuple[str, float]]:
497
+ """
498
+ Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
499
 
500
+ Args:
501
+ query (str): The user input text.
502
+ domain (str, optional): The detected domain. Defaults to 'other'.
503
+ top_k (int, optional): Number of top results to return. Defaults to 5.
504
+ boost_factor (float, optional): Factor to boost scores for keyword matches. Defaults to 1.3.
505
 
506
+ Returns:
507
+ List[Tuple[str, float]]: List of (response_text, similarity) sorted by descending similarity.
508
+ """
509
+ # Encode the query
510
+ q_emb = self.data_pipeline.encode_query(query)
511
+ q_emb_np = q_emb.reshape(1, -1).astype('float32')
512
+
513
+ # Search the index
514
+ distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10) # Adjust multiplier as needed
515
+
516
+ # IndexFlatIP: 'distances' are inner products (cosine similarities for normalized vectors)
517
+ candidates = []
518
+ for rank, idx in enumerate(indices[0]):
519
+ if idx == -1:
520
+ continue # FAISS may return -1 for invalid indices
521
+ response = self.data_pipeline.response_pool[idx]
522
+ text = response.get('text', '')
523
+ cand_domain = response.get('domain', 'other')
524
+ score = distances[0][rank]
525
+
526
+ # Filter out numeric responses and very short texts
527
+ if not self.is_numeric_response(text) and len(text.split()) >= self.config.min_text_length:
528
+ candidates.append((text, cand_domain, score))
529
+
530
+ if not candidates:
531
+ logger.warning("No valid candidates found after initial filtering.")
532
+ return []
533
 
534
+ # Sort candidates by score descending
535
+ candidates.sort(key=lambda x: x[2], reverse=True)
 
 
536
 
537
+ # Filter in-domain responses
538
+ if domain != 'other':
539
+ in_domain_responses = [c for c in candidates if c[1] == domain]
540
+ if not in_domain_responses:
541
+ logger.info(f"No in-domain responses found for domain '{domain}'. Falling back to all candidates.")
542
+ in_domain_responses = candidates
543
+ else:
544
+ in_domain_responses = candidates
545
+
546
+ # Boost responses containing query keywords
547
+ query_keywords = self.extract_keywords(query)
548
+ boosted_responses = []
549
+ for resp_text, resp_domain, score in in_domain_responses:
550
+ if any(kw in resp_text.lower() for kw in query_keywords):
551
+ boosted_score = score * boost_factor
552
+ logger.debug(f"Boosting response: '{resp_text}' by factor {boost_factor}")
553
  else:
554
+ boosted_score = score
555
+ boosted_responses.append((resp_text, boosted_score))
556
+
557
+ # Sort boosted responses
558
+ boosted_responses.sort(key=lambda x: x[1], reverse=True)
559
+
560
+ # Select top_k responses
561
+ top_responses = boosted_responses[:top_k]
562
+ logger.debug(f"Top {top_k} responses selected.")
563
+
564
  return top_responses
565
+ # def retrieve_responses_faiss(
566
+ # self,
567
+ # query: str,
568
+ # domain: str = 'other',
569
+ # top_k: int = 5,
570
+ # boost_factor: float = 1.3
571
+ # ) -> List[Tuple[str, float]]:
572
+ # """
573
+ # Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
574
+
575
+ # Args:
576
+ # query: The user input text
577
+ # top_k: Number of top results to return
578
+
579
+ # Returns:
580
+ # List of (response_text, similarity) sorted by descending similarity
581
+ # """
582
  # # Encode the query
583
+ # q_emb = self.data_pipeline.encode_query(query)
584
+ # q_emb_np = q_emb.reshape(1, -1).astype('float32')
585
+
586
+ # # Search the index
587
+ # distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10) # distances: shape [1, k], indices: shape [1, k]
588
+
589
+ # # IndexFlatIP: 'distances' are cosine similarities in [-1, +1].
590
+ # candidates = []
591
+ # for rank, idx in enumerate(indices[0]):
592
+ # text = self.response_pool[idx]['text']
593
+ # cand_domain = self.response_pool[idx]['domain']
594
+ # score = distances[0][rank]
595
+
596
+ # # filter out responses with only numbers or too few words
597
+ # word_count = len(text.split())
598
+ # if not self.is_numeric_resonse(text) and word_count >= 2:
599
+ # candidates.append((text, cand_domain, score))
600
+
601
+ # # Filter to in-domain responses
602
+ # candidates.sort(key=lambda x: x[2], reverse=True)
603
+ # in_domain_responses = [(text, score) for (text, cand_domain, score) in candidates if cand_domain == domain]
604
 
605
+ # # Boost keyword matching responses
606
+ # query_keywords = self.extract_keywords(query)
607
+ # boosted_responses = []
608
+ # for (resp_text, domain, score) in in_domain_responses:
609
+ # # Check if any keyword is present in the response text
610
+ # for kw in query_keywords:
611
+ # if kw in resp_text.lower():
612
+ # boosted_score = score * boost_factor
613
+ # print(f"Boosting response: '{resp_text}' by factor {boost_factor}")
614
+ # break
615
  # else:
616
+ # boosted_score = score
617
+ # boosted_responses.append((resp_text, domain, boosted_score))
618
+
619
+ # # Debug
620
+ # logger.debug("\nFAISS Search Results (top 15 for debug):")
621
+ # for i, (resp, score) in enumerate(boosted_responses[:15], start=1):
622
+ # logger.debug(f"{i}. Score: {score:.4f} -> {resp[:60]}")
623
+
624
+ # return boosted_responses[:top_k]
625
 
626
  def chat(
627
  self,
628
  query: str,
629
  conversation_history: Optional[List[Tuple[str, str]]] = None,
630
  quality_checker: Optional['ResponseQualityChecker'] = None,
631
+ top_k: int = 10,
632
  ) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
633
  """
634
  Example chat method that always uses cross-encoder re-ranking
 
669
  return results[0][0], results, {}
670
 
671
  return get_response(self, query)
672
 
673
  def _build_conversation_context(
674
  self,
 
688
 
689
  conversation_parts.append(f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}")
690
  return "\n".join(conversation_parts)
691
 
692
  def train_model(
693
  self,
 
796
  self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
797
 
798
  if latest_checkpoint and not test_mode:
799
  # Add checkpoint inspection
800
+ logger.info(f"\nTrying to load checkpoint from: {latest_checkpoint}")
801
  reader = tf.train.load_checkpoint(latest_checkpoint)
802
+ # shape_from_key = reader.get_variable_to_shape_map()
803
+ # dtype_from_key = reader.get_variable_to_dtype_map()
804
+ # logger.info("\nCheckpoint Variables:")
805
+ # for key in shape_from_key:
806
+ # logger.info(f"{key}: dtype={dtype_from_key[key]} - Shape: {shape_from_key[key]}")
807
 
808
  status = checkpoint.restore(latest_checkpoint)
809
  status.assert_consumed()
 
834
  logger.info(f"Loaded previous training history from {history_path}")
835
  except Exception as e:
836
  logger.warning(f"Could not load history, starting fresh: {e}")
837
+
838
+ # Fix for custom weights not being saved in the full model.
839
+ self.save_models(Path(checkpoint_dir) / "pretrained_full_model")
840
+ logger.info(f"Manually saved custom weights after restore.")
841
  else:
842
  logger.info("Starting training from scratch")
843
  checkpoint.epoch.assign(tf.cast(0, tf.int32))
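
For reference, the blended scoring introduced in retrieve_responses_cross_encoder reduces to a small pure function. This sketch uses the alpha = 0.8 weight and the length adjustment from the diff; the function name and the example values are illustrative, not part of the module.

def blend_scores(ce_score: float, faiss_score: float, response_text: str, alpha: float = 0.8) -> float:
    # Weighted mix of cross-encoder relevance and FAISS cosine similarity.
    combined = alpha * ce_score + (1 - alpha) * faiss_score
    words = len(response_text.split())
    if words < 3:
        return combined * 0.1              # heavy penalty for near-empty responses
    if words > 12:
        combined += 0.002 * (words - 12)   # mild bonus for fuller responses
    return combined

# A strong cross-encoder score outweighs a middling FAISS similarity.
print(blend_scores(0.92, 0.41, "Sure, I can book a table for four at 7pm tonight."))
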
chatbot_validator.py CHANGED
@@ -1,30 +1,41 @@
1
  from typing import Dict, List, Tuple, Any, Optional
2
  import numpy as np
3
-
4
  from logger_config import config_logger
 
 
5
  logger = config_logger(__name__)
6
 
 
7
  class ChatbotValidator:
8
- """Handles automated validation and performance analysis for the chatbot."""
9
 
10
  def __init__(self, chatbot, quality_checker):
11
  """
12
  Initialize the validator.
13
 
14
  Args:
15
- chatbot: RetrievalChatbot instance
16
  quality_checker: ResponseQualityChecker instance
17
  """
18
  self.chatbot = chatbot
19
  self.quality_checker = quality_checker
20
 
21
- # Domain-specific test queries aligned with Taskmaster-1 and Schema-Guided
 
22
  self.domain_queries = {
23
  'restaurant': [
24
  "I'd like to make a reservation for dinner tonight.",
25
- "Can you book a table for 4 people at an Italian place?",
26
- "Do you have any availability for tomorrow at 7pm?",
27
- "I need to change my dinner reservation time.",
28
  "What's the wait time for a table right now?"
29
  ],
30
  'movie_tickets': [
@@ -38,8 +49,8 @@ class ChatbotValidator:
38
  "I need a ride from the airport to downtown.",
39
  "How much would it cost to get to the mall?",
40
  "Can you book a car for tomorrow morning?",
41
- "Is there a driver available now?",
42
- "What's the estimated arrival time?"
43
  ],
44
  'services': [
45
  "I need to schedule an oil change for my car.",
@@ -61,7 +72,9 @@ class ChatbotValidator:
61
  self,
62
  num_examples: int = 5,
63
  top_k: int = 10,
64
- domains: Optional[List[str]] = None
 
 
65
  ) -> Dict[str, Any]:
66
  """
67
  Run comprehensive validation across specified domains.
@@ -69,36 +82,55 @@ class ChatbotValidator:
69
  Args:
70
  num_examples: Number of test queries per domain
71
  top_k: Number of responses to retrieve for each query
72
- domains: Optional list of specific domains to test
 
 
73
 
74
  Returns:
75
  Dict containing detailed validation metrics and domain-specific performance
76
  """
77
  logger.info("\n=== Running Enhanced Automatic Validation ===")
78
 
79
- # Select domains to test
80
  test_domains = domains if domains else list(self.domain_queries.keys())
 
 
81
  metrics_history = []
82
  domain_metrics = {}
83
 
84
  # Run validation for each domain
85
  for domain in test_domains:
86
  domain_metrics[domain] = []
87
- queries = self.domain_queries[domain][:num_examples]
88
 
89
  logger.info(f"\n=== Testing {domain.title()} Domain ===")
90
 
91
  for i, query in enumerate(queries, 1):
92
- logger.info(f"\nTest Case {i}:")
93
- logger.info(f"Query: {query}")
94
 
95
- # Get responses with increased top_k
96
- responses = self.chatbot.retrieve_responses_cross_encoder(query, top_k=top_k)
97
 
98
- # Enhanced quality checking with context (assuming no context here)
99
  quality_metrics = self.quality_checker.check_response_quality(query, responses)
100
 
101
- # Add domain info
102
  quality_metrics['domain'] = domain
103
  metrics_history.append(quality_metrics)
104
  domain_metrics[domain].append(quality_metrics)
@@ -106,11 +138,12 @@ class ChatbotValidator:
106
  # Detailed logging
107
  self._log_validation_results(query, responses, quality_metrics, i)
108
 
109
- # Calculate and log overall metrics
110
  aggregate_metrics = self._calculate_aggregate_metrics(metrics_history)
111
  domain_analysis = self._analyze_domain_performance(domain_metrics)
112
  confidence_analysis = self._analyze_confidence_distribution(metrics_history)
113
 
 
114
  aggregate_metrics.update({
115
  'domain_performance': domain_analysis,
116
  'confidence_analysis': confidence_analysis
@@ -120,48 +153,74 @@ class ChatbotValidator:
120
  return aggregate_metrics
121
 
122
  def _calculate_aggregate_metrics(self, metrics_history: List[Dict]) -> Dict[str, float]:
123
- """Calculate comprehensive aggregate metrics."""
124
  metrics = {
125
  'num_queries_tested': len(metrics_history),
126
- 'avg_top_response_score': np.mean([m.get('top_score', 0) for m in metrics_history]),
127
- 'avg_diversity': np.mean([m.get('response_diversity', 0) for m in metrics_history]),
128
- 'avg_relevance': np.mean([m.get('query_response_relevance', 0) for m in metrics_history]),
129
- 'avg_length_score': np.mean([m.get('response_length_score', 0) for m in metrics_history]),
130
- 'avg_score_gap': np.mean([m.get('top_3_score_gap', 0) for m in metrics_history]),
131
- 'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics_history]),
 
132
 
133
  # Additional statistical metrics
134
- 'median_top_score': np.median([m.get('top_score', 0) for m in metrics_history]),
135
- 'score_std': np.std([m.get('top_score', 0) for m in metrics_history]),
136
- 'min_score': np.min([m.get('top_score', 0) for m in metrics_history]),
137
- 'max_score': np.max([m.get('top_score', 0) for m in metrics_history])
138
  }
139
  return metrics
140
 
141
- def _analyze_domain_performance(self, domain_metrics: Dict[str, List[Dict]]) -> Dict[str, Dict]:
142
- """Analyze performance by domain."""
143
- domain_analysis = {}
144
-
145
- for domain, metrics in domain_metrics.items():
146
- domain_analysis[domain] = {
147
- 'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics]),
148
- 'avg_relevance': np.mean([m.get('query_response_relevance', 0) for m in metrics]),
149
- 'avg_diversity': np.mean([m.get('response_diversity', 0) for m in metrics]),
150
- 'avg_top_score': np.mean([m.get('top_score', 0) for m in metrics]),
151
- 'num_samples': len(metrics)
  }
153
 
154
- return domain_analysis
155
 
156
  def _analyze_confidence_distribution(self, metrics_history: List[Dict]) -> Dict[str, float]:
157
- """Analyze the distribution of confidence scores."""
158
- scores = [m.get('top_score', 0) for m in metrics_history]
159
 
 
160
  return {
161
- 'percentile_25': np.percentile(scores, 25),
162
- 'percentile_50': np.percentile(scores, 50),
163
- 'percentile_75': np.percentile(scores, 75),
164
- 'percentile_90': np.percentile(scores, 90)
165
  }
166
 
167
  def _log_validation_results(
@@ -171,37 +230,51 @@ class ChatbotValidator:
171
  metrics: Dict[str, Any],
172
  case_num: int
173
  ):
174
- """Log detailed validation results."""
175
- logger.info(f"\nTest Case {case_num}:")
176
- logger.info(f"Query: {query}")
177
- logger.info(f"Domain: {metrics.get('domain', 'Unknown')}")
178
- logger.info(f"Confidence: {'Yes' if metrics.get('is_confident', False) else 'No'}")
179
- logger.info("\nQuality Metrics:")
180
- for metric, value in metrics.items():
181
- if isinstance(value, (int, float)):
182
- logger.info(f" {metric}: {value:.4f}")
 
 
183
 
184
- logger.info("\nTop Responses:")
185
- for i, (response, score) in enumerate(responses[:3], 1):
186
- logger.info(f"{i}. Score: {score:.4f}. Response: {response}")
187
- if i == 1 and not metrics.get('is_confident', False):
188
- logger.info(" [Low Confidence]")
189
 
190
  def _log_validation_summary(self, metrics: Dict[str, Any]):
191
- """Log comprehensive validation summary."""
192
  logger.info("\n=== Validation Summary ===")
193
 
 
194
  logger.info("\nOverall Metrics:")
195
  for metric, value in metrics.items():
 
196
  if isinstance(value, (int, float)):
197
  logger.info(f"{metric}: {value:.4f}")
198
 
 
 
199
  logger.info("\nDomain Performance:")
200
- for domain, domain_metrics in metrics['domain_performance'].items():
201
  logger.info(f"\n{domain.title()}:")
202
- for metric, value in domain_metrics.items():
203
  logger.info(f" {metric}: {value:.4f}")
204
 
 
 
205
  logger.info("\nConfidence Distribution:")
206
- for percentile, value in metrics['confidence_analysis'].items():
207
- logger.info(f"{percentile}: {value:.4f}")
 
1
  from typing import Dict, List, Tuple, Any, Optional
2
  import numpy as np
3
+ import random
4
  from logger_config import config_logger
5
+ from cross_encoder_reranker import CrossEncoderReranker
6
+
7
  logger = config_logger(__name__)
8
 
9
+
10
  class ChatbotValidator:
11
+ """
12
+ Handles automated validation and performance analysis for the chatbot.
13
+
14
+ This validator executes domain-specific test queries, obtains candidate
15
+ responses via the chatbot, then evaluates them with a quality checker.
16
+ It aggregates metrics across queries and domains, logs intermediate
17
+ results, and returns a comprehensive summary.
18
+ """
19
 
20
  def __init__(self, chatbot, quality_checker):
21
  """
22
  Initialize the validator.
23
 
24
  Args:
25
+ chatbot: RetrievalChatbot instance for inference
26
  quality_checker: ResponseQualityChecker instance
27
  """
28
  self.chatbot = chatbot
29
  self.quality_checker = quality_checker
30
 
31
+ # Basic domain-specific test queries (easy examples)
32
+ # Taskmaster-1 and Schema-Guided style
33
  self.domain_queries = {
34
  'restaurant': [
35
  "I'd like to make a reservation for dinner tonight.",
36
+ "Can you book a table for 4 at an Italian restaurant?",
37
+ "Is there any availability to dine tomorrow at 7pm?",
38
+ "I'd like to cancel my reservation for tonight.",
39
  "What's the wait time for a table right now?"
40
  ],
41
  'movie_tickets': [
 
49
  "I need a ride from the airport to downtown.",
50
  "How much would it cost to get to the mall?",
51
  "Can you book a car for tomorrow morning?",
52
+ "Is there a driver available right now?",
53
+ "What's the estimated arrival time for the driver?"
54
  ],
55
  'services': [
56
  "I need to schedule an oil change for my car.",
 
72
  self,
73
  num_examples: int = 5,
74
  top_k: int = 10,
75
+ domains: Optional[List[str]] = None,
76
+ randomize: bool = False,
77
+ seed: int = 42
78
  ) -> Dict[str, Any]:
79
  """
80
  Run comprehensive validation across specified domains.
 
82
  Args:
83
  num_examples: Number of test queries per domain
84
  top_k: Number of responses to retrieve for each query
85
+ domains: Optional list of domain keys to test. If None, test all.
86
+ randomize: If True, randomly select queries from the domain lists
87
+ seed: Random seed for consistent sampling if randomize=True
88
 
89
  Returns:
90
  Dict containing detailed validation metrics and domain-specific performance
91
  """
92
  logger.info("\n=== Running Enhanced Automatic Validation ===")
93
 
94
+ # Select which domains to test
95
  test_domains = domains if domains else list(self.domain_queries.keys())
96
+
97
+ # Initialize results
98
  metrics_history = []
99
  domain_metrics = {}
100
+
101
+ reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
102
+
103
+ # Prepare random selection if needed
104
+ rng = random.Random(seed)
105
 
106
  # Run validation for each domain
107
  for domain in test_domains:
108
+ # Avoid errors if domain key missing
109
+ if domain not in self.domain_queries:
110
+ logger.warning(f"Domain '{domain}' not found in domain_queries. Skipping.")
111
+ continue
112
+
113
+ all_queries = self.domain_queries[domain]
114
+ if randomize:
115
+ queries = rng.sample(all_queries, min(num_examples, len(all_queries)))
116
+ else:
117
+ queries = all_queries[:num_examples]
118
+
119
+ # Store domain-level metrics
120
  domain_metrics[domain] = []
 
121
 
122
  logger.info(f"\n=== Testing {domain.title()} Domain ===")
123
 
124
  for i, query in enumerate(queries, 1):
125
+ logger.info(f"\nTest Case {i}: {query}")
 
126
 
127
+ # Retrieve top_k responses (including cross-encoder re-ranking if available)
128
+ responses = self.chatbot.retrieve_responses_cross_encoder(query, top_k=top_k, reranker=reranker)
129
 
130
+ # Evaluate with quality checker
131
  quality_metrics = self.quality_checker.check_response_quality(query, responses)
132
 
133
+ # Save domain info
134
  quality_metrics['domain'] = domain
135
  metrics_history.append(quality_metrics)
136
  domain_metrics[domain].append(quality_metrics)
 
138
  # Detailed logging
139
  self._log_validation_results(query, responses, quality_metrics, i)
140
 
141
+ # Final aggregation
142
  aggregate_metrics = self._calculate_aggregate_metrics(metrics_history)
143
  domain_analysis = self._analyze_domain_performance(domain_metrics)
144
  confidence_analysis = self._analyze_confidence_distribution(metrics_history)
145
 
146
+ # Combine into one dictionary
147
  aggregate_metrics.update({
148
  'domain_performance': domain_analysis,
149
  'confidence_analysis': confidence_analysis
 
153
  return aggregate_metrics
154
 
155
  def _calculate_aggregate_metrics(self, metrics_history: List[Dict]) -> Dict[str, float]:
156
+ """
157
+ Calculate comprehensive aggregate metrics over all tested queries.
158
+ """
159
+ if not metrics_history:
160
+ logger.warning("No metrics to aggregate. Returning empty summary.")
161
+ return {}
162
+
163
+ top_scores = [m.get('top_score', 0.0) for m in metrics_history]
164
+
165
+ # The length-based metrics are robust to missing or zero-length data
166
  metrics = {
167
  'num_queries_tested': len(metrics_history),
168
+ 'avg_top_response_score': np.mean(top_scores),
169
+ 'avg_diversity': np.mean([m.get('response_diversity', 0.0) for m in metrics_history]),
170
+ 'avg_relevance': np.mean([m.get('query_response_relevance', 0.0) for m in metrics_history]),
171
+ 'avg_length_score': np.mean([m.get('response_length_score', 0.0) for m in metrics_history]),
172
+ 'avg_score_gap': np.mean([m.get('top_3_score_gap', 0.0) for m in metrics_history]),
173
+ 'confidence_rate': np.mean([1.0 if m.get('is_confident', False) else 0.0
174
+ for m in metrics_history]),
175
 
176
  # Additional statistical metrics
177
+ 'median_top_score': np.median(top_scores),
178
+ 'score_std': np.std(top_scores),
179
+ 'min_score': np.min(top_scores),
180
+ 'max_score': np.max(top_scores)
181
  }
182
  return metrics
183
 
184
+ def _analyze_domain_performance(self, domain_metrics: Dict[str, List[Dict]]) -> Dict[str, Dict[str, float]]:
185
+ """
186
+ Analyze performance by domain, returning a nested dict.
187
+ """
188
+ analysis = {}
189
+
190
+ for domain, metrics_list in domain_metrics.items():
191
+ if not metrics_list:
192
+ analysis[domain] = {}
193
+ continue
194
+
195
+ top_scores = [m.get('top_score', 0.0) for m in metrics_list]
196
+
197
+ analysis[domain] = {
198
+ 'confidence_rate': np.mean([1.0 if m.get('is_confident', False) else 0.0
199
+ for m in metrics_list]),
200
+ 'avg_relevance': np.mean([m.get('query_response_relevance', 0.0)
201
+ for m in metrics_list]),
202
+ 'avg_diversity': np.mean([m.get('response_diversity', 0.0)
203
+ for m in metrics_list]),
204
+ 'avg_top_score': np.mean(top_scores),
205
+ 'num_samples': len(metrics_list)
206
  }
207
 
208
+ return analysis
209
 
210
  def _analyze_confidence_distribution(self, metrics_history: List[Dict]) -> Dict[str, float]:
211
+ """
212
+ Analyze the distribution of top scores to gauge system confidence levels.
213
+ """
214
+ if not metrics_history:
215
+ return {'percentile_25': 0.0, 'percentile_50': 0.0,
216
+ 'percentile_75': 0.0, 'percentile_90': 0.0}
217
 
218
+ scores = [m.get('top_score', 0.0) for m in metrics_history]
219
  return {
220
+ 'percentile_25': float(np.percentile(scores, 25)),
221
+ 'percentile_50': float(np.percentile(scores, 50)),
222
+ 'percentile_75': float(np.percentile(scores, 75)),
223
+ 'percentile_90': float(np.percentile(scores, 90))
224
  }
225
 
226
  def _log_validation_results(
 
230
  metrics: Dict[str, Any],
231
  case_num: int
232
  ):
233
+ """
234
+ Log detailed validation results for each test case.
235
+ """
236
+ domain = metrics.get('domain', 'Unknown')
237
+ is_confident = metrics.get('is_confident', False)
238
+
239
+ logger.info(f"Domain: {domain} | Confidence: {'Yes' if is_confident else 'No'}")
240
+ logger.info("Quality Metrics:")
241
+ for k, v in metrics.items():
242
+ if isinstance(v, (int, float)):
243
+ logger.info(f" {k}: {v:.4f}")
244
 
245
+ logger.info("Top 3 Responses:")
246
+ for i, (resp_text, score) in enumerate(responses[:3], 1):
247
+ logger.info(f"{i}) Score: {score:.4f} | {resp_text}")
248
+ if i == 1 and not is_confident:
249
+ logger.info(" [Low Confidence on Top Response]")
250
 
251
  def _log_validation_summary(self, metrics: Dict[str, Any]):
252
+ """
253
+ Log a summary of all validation metrics and domain performance.
254
+ """
255
+ if not metrics:
256
+ logger.info("No metrics to summarize.")
257
+ return
258
+
259
  logger.info("\n=== Validation Summary ===")
260
 
261
+ # Overall
262
  logger.info("\nOverall Metrics:")
263
  for metric, value in metrics.items():
264
+ # Skip sub-dicts here
265
  if isinstance(value, (int, float)):
266
  logger.info(f"{metric}: {value:.4f}")
267
 
268
+ # Domain performance
269
+ domain_perf = metrics.get('domain_performance', {})
270
  logger.info("\nDomain Performance:")
271
+ for domain, domain_stats in domain_perf.items():
272
  logger.info(f"\n{domain.title()}:")
273
+ for metric, value in domain_stats.items():
274
  logger.info(f" {metric}: {value:.4f}")
275
 
276
+ # Confidence distribution
277
+ conf_analysis = metrics.get('confidence_analysis', {})
278
  logger.info("\nConfidence Distribution:")
279
+ for pct, val in conf_analysis.items():
280
+ logger.info(f" {pct}: {val:.4f}")
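For reference, the summary statistics logged above reduce to a few NumPy calls over the collected metrics history. A minimal, self-contained sketch (the metrics list below is made up) of how the confidence rate and score percentiles come out:

import numpy as np

# Hypothetical per-case validation metrics
metrics_history = [
    {"top_score": 0.72, "is_confident": True},
    {"top_score": 0.41, "is_confident": False},
    {"top_score": 0.63, "is_confident": True},
]

top_scores = [m.get("top_score", 0.0) for m in metrics_history]
confidence_rate = np.mean([1.0 if m.get("is_confident", False) else 0.0
                           for m in metrics_history])
percentiles = {f"percentile_{p}": float(np.percentile(top_scores, p))
               for p in (25, 50, 75, 90)}
print(confidence_rate, percentiles)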
conversation_summarizer.py CHANGED
@@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
9
  @dataclass
10
  class ChatConfig:
11
  max_sequence_length: int = 512
12
- default_top_k: int = 5
13
  chunk_size: int = 512
14
  chunk_overlap: int = 256
15
  min_confidence_score: float = 0.7
 
9
  @dataclass
10
  class ChatConfig:
11
  max_sequence_length: int = 512
12
+ default_top_k: int = 10
13
  chunk_size: int = 512
14
  chunk_overlap: int = 256
15
  min_confidence_score: float = 0.7
cross_encoder_reranker.py CHANGED
@@ -1,19 +1,28 @@
1
  from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
2
  import tensorflow as tf
3
- from typing import List, Tuple
 
4
 
5
  from logger_config import config_logger
6
  logger = config_logger(__name__)
7
 
8
  class CrossEncoderReranker:
9
  """
10
- Cross-Encoder Re-Ranker: Takes (query, candidate) pairs,
11
- outputs a single relevance score (one logit).
12
  """
13
  def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
 
 
 
 
 
 
 
 
14
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
15
  self.model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
16
- # Model outputs shape [batch_size, 1] -> Interpret the logit as relevance score.
17
 
18
  def rerank(
19
  self,
@@ -22,13 +31,21 @@ class CrossEncoderReranker:
22
  max_length: int = 256
23
  ) -> List[float]:
24
  """
25
- Returns a list of re_scores, one for each candidate, indicating
26
- how relevant the candidate is to the query.
 
 
 
 
 
 
 
 
27
  """
28
- # Build (query, candidate) pairs
29
  pair_texts = [(query, candidate) for candidate in candidates]
30
 
31
- # Tokenize the entire batch
32
  encodings = self.tokenizer(
33
  pair_texts,
34
  padding=True,
@@ -37,15 +54,24 @@ class CrossEncoderReranker:
37
  return_tensors="tf"
38
  )
39
 
40
- # Forward pass -> logits shape [batch_size, 1]
41
  outputs = self.model(
42
  input_ids=encodings["input_ids"],
43
  attention_mask=encodings["attention_mask"],
44
- token_type_ids=encodings.get("token_type_ids")
45
  )
46
 
47
- logits = outputs.logits
48
- # Flatten to shape [batch_size]
49
- scores = tf.reshape(logits, [-1]).numpy()
 
 
 
 
 
 
 
 
 
50
 
51
  return scores.tolist()
 
1
  from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
2
  import tensorflow as tf
3
+ from typing import List
4
+ import numpy as np
5
 
6
  from logger_config import config_logger
7
  logger = config_logger(__name__)
8
 
9
  class CrossEncoderReranker:
10
  """
11
+ Cross-Encoder Re-Ranker that takes (query, candidate) pairs,
12
+ outputs a single relevance score in [0,1].
13
  """
14
  def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
15
+ """
16
+ Initialize the cross-encoder with a pretrained model.
17
+
18
+ Args:
19
+ model_name: Name of a HF cross-encoder model. Must be
20
+ compatible with TFAutoModelForSequenceClassification.
21
+ """
22
+ logger.info(f"Initializing CrossEncoderReranker with {model_name}...")
23
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
24
  self.model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
25
+ logger.info("Cross encoder model loaded successfully.")
26
 
27
  def rerank(
28
  self,
 
31
  max_length: int = 256
32
  ) -> List[float]:
33
  """
34
+ Compute relevance scores for each candidate w.r.t. the query.
35
+
36
+ Args:
37
+ query: User's query text.
38
+ candidates: List of candidate response texts.
39
+ max_length: Max token length for each (query, candidate) pair.
40
+
41
+ Returns:
42
+ A list of float scores in [0,1], one per candidate,
43
+ indicating model's predicted relevance.
44
  """
45
+ # 1) Build (query, candidate) pairs
46
  pair_texts = [(query, candidate) for candidate in candidates]
47
 
48
+ # 2) Tokenize the entire batch
49
  encodings = self.tokenizer(
50
  pair_texts,
51
  padding=True,
 
54
  return_tensors="tf"
55
  )
56
 
57
+ # 3) Forward pass -> logits shape [batch_size, 1]
58
  outputs = self.model(
59
  input_ids=encodings["input_ids"],
60
  attention_mask=encodings["attention_mask"],
61
+ token_type_ids=encodings.get("token_type_ids") # Some models need token_type_ids
62
  )
63
 
64
+ logits = outputs.logits # shape [batch_size, 1]
65
+         # 4) Convert logits -> [0,1] range via sigmoid
66
+         #    This cross-encoder emits a single relevance logit per pair,
67
+         #    so a sigmoid maps it to a usable [0,1] score.
68
+ scores = tf.nn.sigmoid(logits) # shape [batch_size, 1]
69
+
70
+ # 5) Flatten to a 1D NumPy array of floats
71
+ scores = tf.reshape(scores, [-1])
72
+ scores = scores.numpy().astype(float)
73
+
74
+ # logger.debug(f"Cross-Encoder raw logits: {logits.numpy().flatten().tolist()}")
75
+ # logger.debug(f"Cross-Encoder sigmoid scores: {scores.tolist()}")
76
 
77
  return scores.tolist()
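In practice the re-ranker sits after retrieval: the bi-encoder/FAISS search proposes the top-k candidates, and the cross-encoder re-scores each (query, candidate) pair before the final sort. A minimal usage sketch (assumes the HF checkpoint downloads successfully and that the candidates were already retrieved):

from cross_encoder_reranker import CrossEncoderReranker

reranker = CrossEncoderReranker()  # cross-encoder/ms-marco-MiniLM-L-12-v2 by default

query = "Can I get a table for two tonight?"
candidates = [
    "Sure, what time would you like the reservation?",
    "Your ride has been booked.",
    "We have open tables at 7 and 8 pm.",
]

scores = reranker.rerank(query, candidates, max_length=256)

# Re-sort candidates by cross-encoder relevance, highest first
reranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
for text, score in reranked:
    print(f"{score:.3f}  {text}")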
new_iteration/pipeline_config.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ @dataclass
4
+ class PipelineConfig:
5
+ """Minimal pipeline config."""
6
+ max_length: int = 512 # max length if you want to skip long utterances
7
+ min_turns: int = 4 # minimum total turns (user + assistant)
8
+ min_user_words: int = 3 # min words in each user turn
9
+ debug: bool = True # enable debug prints
new_iteration/run_taskmaster_processor.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+
5
+ from pipeline_config import PipelineConfig
6
+ from taskmaster_processor import TaskmasterProcessor
7
+
8
+ def main():
9
+ # 1) Setup config
10
+ config = PipelineConfig(
11
+ max_length=512,
12
+ min_turns=3,
13
+ min_user_words=3,
14
+ debug=True
15
+ )
16
+
17
+ # 2) Instantiate processor
18
+ base_dir = "datasets/taskmaster"
19
+ processor = TaskmasterProcessor(config)
20
+
21
+ # 3) Load raw dialogues
22
+ dialogues = processor.load_taskmaster_dataset(base_dir=base_dir, max_examples=None)
23
+
24
+ # 4) Filter & convert to final structure
25
+ final_dialogues = processor.filter_and_convert(dialogues)
26
+
27
+ # 5) Save final data
28
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
29
+ output_dir = Path("processed_outputs")
30
+ output_dir.mkdir(parents=True, exist_ok=True)
31
+ out_file = output_dir / f"taskmaster_only_{timestamp}.json"
32
+
33
+ with open(out_file, 'w', encoding='utf-8') as f:
34
+ json.dump(final_dialogues, f, indent=2)
35
+
36
+ print(f"[Taskmaster Only] Kept {len(final_dialogues)} dialogues => {out_file}")
37
+
38
+ if __name__ == "__main__":
39
+ main()
new_iteration/taskmaster_processor.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from pathlib import Path
4
+ from typing import List, Dict, Any, Optional
5
+ from dataclasses import dataclass, field
6
+
7
+ from pipeline_config import PipelineConfig
8
+
9
+ @dataclass
10
+ class TaskmasterDialogue:
11
+ """Structured representation of a Taskmaster-1 dialogue."""
12
+ conversation_id: str
13
+ instruction_id: Optional[str]
14
+ scenario: Optional[str]
15
+ domain: str
16
+ turns: List[Dict[str, Any]] = field(default_factory=list)
17
+
18
+ def validate(self) -> bool:
19
+ """Check if this dialogue has an ID and a list of turns."""
20
+ return bool(self.conversation_id and isinstance(self.turns, list))
21
+
22
+ class TaskmasterProcessor:
23
+ """
24
+ Loads Taskmaster-1 dialogues, extracts domain from scenario,
25
+ filters them, and outputs a final pipeline-friendly format.
26
+ """
27
+ def __init__(self, config: PipelineConfig):
28
+ self.config = config
29
+
30
+ def load_taskmaster_dataset(self, base_dir: str, max_examples: Optional[int] = None) -> List[TaskmasterDialogue]:
31
+ """
32
+ Load and parse Taskmaster JSON for self-dialogs & woz-dialogs (Taskmaster-1).
33
+ Combines scenario text + conversation utterances to detect domain more robustly.
34
+ """
35
+ required_files = {
36
+ "self-dialogs": "self-dialogs.json",
37
+ "woz-dialogs": "woz-dialogs.json",
38
+ "ontology": "ontology.json", # we might not actively use this, but let's expect it
39
+ }
40
+ # Check for missing
41
+ missing = [k for k, v in required_files.items() if not Path(base_dir, v).exists()]
42
+ if missing:
43
+ raise FileNotFoundError(f"Missing Taskmaster files: {missing}")
44
+
45
+ # Load ontology (optional usage)
46
+ ontology_path = Path(base_dir, required_files["ontology"])
47
+ with open(ontology_path, 'r', encoding='utf-8') as f:
48
+ ontology = json.load(f)
49
+ if self.config.debug:
50
+ print(f"[TaskmasterProcessor] Loaded ontology with {len(ontology.keys())} top-level keys (unused).")
51
+
52
+ dialogues: List[TaskmasterDialogue] = []
53
+
54
+ # We'll read the 2 main files
55
+ file_keys = ["self-dialogs", "woz-dialogs"]
56
+ for file_key in file_keys:
57
+ file_path = Path(base_dir, required_files[file_key])
58
+ with open(file_path, 'r', encoding='utf-8') as f:
59
+ raw_data = json.load(f)
60
+
61
+ for d in raw_data:
62
+ conversation_id = d.get("conversation_id", "")
63
+ instruction_id = d.get("instruction_id", None)
64
+ scenario_text = d.get("scenario", "") # old scenario approach
65
+
66
+ # Collect utterances -> turns
67
+ utterances = d.get("utterances", [])
68
+ turns = self._process_utterances(utterances)
69
+
70
+ # Instead of only using scenario_text, we combine scenario + turn texts.
71
+ # We'll pass everything to _extract_domain
72
+ domain = self._extract_domain(
73
+ scenario_text,
74
+ turns # pass the entire turn list so we can pick up domain keywords
75
+ )
76
+
77
+ # Create a structured object
78
+ new_dlg = TaskmasterDialogue(
79
+ conversation_id=conversation_id,
80
+ instruction_id=instruction_id,
81
+ scenario=scenario_text,
82
+ domain=domain,
83
+ turns=turns
84
+ )
85
+ dialogues.append(new_dlg)
86
+
87
+ if max_examples and len(dialogues) >= max_examples:
88
+ break
89
+
90
+ if self.config.debug:
91
+ print(f"[TaskmasterProcessor] Loaded {len(dialogues)} total dialogues from Taskmaster-1.")
92
+ return dialogues
93
+
94
+ def _extract_domain(self, scenario: str, turns: List[Dict[str, str]]) -> str:
95
+ """
96
+ Combine scenario text + all turn texts to detect the domain more robustly.
97
+ """
98
+ # 1) Combine scenario + conversation text
99
+ combined_text = scenario.lower()
100
+ for turn in turns:
101
+ text = turn.get('text', '').strip().lower()
102
+ combined_text += " " + text
103
+
104
+ # 2) Expanded domain patterns (edit or expand as you wish)
105
+ domain_patterns = {
106
+ 'restaurant': r'\b(restaurant|dining|food|reservation|table|menu|cuisine|eat)\b',
107
+ 'movie': r'\b(movie|cinema|film|ticket|showtime|theater)\b',
108
+ 'ride_share': r'\b(ride|taxi|uber|lyft|car\s?service|pickup|dropoff)\b',
109
+ 'coffee': r'\b(coffee|café|cafe|starbucks|espresso|latte|mocha|americano)\b',
110
+ 'pizza': r'\b(pizza|delivery|order\s?food|pepperoni|topping|pizzeria)\b',
111
+ 'auto': r'\b(car|vehicle|repair|maintenance|mechanic|oil\s?change)\b'
112
+ }
113
+
114
+ # 3) Return first matched domain or 'other'
115
+ for dom, pattern in domain_patterns.items():
116
+ if re.search(pattern, combined_text):
117
+ print(f"Matched domain: {dom}")
118
+ return dom
119
+
120
+ print("No domain match, returning 'other'")
121
+ return 'other'
122
+
123
+ def _process_utterances(self, utterances: List[Dict[str, Any]]) -> List[Dict[str, str]]:
124
+ """Map speaker to user/assistant, store text."""
125
+ turns = []
126
+ for utt in utterances:
127
+ speaker = 'assistant' if utt.get('speaker') == 'ASSISTANT' else 'user'
128
+ text = utt.get('text', '').strip()
129
+ turns.append({
130
+ 'speaker': speaker,
131
+ 'text': text
132
+ })
133
+ return turns
134
+
135
+ def filter_and_convert(self, dialogues: List[TaskmasterDialogue]) -> List[Dict]:
136
+ """
137
+ Filter out dialogues that don't meet min turns / min user words,
138
+ then convert them to final pipeline dict:
139
+
140
+ {
141
+ "dialogue_id": "...",
142
+ "domain": "...",
143
+ "turns": [
144
+ {"speaker": "user", "text": "..."},
145
+ ...
146
+ ]
147
+ }
148
+ """
149
+ results = []
150
+ for dlg in dialogues:
151
+ if not dlg.validate():
152
+ continue
153
+
154
+ if len(dlg.turns) < self.config.min_turns:
155
+ continue
156
+
157
+ # Check user-turn min words
158
+ keep = True
159
+ for turn in dlg.turns:
160
+ if turn['speaker'] == 'user':
161
+ word_count = len(turn['text'].split())
162
+ if word_count < self.config.min_user_words:
163
+ keep = False
164
+ break
165
+ if not keep:
166
+ continue
167
+
168
+ pipeline_dlg = {
169
+ 'dialogue_id': dlg.conversation_id,
170
+ 'domain': dlg.domain,
171
+ 'turns': dlg.turns # or you can refine further if needed
172
+ }
173
+ results.append(pipeline_dlg)
174
+
175
+ if self.config.debug:
176
+ print(f"[TaskmasterProcessor] Filtered down to {len(results)} dialogues.")
177
+ return results
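To sanity-check the filtering rules without the full Taskmaster dataset on disk, the processor can be driven with a hand-built dialogue; only filter_and_convert is exercised here (the example dialogue is made up):

from pipeline_config import PipelineConfig
from taskmaster_processor import TaskmasterProcessor, TaskmasterDialogue

config = PipelineConfig(min_turns=2, min_user_words=3, debug=True)
processor = TaskmasterProcessor(config)

dlg = TaskmasterDialogue(
    conversation_id="demo-001",
    instruction_id=None,
    scenario="Book a table at an Italian restaurant.",
    domain="restaurant",
    turns=[
        {"speaker": "user", "text": "I need a dinner reservation for two."},
        {"speaker": "assistant", "text": "Sure, what time works for you?"},
    ],
)

final = processor.filter_and_convert([dlg])
print(final)  # [{'dialogue_id': 'demo-001', 'domain': 'restaurant', 'turns': [...]}]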
prepare_data.py CHANGED
@@ -3,10 +3,13 @@ import sys
3
  import faiss
4
  import json
5
  import pickle
6
- from transformers import AutoTokenizer
 
7
  from tqdm.auto import tqdm
 
 
 
8
  from chatbot_model import ChatbotConfig, EncoderModel
9
- from environment_setup import EnvironmentSetup
10
  from tf_data_pipeline import TFDataPipeline
11
  from logger_config import config_logger
12
 
@@ -14,32 +17,23 @@ logger = config_logger(__name__)
14
 
15
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
 
17
- def cleanup_test_indices(faiss_dir, test_prefix='test_'):
18
- test_files = [f for f in os.listdir(faiss_dir) if f.startswith(test_prefix)]
19
- for file in test_files:
20
- file_path = os.path.join(faiss_dir, file)
21
- os.remove(file_path)
22
- logger.info(f"Removed test FAISS index file: {file_path}")
23
-
24
  def main():
25
  # Constants
26
- MODELS_DIR = 'models'
27
- PROCESSED_DATA_DIR = 'processed_outputs'
28
- CACHE_DIR = 'cache'
29
  TOKENIZER_DIR = os.path.join(MODELS_DIR, 'tokenizer')
30
  FAISS_INDICES_DIR = os.path.join(MODELS_DIR, 'faiss_indices')
31
- TF_RECORD_DIR = 'training_data'
32
  FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
33
- FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_test.index')
34
- ENVIRONMENT = 'production' # or 'test'
35
- if ENVIRONMENT == 'test':
36
- FAISS_INDEX_PATH = FAISS_INDEX_TEST_PATH
37
- else:
38
- FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
39
- JSON_TRAINING_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'augmented_dialogues.json')
40
  CACHE_FILE = os.path.join(CACHE_DIR, 'query_embeddings_cache.pkl')
41
- TF_RECORD_PATH = os.path.join(TF_RECORD_DIR, 'training_data.tfrecord')
42
- DEBUG_SAMPLES = None
 
 
 
 
43
 
44
  # Ensure output directories exist
45
  os.makedirs(MODELS_DIR, exist_ok=True)
@@ -49,58 +43,120 @@ def main():
49
  os.makedirs(FAISS_INDICES_DIR, exist_ok=True)
50
  os.makedirs(TF_RECORD_DIR, exist_ok=True)
51
 
52
- # Initialize configuration
53
- config = ChatbotConfig()
54
- logger.info(f"Chatbot Configuration: {config}")
 
 
 
 
 
 
 
 
 
55
 
56
- # Initialize tokenizer and add special tokens
57
  try:
58
- tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
59
- logger.info(f"Tokenizer '{config.pretrained_model}' loaded successfully.")
60
- tokenizer.add_special_tokens({'additional_special_tokens': ['<EMPTY_NEGATIVE>']})
61
- logger.info("Added special tokens to tokenizer.")
 
 
 
 
 
 
 
 
 
62
  except Exception as e:
63
- logger.error(f"Failed to load tokenizer: {e}")
64
  sys.exit(1)
65
 
66
- # Initialize encoder model and resize token embeddings
67
  try:
68
  encoder = EncoderModel(config=config)
69
  logger.info("EncoderModel initialized successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  encoder.pretrained.resize_token_embeddings(len(tokenizer))
71
  logger.info(f"Token embeddings resized to: {len(tokenizer)}")
 
72
  except Exception as e:
73
  logger.error(f"Failed to initialize EncoderModel: {e}")
74
  sys.exit(1)
75
 
76
  # Load JSON dialogues
77
  try:
78
- dialogues = TFDataPipeline.load_json_training_data(JSON_TRAINING_DATA_PATH, DEBUG_SAMPLES)
79
- logger.info(f"Loaded {len(dialogues)} dialogues from {JSON_TRAINING_DATA_PATH}.")
 
 
 
 
80
  except Exception as e:
81
  logger.error(f"Failed to load dialogues: {e}")
82
  sys.exit(1)
83
 
84
  # Load or initialize query_embeddings_cache
85
- try:
86
- if os.path.exists(CACHE_FILE):
 
87
  with open(CACHE_FILE, 'rb') as f:
88
  query_embeddings_cache = pickle.load(f)
89
  logger.info(f"Loaded {len(query_embeddings_cache)} query embeddings from {CACHE_FILE}.")
90
- else:
91
- query_embeddings_cache = {}
92
- logger.info("Initialized empty query embeddings cache.")
93
- except Exception as e:
94
- logger.error(f"Failed to load or initialize query embeddings cache: {e}")
95
- sys.exit(1)
96
 
97
  # Initialize TFDataPipeline
98
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  data_pipeline = TFDataPipeline(
100
  config=config,
101
  tokenizer=tokenizer,
102
  encoder=encoder,
103
- index_file_path=FAISS_INDEX_PATH,
104
  response_pool=[],
105
  max_length=config.max_context_token_limit,
106
  neg_samples=config.neg_samples,
@@ -114,48 +170,55 @@ def main():
114
  logger.error(f"Failed to initialize TFDataPipeline: {e}")
115
  sys.exit(1)
116
 
117
- # Collect unique assistant responses from dialogues
118
  try:
119
- response_pool = data_pipeline.collect_responses(dialogues)
120
- data_pipeline.response_pool = response_pool
121
- logger.info(f"Collected {len(response_pool)} unique assistant responses from dialogues.")
 
 
 
122
  except Exception as e:
123
  logger.error(f"Failed to collect responses: {e}")
124
  sys.exit(1)
125
 
126
- # Compute and add response embeddings to FAISS index
 
127
  try:
128
- logger.info("Computing and adding response embeddings to FAISS index...")
129
- data_pipeline.compute_and_index_response_embeddings()
130
- logger.info("Response embeddings computed and added to FAISS index.")
131
- except Exception as e:
132
- logger.error(f"Failed to compute or add response embeddings: {e}")
133
- sys.exit(1)
 
 
 
 
 
 
 
 
 
 
134
 
135
- # Save FAISS index and response pool
136
- try:
137
- logger.info(f"Saving FAISS index to {FAISS_INDEX_PATH}...")
138
- faiss.write_index(data_pipeline.index, FAISS_INDEX_PATH)
139
- logger.info("FAISS index saved successfully.")
140
-
141
- response_pool_path = FAISS_INDEX_PATH.replace('.index', '_responses.json')
142
- with open(response_pool_path, 'w', encoding='utf-8') as f:
143
- json.dump(data_pipeline.response_pool, f, indent=2)
144
- logger.info(f"Response pool saved to {response_pool_path}.")
145
  except Exception as e:
146
- logger.error(f"Failed to save FAISS index: {e}")
147
  sys.exit(1)
148
 
149
- # Prepare and save training data as TFRecords
150
  try:
151
- logger.info("Starting data preparation and saving as TFRecord...")
152
- data_pipeline.prepare_and_save_data(dialogues, TF_RECORD_PATH)
153
- logger.info(f"Data saved as TFRecord at {TF_RECORD_PATH}.")
 
 
 
154
  except Exception as e:
155
  logger.error(f"Failed during data preparation and saving: {e}")
156
  sys.exit(1)
157
 
158
- # Save query embeddings cache
159
  try:
160
  with open(CACHE_FILE, 'wb') as f:
161
  pickle.dump(data_pipeline.query_embeddings_cache, f)
@@ -164,7 +227,7 @@ def main():
164
  logger.error(f"Failed to save query embeddings cache: {e}")
165
  sys.exit(1)
166
 
167
- # Save Tokenizer (including special tokens)
168
  try:
169
  tokenizer.save_pretrained(TOKENIZER_DIR)
170
  logger.info(f"Tokenizer saved to {TOKENIZER_DIR}.")
@@ -173,6 +236,7 @@ def main():
173
  sys.exit(1)
174
 
175
  logger.info("Data preparation pipeline completed successfully.")
176
-
 
177
  if __name__ == "__main__":
178
- main()
 
3
  import faiss
4
  import json
5
  import pickle
6
+ import tensorflow as tf
7
+ from transformers import AutoTokenizer, TFAutoModel
8
  from tqdm.auto import tqdm
9
+ from pathlib import Path
10
+
11
+ # Your existing modules
12
  from chatbot_model import ChatbotConfig, EncoderModel
 
13
  from tf_data_pipeline import TFDataPipeline
14
  from logger_config import config_logger
15
 
 
17
 
18
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
19
 
 
 
 
 
 
 
 
20
  def main():
21
  # Constants
22
+ MODELS_DIR = 'new_iteration/data_prep_iterative_models'
23
+ PROCESSED_DATA_DIR = 'new_iteration/processed_outputs'
24
+ CACHE_DIR = 'new_iteration/cache'
25
  TOKENIZER_DIR = os.path.join(MODELS_DIR, 'tokenizer')
26
  FAISS_INDICES_DIR = os.path.join(MODELS_DIR, 'faiss_indices')
27
+ TF_RECORD_DIR = 'new_iteration/training_data'
28
  FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
29
+ JSON_TRAINING_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'taskmaster_dialogues.json')
 
 
 
 
 
 
30
  CACHE_FILE = os.path.join(CACHE_DIR, 'query_embeddings_cache.pkl')
31
+ TF_RECORD_PATH = os.path.join(TF_RECORD_DIR, 'training_data_3.tfrecord')
32
+
33
+ # Decide whether to load the **custom** fine-tuned model or just base DistilBERT.
34
+ # True for custom, False for base DistilBERT.
35
+ LOAD_CUSTOM_MODEL = True
36
+ NUM_NEG_SAMPLES = 10
37
 
38
  # Ensure output directories exist
39
  os.makedirs(MODELS_DIR, exist_ok=True)
 
43
  os.makedirs(FAISS_INDICES_DIR, exist_ok=True)
44
  os.makedirs(TF_RECORD_DIR, exist_ok=True)
45
 
46
+ # Initialize config
47
+ config_json = Path(MODELS_DIR) / "config.json"
48
+ if config_json.exists():
49
+ with open(config_json, "r", encoding="utf-8") as f:
50
+ config_dict = json.load(f)
51
+ config = ChatbotConfig.from_dict(config_dict)
52
+ logger.info(f"Loaded ChatbotConfig from {config_json}")
53
+ else:
54
+ config = ChatbotConfig()
55
+ logger.warning("No config.json found. Using default ChatbotConfig.")
56
+
57
+ config.neg_samples = NUM_NEG_SAMPLES
58
 
59
+ # Load or initialize tokenizer
60
  try:
61
+ # If the directory has a valid tokenizer
62
+ if Path(TOKENIZER_DIR).exists() and list(Path(TOKENIZER_DIR).iterdir()):
63
+ logger.info(f"Loading tokenizer from {TOKENIZER_DIR}")
64
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
65
+ else:
66
+ # Initialize from base DistilBERT
67
+ logger.info(f"Loading base tokenizer for {config.pretrained_model}")
68
+ tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
69
+
70
+ # Save to disk
71
+ Path(TOKENIZER_DIR).mkdir(parents=True, exist_ok=True)
72
+ tokenizer.save_pretrained(TOKENIZER_DIR)
73
+ logger.info(f"New tokenizer saved to {TOKENIZER_DIR}")
74
  except Exception as e:
75
+ logger.error(f"Failed to load or create tokenizer: {e}")
76
  sys.exit(1)
77
 
78
+ # Initialize the encoder
79
  try:
80
  encoder = EncoderModel(config=config)
81
  logger.info("EncoderModel initialized successfully.")
82
+
83
+ if LOAD_CUSTOM_MODEL:
84
+ # Load the DistilBERT submodule from 'shared_encoder'
85
+ shared_encoder_path = Path(MODELS_DIR) / "shared_encoder"
86
+ if shared_encoder_path.exists():
87
+ logger.info(f"Loading DistilBERT submodule from {shared_encoder_path}")
88
+ encoder.pretrained = TFAutoModel.from_pretrained(shared_encoder_path)
89
+ else:
90
+ logger.warning(f"No shared_encoder found at {shared_encoder_path}, using base DistilBERT instead.")
91
+
92
+ # Load top-level custom .weights.h5 (projection, dropout, etc.)
93
+ custom_weights_path = Path(MODELS_DIR) / "encoder_custom_weights.weights.h5"
94
+ if custom_weights_path.exists():
95
+ logger.info(f"Loading custom top-level weights from {custom_weights_path}")
96
+ # Build model layers with a dummy forward pass
97
+ dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
98
+ _ = encoder(dummy_input, training=False)
99
+ encoder.load_weights(str(custom_weights_path))
100
+ logger.info("Custom encoder weights loaded successfully.")
101
+ else:
102
+ logger.warning(f"Custom weights file not found at {custom_weights_path}. Using only submodule weights.")
103
+ else:
104
+ # Just base DistilBERT with special tokens resized
105
+ logger.info("Using the base DistilBERT without loading custom weights.")
106
+
107
+ # Resize token embeddings in case we added special tokens
108
  encoder.pretrained.resize_token_embeddings(len(tokenizer))
109
  logger.info(f"Token embeddings resized to: {len(tokenizer)}")
110
+
111
  except Exception as e:
112
  logger.error(f"Failed to initialize EncoderModel: {e}")
113
  sys.exit(1)
114
 
115
  # Load JSON dialogues
116
  try:
117
+ if not Path(JSON_TRAINING_DATA_PATH).exists():
118
+ logger.warning(f"No dialogues found at {JSON_TRAINING_DATA_PATH}, skipping.")
119
+ dialogues = []
120
+ else:
121
+ dialogues = TFDataPipeline.load_json_training_data(JSON_TRAINING_DATA_PATH, debug_samples=None)
122
+ logger.info(f"Loaded {len(dialogues)} dialogues from {JSON_TRAINING_DATA_PATH}.")
123
  except Exception as e:
124
  logger.error(f"Failed to load dialogues: {e}")
125
  sys.exit(1)
126
 
127
  # Load or initialize query_embeddings_cache
128
+ query_embeddings_cache = {}
129
+ if os.path.exists(CACHE_FILE):
130
+ try:
131
  with open(CACHE_FILE, 'rb') as f:
132
  query_embeddings_cache = pickle.load(f)
133
  logger.info(f"Loaded {len(query_embeddings_cache)} query embeddings from {CACHE_FILE}.")
134
+ except Exception as e:
135
+ logger.warning(f"Failed to load query embeddings cache: {e}")
136
+ else:
137
+ logger.info("No existing query embeddings cache found. Starting fresh.")
 
 
138
 
139
  # Initialize TFDataPipeline
140
  try:
141
+ # Determine if FAISS index should be loaded or initialized
142
+ if Path(FAISS_INDEX_PRODUCTION_PATH).exists():
143
+ # Load existing index
144
+ logger.info(f"Loading existing FAISS index from {FAISS_INDEX_PRODUCTION_PATH}...")
145
+ faiss_index = faiss.read_index(FAISS_INDEX_PRODUCTION_PATH)
146
+ logger.info("FAISS index loaded successfully.")
147
+ else:
148
+ # Initialize a new FAISS index
149
+ logger.info("No existing FAISS index found. Initializing a new index.")
150
+ dimension = config.embedding_dim # Ensure this matches your encoder's output
151
+ faiss_index = faiss.IndexFlatIP(dimension) # Using Inner Product for cosine similarity
152
+ logger.info(f"Initialized new FAISS index with dimension {dimension}.")
153
+
154
+ # Initialize TFDataPipeline with the FAISS index
155
  data_pipeline = TFDataPipeline(
156
  config=config,
157
  tokenizer=tokenizer,
158
  encoder=encoder,
159
+ index_file_path=FAISS_INDEX_PRODUCTION_PATH,
160
  response_pool=[],
161
  max_length=config.max_context_token_limit,
162
  neg_samples=config.neg_samples,
 
170
  logger.error(f"Failed to initialize TFDataPipeline: {e}")
171
  sys.exit(1)
172
 
173
+ # 7) Collect unique assistant responses from dialogues
174
  try:
175
+ if dialogues:
176
+ response_pool = data_pipeline.collect_responses_with_domain(dialogues)
177
+ data_pipeline.response_pool = response_pool
178
+ logger.info(f"Collected {len(response_pool)} unique assistant responses from dialogues.")
179
+ else:
180
+ logger.warning("No dialogues loaded. response_pool remains empty.")
181
  except Exception as e:
182
  logger.error(f"Failed to collect responses: {e}")
183
  sys.exit(1)
184
 
185
+ # 8) Build the FAISS index with response embeddings
186
+ # Instead of manually computing embeddings, we use the pipeline method
187
  try:
188
+ if data_pipeline.response_pool:
189
+ data_pipeline.build_text_to_domain_map()
190
+ logger.info("Computing and adding response embeddings to FAISS index using TFDataPipeline...")
191
+ data_pipeline.compute_and_index_response_embeddings()
192
+ logger.info("Response embeddings computed and added to FAISS index.")
193
+
194
+ # Save the updated FAISS index
195
+ data_pipeline.save_faiss_index(FAISS_INDEX_PRODUCTION_PATH)
196
+
197
+ # Also save the response pool JSON
198
+ response_pool_path = FAISS_INDEX_PRODUCTION_PATH.replace('.index', '_responses.json')
199
+ with open(response_pool_path, 'w', encoding='utf-8') as f:
200
+ json.dump(data_pipeline.response_pool, f, indent=2)
201
+ logger.info(f"Response pool saved to {response_pool_path}.")
202
+ else:
203
+ logger.warning("No responses to embed. Skipping FAISS indexing.")
204
 
 
 
 
 
 
 
 
 
 
 
205
  except Exception as e:
206
+ logger.error(f"Failed to compute or add response embeddings: {e}")
207
  sys.exit(1)
208
 
209
+ # 9) Prepare and save training data as TFRecords
210
  try:
211
+ if dialogues:
212
+ logger.info("Starting data preparation and saving as TFRecord...")
213
+ data_pipeline.prepare_and_save_data(dialogues, TF_RECORD_PATH)
214
+ logger.info(f"Data saved as TFRecord at {TF_RECORD_PATH}.")
215
+ else:
216
+ logger.warning("No dialogues to build TFRecord from. Skipping TFRecord creation.")
217
  except Exception as e:
218
  logger.error(f"Failed during data preparation and saving: {e}")
219
  sys.exit(1)
220
 
221
+ # 10) Save query embeddings cache
222
  try:
223
  with open(CACHE_FILE, 'wb') as f:
224
  pickle.dump(data_pipeline.query_embeddings_cache, f)
 
227
  logger.error(f"Failed to save query embeddings cache: {e}")
228
  sys.exit(1)
229
 
230
+ # Save Tokenizer
231
  try:
232
  tokenizer.save_pretrained(TOKENIZER_DIR)
233
  logger.info(f"Tokenizer saved to {TOKENIZER_DIR}.")
 
236
  sys.exit(1)
237
 
238
  logger.info("Data preparation pipeline completed successfully.")
239
+
240
+
241
  if __name__ == "__main__":
242
+ main()
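One detail worth calling out in the new index setup: IndexFlatIP ranks by inner product, so embeddings are L2-normalized before insertion to make the inner product equal cosine similarity. A standalone sketch of that pattern (random vectors stand in for encoder outputs, and the dimension is illustrative):

import faiss
import numpy as np

dim = 256  # stand-in for config.embedding_dim
rng = np.random.default_rng(0)

response_embs = rng.normal(size=(1000, dim)).astype(np.float32)
faiss.normalize_L2(response_embs)      # in-place L2 normalization

index = faiss.IndexFlatIP(dim)         # inner product == cosine on unit vectors
index.add(response_embs)

query = rng.normal(size=(1, dim)).astype(np.float32)
faiss.normalize_L2(query)
scores, ids = index.search(query, 5)   # top-5 nearest responses
print(ids[0], scores[0])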
response_quality_checker.py CHANGED
@@ -9,27 +9,41 @@ if TYPE_CHECKING:
9
  from tf_data_pipeline import TFDataPipeline
10
 
11
  class ResponseQualityChecker:
12
- """Enhanced quality checking with dynamic thresholds."""
 
 
 
 
 
 
13
 
14
  def __init__(
15
  self,
16
  data_pipeline: 'TFDataPipeline',
17
- confidence_threshold: float = 0.6,
18
  diversity_threshold: float = 0.15,
19
  min_response_length: int = 5,
20
- similarity_cap: float = 0.85 # Renamed from max_similarity_ratio and used in diversity calc
21
  ):
 
 
 
 
 
 
 
 
22
  self.confidence_threshold = confidence_threshold
23
  self.diversity_threshold = diversity_threshold
24
  self.min_response_length = min_response_length
25
  self.similarity_cap = similarity_cap
26
- self.data_pipeline = data_pipeline # Reference to TFDataPipeline
27
 
28
- # Dynamic thresholds based on response patterns
29
  self.thresholds = {
30
- 'relevance': 0.35,
31
- 'length_score': 0.85,
32
- 'score_gap': 0.07
33
  }
34
 
35
  def check_response_quality(
@@ -38,14 +52,14 @@ class ResponseQualityChecker:
38
  responses: List[Tuple[str, float]]
39
  ) -> Dict[str, Any]:
40
  """
41
- Evaluate the quality of responses based on various metrics.
42
 
43
  Args:
44
- query: The user's query
45
- responses: List of (response_text, score) tuples
46
 
47
  Returns:
48
- Dict containing quality metrics and confidence assessment
49
  """
50
  if not responses:
51
  return {
@@ -57,98 +71,282 @@ class ResponseQualityChecker:
57
  'top_3_score_gap': 0.0
58
  }
59
 
60
- # Calculate core metrics
61
- metrics = {
62
- 'response_diversity': self.calculate_diversity(responses),
63
- 'query_response_relevance': self.calculate_relevance(query, responses),
64
- 'response_length_score': np.mean([
65
- self._calculate_length_score(response) for response, _ in responses
66
- ]),
67
- 'top_score': responses[0][1],
68
- 'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
69
- }
70
 
71
- # Determine confidence using thresholds
72
  metrics['is_confident'] = self._determine_confidence(metrics)
73
-
74
  logger.info(f"Quality metrics: {metrics}")
75
  return metrics
76
 
77
  def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
78
- """Calculate relevance as weighted similarity between query and responses."""
 
 
 
79
  if not responses:
80
  return 0.0
81
 
82
- # Get embeddings
83
- query_embedding = self.data_pipeline.encode_query(query)
84
- response_texts = [resp for resp, _ in responses]
85
- response_embeddings = self.data_pipeline.encode_responses(response_texts)
86
 
87
- # Compute similarities
88
- similarities = cosine_similarity([query_embedding], response_embeddings)[0]
 
 
89
 
90
- # Apply decreasing weights for later responses
91
- weights = np.array([1.0 / (i + 1) for i in range(len(similarities))])
92
 
93
- return np.average(similarities, weights=weights)
 
 
 
 
 
 
94
 
95
  def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
96
- """Calculate diversity with length normalization and similarity capping."""
97
- if not responses:
98
- return 0.0
 
 
 
99
 
100
- response_texts = [resp for resp, _ in responses]
101
- embeddings = self.data_pipeline.encode_responses(response_texts)
102
- if len(embeddings) < 2:
103
- return 1.0
104
 
105
- # Calculate pairwise cosine similarities
106
- similarity_matrix = cosine_similarity(embeddings)
107
- np.fill_diagonal(similarity_matrix, 0) # Exclude self-similarity
108
 
109
- # Apply similarity cap
110
- similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)
111
 
112
- # Calculate average similarity
113
- sum_similarities = np.sum(similarity_matrix)
114
- num_pairs = len(embeddings) * (len(embeddings) - 1)
115
- avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0
116
 
117
- # Diversity is inversely related to average similarity
118
- diversity_score = 1 - avg_similarity
119
- return diversity_score
120
 
121
  def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
122
- """Determine confidence using primary and secondary conditions."""
123
- # Primary conditions (must all be met)
 
124
  primary_conditions = [
125
  metrics['top_score'] >= self.confidence_threshold,
126
  metrics['response_diversity'] >= self.diversity_threshold,
127
  metrics['response_length_score'] >= self.thresholds['length_score']
128
  ]
129
 
130
- # Secondary conditions (majority must be met)
131
  secondary_conditions = [
132
  metrics['query_response_relevance'] >= self.thresholds['relevance'],
133
  metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
134
- metrics['top_score'] >= (self.confidence_threshold * 1.1) # Extra confidence boost
135
  ]
136
 
137
- return all(primary_conditions) and sum(secondary_conditions) >= 2
 
138
 
139
- def _calculate_length_score(self, response: str) -> float:
140
- """Calculate length score with penalty for very short or long responses."""
141
- words = len(response.split())
 
 
 
 
 
142
 
 
 
 
 
 
 
 
143
  if words < self.min_response_length:
144
- return words / self.min_response_length
145
- elif words > 50: # Penalty for very long responses
146
- return min(1.0, 50 / words)
147
  return 1.0
148
 
149
  def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
150
- """Calculate average gap between top N scores."""
151
- if len(scores) < top_n + 1:
 
 
152
  return 0.0
153
- gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
154
- return np.mean(gaps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from tf_data_pipeline import TFDataPipeline
10
 
11
  class ResponseQualityChecker:
12
+ """
13
+ Enhanced quality checking that calculates:
14
+ - Relevance between query & responses
15
+ - Diversity among top responses
16
+ - Response length scoring
17
+ - Confidence determination based on multiple thresholds
18
+ """
19
 
20
  def __init__(
21
  self,
22
  data_pipeline: 'TFDataPipeline',
23
+ confidence_threshold: float = 0.45,
24
  diversity_threshold: float = 0.15,
25
  min_response_length: int = 5,
26
+ similarity_cap: float = 0.85,
27
  ):
28
+ """
29
+ Args:
30
+ data_pipeline: Reference to TFDataPipeline for encoding
31
+ confidence_threshold: Minimum top_score for a 'confident' result
32
+ diversity_threshold: Minimum required diversity among top responses
33
+ min_response_length: Minimum words for a decent response
34
+ similarity_cap: Cap on pairwise similarity for diversity calc
35
+ """
36
  self.confidence_threshold = confidence_threshold
37
  self.diversity_threshold = diversity_threshold
38
  self.min_response_length = min_response_length
39
  self.similarity_cap = similarity_cap
40
+ self.data_pipeline = data_pipeline
41
 
42
+ # Additional thresholds for more refined checks
43
  self.thresholds = {
44
+ 'relevance': 0.30, # Slightly relaxed
45
+ 'length_score': 0.80, # Stricter length requirement
46
+ 'score_gap': 0.05 # Gap between top scores
47
  }
48
 
49
  def check_response_quality(
 
52
  responses: List[Tuple[str, float]]
53
  ) -> Dict[str, Any]:
54
  """
55
+ Evaluate the quality of a set of ranked responses for a given query.
56
 
57
  Args:
58
+ query: The user's original query
59
+ responses: List of (response_text, score) sorted by descending score
60
 
61
  Returns:
62
+ Dictionary of metrics, including 'is_confident' and others
63
  """
64
  if not responses:
65
  return {
 
71
  'top_3_score_gap': 0.0
72
  }
73
 
74
+ # 1) Calculate relevant metrics
75
+ metrics = {}
76
+ metrics['response_diversity'] = self.calculate_diversity(responses)
77
+ metrics['query_response_relevance'] = self.calculate_relevance(query, responses)
78
+ metrics['response_length_score'] = self._average_length_score(responses)
79
+ metrics['top_score'] = responses[0][1]
80
+ metrics['top_3_score_gap'] = self._calculate_score_gap([s for _, s in responses], top_n=3)
 
 
 
81
 
82
+ # 2) Determine confidence
83
  metrics['is_confident'] = self._determine_confidence(metrics)
 
84
  logger.info(f"Quality metrics: {metrics}")
85
  return metrics
86
 
87
  def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
88
+ """
89
+ Compute an overall 'relevance' metric between the query and the top responses.
90
+ Uses an exponential transform on the similarity to penalize weaker matches.
91
+ """
92
  if not responses:
93
  return 0.0
94
 
95
+ # Encode query and responses
96
+ query_emb = self.data_pipeline.encode_query(query)
97
+ resp_texts = [r for r, _ in responses]
98
+ resp_embs = self.data_pipeline.encode_responses(resp_texts)
99
 
100
+ # Normalize embeddings
101
+ query_emb = query_emb / (np.linalg.norm(query_emb) + 1e-12)
102
+ resp_norms = np.linalg.norm(resp_embs, axis=1, keepdims=True) + 1e-12
103
+ resp_embs = resp_embs / resp_norms
104
 
105
+ # Cosine similarity
106
+ sims = cosine_similarity([query_emb], resp_embs)[0]
107
 
108
+ # Exponential transform: higher sims remain close to 1, lower sims drop quickly
109
+ sims = np.exp(sims - 1.0)
110
+
111
+ # Weighted average: give heavier weighting to higher-ranked items
112
+ weights = np.exp(-np.arange(len(sims)) / 2.0)
113
+ weighted_avg = np.average(sims, weights=weights)
114
+ return float(weighted_avg)
115
 
116
  def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
117
+ """
118
+ Calculate how 'different' the top responses are from each other.
119
+ Diversity = 1 - avg_cosine_similarity (capped).
120
+ """
121
+ if len(responses) < 2:
122
+ return 1.0 # Single response is trivially 'unique'
123
 
124
+ resp_texts = [r for r, _ in responses]
125
+ embs = self.data_pipeline.encode_responses(resp_texts)
 
 
126
 
127
+ # Pairwise similarity
128
+ sim_matrix = cosine_similarity(embs, embs)
129
+ np.fill_diagonal(sim_matrix, 0.0)
130
 
131
+ # Cap similarity to avoid outliers
132
+ sim_matrix = np.minimum(sim_matrix, self.similarity_cap)
133
 
134
+ # Mean off-diagonal similarity
135
+ sum_sims = np.sum(sim_matrix)
136
+ num_pairs = len(resp_texts) * (len(resp_texts) - 1)
137
+ avg_sim = sum_sims / num_pairs if num_pairs > 0 else 0.0
138
 
139
+ # Invert to get diversity
140
+ return 1.0 - avg_sim
 
141
 
142
  def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
143
+ """
144
+ Decide if we're 'confident' based on multiple metric thresholds.
145
+ """
146
  primary_conditions = [
147
  metrics['top_score'] >= self.confidence_threshold,
148
  metrics['response_diversity'] >= self.diversity_threshold,
149
  metrics['response_length_score'] >= self.thresholds['length_score']
150
  ]
151
 
 
152
  secondary_conditions = [
153
  metrics['query_response_relevance'] >= self.thresholds['relevance'],
154
  metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
155
+ metrics['top_score'] >= (self.confidence_threshold + 0.05) # Extra buffer
156
  ]
157
 
158
+ # Must pass all primary checks, and at least 2 of the 3 secondary
159
+ return all(primary_conditions) and (sum(secondary_conditions) >= 2)
160
 
161
+ def _average_length_score(self, responses: List[Tuple[str, float]]) -> float:
162
+ """
163
+ Compute an average length score across all responses.
164
+ """
165
+ length_scores = []
166
+ for response, _ in responses:
167
+ length_scores.append(self._length_score(response))
168
+ return float(np.mean(length_scores)) if length_scores else 0.0
169
 
170
+ def _length_score(self, text: str) -> float:
171
+ """
172
+ Calculate how well the text meets our length requirement.
173
+ Scores 1.0 if text is >= min_response_length and not too long,
174
+ else it scales down.
175
+ """
176
+ words = len(text.split())
177
  if words < self.min_response_length:
178
+ return words / float(self.min_response_length)
179
+ elif words > 60:
180
+ return max(0.5, 60.0 / words) # Slight penalty for very long
181
  return 1.0
182
 
183
  def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
184
+ """
185
+ Calculate the average gap between consecutive scores in the top N.
186
+ """
187
+ if len(scores) < 2:
188
  return 0.0
189
+ top_n = min(len(scores), top_n)
190
+ gaps = []
191
+ for i in range(top_n - 1):
192
+ gaps.append(scores[i] - scores[i + 1])
193
+ return float(np.mean(gaps)) if gaps else 0.0
194
+
195
+ # import numpy as np
196
+ # from typing import List, Tuple, Dict, Any, TYPE_CHECKING
197
+ # from sklearn.metrics.pairwise import cosine_similarity
198
+
199
+ # from logger_config import config_logger
200
+ # logger = config_logger(__name__)
201
+
202
+ # if TYPE_CHECKING:
203
+ # from tf_data_pipeline import TFDataPipeline
204
+
205
+ # class ResponseQualityChecker:
206
+ # """Enhanced quality checking with dynamic thresholds."""
207
+
208
+ # def __init__(
209
+ # self,
210
+ # data_pipeline: 'TFDataPipeline',
211
+ # confidence_threshold: float = 0.4,
212
+ # diversity_threshold: float = 0.15,
213
+ # min_response_length: int = 5,
214
+ # similarity_cap: float = 0.85 # Renamed from max_similarity_ratio and used in diversity calc
215
+ # ):
216
+ # self.confidence_threshold = confidence_threshold
217
+ # self.diversity_threshold = diversity_threshold
218
+ # self.min_response_length = min_response_length
219
+ # self.similarity_cap = similarity_cap
220
+ # self.data_pipeline = data_pipeline # Reference to TFDataPipeline
221
+
222
+ # # Dynamic thresholds based on response patterns
223
+ # self.thresholds = {
224
+ # 'relevance': 0.35,
225
+ # 'length_score': 0.85,
226
+ # 'score_gap': 0.04
227
+ # }
228
+
229
+ # def check_response_quality(
230
+ # self,
231
+ # query: str,
232
+ # responses: List[Tuple[str, float]]
233
+ # ) -> Dict[str, Any]:
234
+ # """
235
+ # Evaluate the quality of responses based on various metrics.
236
+
237
+ # Args:
238
+ # query: The user's query
239
+ # responses: List of (response_text, score) tuples
240
+
241
+ # Returns:
242
+ # Dict containing quality metrics and confidence assessment
243
+ # """
244
+ # if not responses:
245
+ # return {
246
+ # 'response_diversity': 0.0,
247
+ # 'query_response_relevance': 0.0,
248
+ # 'is_confident': False,
249
+ # 'top_score': 0.0,
250
+ # 'response_length_score': 0.0,
251
+ # 'top_3_score_gap': 0.0
252
+ # }
253
+
254
+ # # Calculate core metrics
255
+ # metrics = {
256
+ # 'response_diversity': self.calculate_diversity(responses),
257
+ # 'query_response_relevance': self.calculate_relevance(query, responses),
258
+ # 'response_length_score': np.mean([
259
+ # self._calculate_length_score(response) for response, _ in responses
260
+ # ]),
261
+ # 'top_score': responses[0][1],
262
+ # 'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
263
+ # }
264
+
265
+ # # Determine confidence using thresholds
266
+ # metrics['is_confident'] = self._determine_confidence(metrics)
267
+
268
+ # logger.info(f"Quality metrics: {metrics}")
269
+ # return metrics
270
+
271
+ # def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
272
+ # """Calculate relevance with stricter scoring."""
273
+ # if not responses:
274
+ # return 0.0
275
+
276
+ # query_embedding = self.data_pipeline.encode_query(query)
277
+ # response_texts = [resp for resp, _ in responses]
278
+ # response_embeddings = self.data_pipeline.encode_responses(response_texts)
279
+
280
+ # # Normalize embeddings
281
+ # query_embedding = query_embedding / np.linalg.norm(query_embedding)
282
+ # response_embeddings = response_embeddings / np.linalg.norm(response_embeddings, axis=1)[:, np.newaxis]
283
+
284
+ # # Compute similarities with exponential decay for far matches
285
+ # similarities = cosine_similarity([query_embedding], response_embeddings)[0]
286
+ # similarities = np.exp(similarities - 1) # Penalize lower similarities more strongly
287
+
288
+ # # Apply stronger position weighting
289
+ # weights = np.exp(-np.arange(len(similarities)) / 2)
290
+
291
+ # return float(np.average(similarities, weights=weights))
292
+
293
+ # def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
294
+ # """Calculate diversity with length normalization and similarity capping."""
295
+ # if not responses:
296
+ # return 0.0
297
+
298
+ # response_texts = [resp for resp, _ in responses]
299
+ # embeddings = self.data_pipeline.encode_responses(response_texts)
300
+ # if len(embeddings) < 2:
301
+ # return 1.0
302
+
303
+ # # Calculate pairwise cosine similarities
304
+ # similarity_matrix = cosine_similarity(embeddings)
305
+ # np.fill_diagonal(similarity_matrix, 0) # Exclude self-similarity
306
+
307
+ # # Apply similarity cap
308
+ # similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)
309
+
310
+ # # Calculate average similarity
311
+ # sum_similarities = np.sum(similarity_matrix)
312
+ # num_pairs = len(embeddings) * (len(embeddings) - 1)
313
+ # avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0
314
+
315
+ # # Diversity is inversely related to average similarity
316
+ # diversity_score = 1 - avg_similarity
317
+ # return diversity_score
318
+
319
+ # def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
320
+ # """Determine confidence using primary and secondary conditions."""
321
+ # # Primary conditions (must all be met)
322
+ # primary_conditions = [
323
+ # metrics['top_score'] >= self.confidence_threshold,
324
+ # metrics['response_diversity'] >= self.diversity_threshold,
325
+ # metrics['response_length_score'] >= self.thresholds['length_score']
326
+ # ]
327
+
328
+ # # Secondary conditions (majority must be met)
329
+ # secondary_conditions = [
330
+ # metrics['query_response_relevance'] >= self.thresholds['relevance'],
331
+ # metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
332
+ # metrics['top_score'] >= (self.confidence_threshold * 1.1) # Extra confidence boost
333
+ # ]
334
+
335
+ # return all(primary_conditions) and sum(secondary_conditions) >= 2
336
+
337
+ # def _calculate_length_score(self, response: str) -> float:
338
+ # """Calculate length score with penalty for very short or long responses."""
339
+ # words = len(response.split())
340
+
341
+ # if words < self.min_response_length:
342
+ # return words / self.min_response_length
343
+ # elif words > 50: # Penalty for very long responses
344
+ # return min(1.0, 50 / words)
345
+ # return 1.0
346
+
347
+ # def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
348
+ # """Calculate average gap between top N scores."""
349
+ # if len(scores) < top_n + 1:
350
+ # return 0.0
351
+ # gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
352
+ # return np.mean(gaps)
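The new relevance metric combines two transforms: an exponential squash so weak similarities count for little, and rank-decayed weights so the top-ranked response dominates the average. Isolated from the class, the arithmetic looks like this (the similarity values are made up):

import numpy as np

# Hypothetical cosine similarities between the query and the ranked responses
sims = np.array([0.82, 0.55, 0.30])

# exp(sim - 1): values near 1.0 stay near 1.0, lower values fall off quickly
transformed = np.exp(sims - 1.0)

# exp(-rank / 2): rank 0 gets weight 1.0, later ranks progressively less
weights = np.exp(-np.arange(len(sims)) / 2.0)

relevance = float(np.average(transformed, weights=weights))
print(transformed.round(3), weights.round(3), round(relevance, 3))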
tf_data_pipeline.py CHANGED
@@ -8,11 +8,12 @@ import math
8
  from tqdm import tqdm
9
  import json
10
  from pathlib import Path
11
- from typing import Union, Optional, List, Tuple, Generator
12
  from transformers import AutoTokenizer
13
  from typing import List, Tuple, Generator
14
  from transformers import AutoTokenizer
15
  from gpu_monitor import GPUMemoryMonitor
 
16
 
17
  from logger_config import config_logger
18
  logger = config_logger(__name__)
@@ -27,7 +28,7 @@ class TFDataPipeline:
27
  response_pool: List[str],
28
  max_length: int,
29
  query_embeddings_cache: dict,
30
- neg_samples: int = 3,
31
  index_type: str = 'IndexFlatIP',
32
  nlist: int = 100,
33
  max_retries: int = 3
@@ -47,6 +48,10 @@ class TFDataPipeline:
47
  self.max_batch_size = 16 if len(response_pool) < 100 else 64
48
  self.memory_monitor = GPUMemoryMonitor()
49
  self.max_retries = max_retries
 
 
 
 
50
 
51
  if os.path.exists(index_file_path):
52
  logger.info(f"Loading existing FAISS index from {index_file_path}...")
@@ -135,21 +140,49 @@ class TFDataPipeline:
135
 
136
  logger.info(f"Loaded {len(dialogues)} dialogues.")
137
  return dialogues
138
-
139
- def collect_responses(self, dialogues: List[dict]) -> List[str]:
140
- """Extract unique assistant responses from dialogues."""
141
- response_set = set()
 
 
 
 
 
142
  for dialogue in tqdm(dialogues, desc="Processing Dialogues", unit="dialogue"):
 
 
143
  turns = dialogue.get('turns', [])
144
  for turn in turns:
145
  speaker = turn.get('speaker')
146
  text = turn.get('text', '').strip()
147
  if speaker == 'assistant' and text:
148
- # Ensure we don't exclude valid shorter responses
149
  if len(text) <= self.max_length:
150
- response_set.add(text)
151
- logger.info(f"Collected {len(response_set)} unique assistant responses from dialogues.")
152
- return list(response_set)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
155
  """Extract query-response pairs from a dialogue."""
@@ -173,113 +206,101 @@ class TFDataPipeline:
173
 
174
  def compute_and_index_response_embeddings(self):
175
  """
176
- Computes embeddings for the response pool and adds them to the FAISS index with progress bars.
 
177
  """
178
  logger.info("Computing embeddings for the response pool...")
179
 
180
- # Ensure all responses are strings
181
- if not all(isinstance(response, str) for response in self.response_pool):
182
- logger.error("All elements in response_pool must be strings.")
183
- raise ValueError("Invalid data type in response_pool.")
184
-
185
- # Tokenization
186
- logger.info("Tokenizing responses...")
187
- encoded_responses = self.tokenizer(
188
- self.response_pool,
189
- padding=True,
190
- truncation=True,
191
- max_length=self.max_length,
192
- return_tensors='tf'
193
- )
194
- response_ids = encoded_responses['input_ids']
195
-
196
- # Compute embeddings in batches with progress bar
197
- batch_size = getattr(self, 'embedding_batch_size', 64) # Default to 64 if not set
198
- total_responses = len(response_ids)
199
- logger.info(f"Computing embeddings in batches of {batch_size}...")
200
  embeddings = []
201
 
202
- with tqdm(total=total_responses, desc="Computing Embeddings", unit="response") as pbar:
203
- for i in range(0, total_responses, batch_size):
204
- batch_ids = response_ids[i:i + batch_size]
205
- # Compute embeddings
206
- batch_embeddings = self.encoder(batch_ids, training=False).numpy()
207
- # Normalize embeddings for cosine similarity
208
- faiss.normalize_L2(batch_embeddings)
209
- embeddings.append(batch_embeddings)
210
- pbar.update(len(batch_ids))
211
-
212
- if embeddings:
213
- embeddings = np.vstack(embeddings).astype(np.float32)
214
- # Add embeddings to FAISS index with progress bar
215
- logger.info(f"Adding {len(embeddings)} response embeddings to FAISS index...")
216
-
217
- # Determine number of batches for indexing
218
- index_batch_size = getattr(self, 'index_batch_size', 1000) # Adjust as needed
219
- total_embeddings = len(embeddings)
220
- num_index_batches = math.ceil(total_embeddings / index_batch_size)
221
-
222
- with tqdm(total=total_embeddings, desc="Indexing Embeddings", unit="embedding") as pbar_index:
223
- for i in range(0, total_embeddings, index_batch_size):
224
- batch = embeddings[i:i + index_batch_size]
225
- self.index.add(batch)
226
- pbar_index.update(len(batch))
227
-
228
- logger.info("Response embeddings added to FAISS index.")
229
- else:
230
- logger.warning("No embeddings to add to FAISS index.")
231
-
232
- # **Sanity Check:** Verify the number of embeddings in FAISS index
233
- logger.info(f"Total embeddings in FAISS index after addition: {self.index.ntotal}")
234
 
235
  def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
236
- """Find hard negatives for a batch of queries with error handling and retries."""
 
 
 
 
 
 
 
237
  retry_count = 0
238
  total_responses = len(self.response_pool)
239
-
240
- # Set k to be neg_samples + additional candidates to improve negative selection
241
- k = self.neg_samples + 0
242
 
243
  while retry_count < self.max_retries:
244
  try:
245
- # Compute embeddings in sub-batches to manage memory
246
- batch_size = 128 # Example sub-batch size; adjust as needed
247
  query_embeddings = []
248
  for i in range(0, len(queries), batch_size):
249
- sub_queries = queries[i:i + batch_size]
250
- sub_embeddings = np.vstack([
251
- self.query_embeddings_cache[q] for q in sub_queries
252
- ]).astype(np.float32)
253
- faiss.normalize_L2(sub_embeddings)
254
- query_embeddings.append(sub_embeddings)
255
- query_embeddings = np.vstack(query_embeddings)
256
 
257
- # Ensure contiguous memory layout
258
  query_embeddings = np.ascontiguousarray(query_embeddings)
259
 
260
- # Perform FAISS search on CPU
261
  distances, indices = self.index.search(query_embeddings, k)
262
 
263
  all_negatives = []
264
- for query_indices, query, positive in zip(indices, queries, positives):
265
- negatives = []
266
- positive_strip = positive.strip()
267
- seen = {positive_strip}
 
 
 
268
 
 
269
  for idx in query_indices:
270
- if idx >= 0 and idx < total_responses:
271
- candidate = self.response_pool[idx].strip()
272
- if candidate and candidate not in seen:
273
- seen.add(candidate)
274
- negatives.append(candidate)
275
- if len(negatives) >= self.neg_samples:
 
276
  break
277
 
278
- # If not enough negatives are found, pad with a special token
279
- while len(negatives) < self.neg_samples:
280
- negatives.append("<EMPTY_NEGATIVE>") # Use a special token
 
 
 
281
 
282
- all_negatives.append(negatives)
283
 
284
  return all_negatives
285
 
@@ -288,123 +309,236 @@ class TFDataPipeline:
288
  logger.warning(f"Hard negative search attempt {retry_count} failed due to missing embeddings: {ke}")
289
  if retry_count == self.max_retries:
290
  logger.error("Max retries reached for hard negative search due to missing embeddings.")
291
- return [["<EMPTY_NEGATIVE>"] * self.neg_samples for _ in queries]
292
- # Perform memory cleanup
293
  gc.collect()
294
  if tf.config.list_physical_devices('GPU'):
295
  tf.keras.backend.clear_session()
 
296
  except Exception as e:
297
  retry_count += 1
298
  logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
299
  if retry_count == self.max_retries:
300
  logger.error("Max retries reached for hard negative search.")
301
- return [["<EMPTY_NEGATIVE>"] * self.neg_samples for _ in queries]
302
- # Perform memory cleanup
303
  gc.collect()
304
  if tf.config.list_physical_devices('GPU'):
305
  tf.keras.backend.clear_session()
306
 
307
- def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  """
309
- Encode a query with optional conversation context into an embedding vector.
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  Args:
312
- query (str): The user query.
313
- context (Optional[List[Tuple[str, str]]]): Optional conversation history as a list of (user, assistant) tuples.
314
 
315
  Returns:
316
- np.ndarray: The normalized embedding vector for the query.
317
  """
318
- # Prepare query with context
319
  if context:
320
- context_str = ' '.join([
321
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {q} "
322
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
323
- for q, r in context[-self.config.max_context_turns:]
324
- ])
325
- query = f"{context_str} {self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]}" \
326
- f" {query}"
327
  else:
328
- query = f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
329
-
330
- # Tokenize and encode
331
  encodings = self.tokenizer(
332
- [query],
333
  padding='max_length',
334
  truncation=True,
335
  max_length=self.max_length,
336
- return_tensors='np' # Use NumPy arrays for compatibility with FAISS
337
  )
338
  input_ids = encodings['input_ids']
339
 
340
- # Verify token IDs
341
  max_id = np.max(input_ids)
342
- new_vocab_size = len(self.tokenizer)
343
-
344
- if max_id >= new_vocab_size:
345
- logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
346
  raise ValueError("Token ID exceeds vocabulary size.")
347
 
348
- # Get embeddings from the shared encoder
349
  embeddings = self.encoder(input_ids, training=False).numpy()
350
-
351
- # Normalize embeddings for cosine similarity
352
- faiss.normalize_L2(embeddings)
353
-
354
- return embeddings[0] # Return as a 1D array
355
 
356
- def encode_responses(self, responses: List[str], context: Optional[List[Tuple[str, str]]] = None) -> np.ndarray:
357
  """
358
- Encode a list of responses into embedding vectors.
359
 
360
  Args:
361
- responses (List[str]): List of response texts.
362
- context (Optional[List[Tuple[str, str]]]): Optional conversation history as a list of (user, assistant) tuples.
363
 
364
  Returns:
365
- np.ndarray: Array of normalized embedding vectors.
366
  """
367
- # Prepare responses with context if provided
 
368
  if context:
369
- prepared_responses = []
370
- for response in responses:
371
- context_str = ' '.join([
372
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {q} "
373
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
374
- for q, r in context[-self.config.max_context_turns:]
375
- ])
376
- full_response = f"{context_str} {self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {response}"
377
- prepared_responses.append(full_response)
378
  else:
379
- prepared_responses = [
380
- f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {resp}"
381
- for resp in responses
 
382
  ]
383
-
384
- # Tokenize and encode
385
  encodings = self.tokenizer(
386
- prepared_responses,
387
  padding='max_length',
388
  truncation=True,
389
  max_length=self.max_length,
390
- return_tensors='np' # Use NumPy arrays for compatibility with FAISS
391
  )
392
  input_ids = encodings['input_ids']
393
-
394
- # Verify token IDs
395
  max_id = np.max(input_ids)
396
- new_vocab_size = len(self.tokenizer)
397
-
398
- if max_id >= new_vocab_size:
399
- logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
400
  raise ValueError("Token ID exceeds vocabulary size.")
401
-
402
- # Get embeddings from the shared encoder
403
  embeddings = self.encoder(input_ids, training=False).numpy()
404
-
405
- # Normalize embeddings for cosine similarity
406
- faiss.normalize_L2(embeddings)
407
-
408
  return embeddings.astype('float32')
409
 
410
  def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
 
8
  from tqdm import tqdm
9
  import json
10
  from pathlib import Path
11
+ from typing import Union, Optional, Dict, List, Tuple, Generator
12
  from transformers import AutoTokenizer
13
  from typing import List, Tuple, Generator
14
  from transformers import AutoTokenizer
15
  from gpu_monitor import GPUMemoryMonitor
16
+ import random
17
 
18
  from logger_config import config_logger
19
  logger = config_logger(__name__)
 
28
  response_pool: List[str],
29
  max_length: int,
30
  query_embeddings_cache: dict,
31
+ neg_samples: int = 5,
32
  index_type: str = 'IndexFlatIP',
33
  nlist: int = 100,
34
  max_retries: int = 3
 
48
  self.max_batch_size = 16 if len(response_pool) < 100 else 64
49
  self.memory_monitor = GPUMemoryMonitor()
50
  self.max_retries = max_retries
51
+
52
+ # Build a quick text->domain map for O(1) domain lookups
53
+ self._text_domain_map = {}
54
+ self.build_text_to_domain_map()
55
 
56
  if os.path.exists(index_file_path):
57
  logger.info(f"Loading existing FAISS index from {index_file_path}...")
 
140
 
141
  logger.info(f"Loaded {len(dialogues)} dialogues.")
142
  return dialogues
143
+
144
+ def collect_responses_with_domain(self, dialogues: List[dict]) -> List[Dict[str, str]]:
145
+ """
146
+ Extract unique assistant responses from dialogues, along with the domain.
147
+ Returns a list of dicts: [{'domain': str, 'text': str}, ...]
148
+ """
149
+ response_set = set() # We'll store (domain, text) tuples to keep them unique
150
+ results = []
151
+
152
  for dialogue in tqdm(dialogues, desc="Processing Dialogues", unit="dialogue"):
153
+ # The domain is stored at the top level of each dialogue in the new JSON format
154
+ domain = dialogue.get('domain', 'other')
155
  turns = dialogue.get('turns', [])
156
  for turn in turns:
157
  speaker = turn.get('speaker')
158
  text = turn.get('text', '').strip()
159
  if speaker == 'assistant' and text:
 
160
  if len(text) <= self.max_length:
161
+ # Use a tuple as a "set" key to ensure uniqueness
162
+ key = (domain, text)
163
+ if key not in response_set:
164
+ response_set.add(key)
165
+ results.append({
166
+ "domain": domain,
167
+ "text": text
168
+ })
169
+
170
+ logger.info(f"Collected {len(results)} unique assistant responses from dialogues.")
171
+ return results
172
+ # def collect_responses(self, dialogues: List[dict]) -> List[str]:
173
+ # """Extract unique assistant responses from dialogues."""
174
+ # response_set = set()
175
+ # for dialogue in tqdm(dialogues, desc="Processing Dialogues", unit="dialogue"):
176
+ # turns = dialogue.get('turns', [])
177
+ # for turn in turns:
178
+ # speaker = turn.get('speaker')
179
+ # text = turn.get('text', '').strip()
180
+ # if speaker == 'assistant' and text:
181
+ # # Ensure we don't exclude valid shorter responses
182
+ # if len(text) <= self.max_length:
183
+ # response_set.add(text)
184
+ # logger.info(f"Collected {len(response_set)} unique assistant responses from dialogues.")
185
+ # return list(response_set)
186
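To make the new return shape concrete, here is an illustrative input/output pair; the keys match those read above, while the values are invented:

    # Illustrative only -- not part of this commit.
    dialogues = [{
        "domain": "restaurant",
        "turns": [
            {"speaker": "user", "text": "Any cheap places near the station?"},
            {"speaker": "assistant", "text": "Try the corner bistro on Main Street."},
        ],
    }]
    # collect_responses_with_domain(dialogues) ->
    # [{"domain": "restaurant", "text": "Try the corner bistro on Main Street."}]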
 
187
  def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
188
  """Extract query-response pairs from a dialogue."""
 
206
 
207
  def compute_and_index_response_embeddings(self):
208
  """
209
+ Computes embeddings for the response pool and adds them to the FAISS index.
210
+ self.response_pool is now List[Dict[str, str]] with keys "domain" and "text".
211
  """
212
  logger.info("Computing embeddings for the response pool...")
213
 
214
+ # Extract just the assistant text
215
+ texts = [resp["text"] for resp in self.response_pool]
216
+ logger.debug(f"Total texts to embed: {len(texts)}")
217
+
218
+ batch_size = getattr(self, 'embedding_batch_size', 64)
219
  embeddings = []
220
 
221
+ with tqdm(total=len(texts), desc="Computing Embeddings", unit="response") as pbar:
222
+ for i in range(0, len(texts), batch_size):
223
+ batch_texts = texts[i:i+batch_size]
224
+ encodings = self.tokenizer(
225
+ batch_texts,
226
+ padding=True,
227
+ truncation=True,
228
+ max_length=self.max_length,
229
+ return_tensors='tf'
230
+ )
231
+ batch_embeds = self.encoder(encodings['input_ids'], training=False).numpy()
232
+
233
+ embeddings.append(batch_embeds)
234
+ pbar.update(len(batch_texts))
235
+
236
+ # Combine embeddings and add to FAISS
237
+ all_embeddings = np.vstack(embeddings).astype(np.float32)
238
+ logger.info(f"Adding {len(all_embeddings)} response embeddings to FAISS index...")
239
+ self.index.add(all_embeddings)
240
+
241
+ # For debugging or repeated usage, you might store them:
242
+ self.response_embeddings = all_embeddings
243
+ logger.info(f"FAISS index now has {self.index.ntotal} vectors.")
244
 
245
  def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
246
+ """
247
+ Find hard negatives for a batch of queries using FAISS search.
248
+ Falls back to random negatives if we run out of tries or can't find enough.
249
+ Uses domain-based fallback if possible.
250
+ """
251
+ import random
252
+ import gc
253
+
254
  retry_count = 0
255
  total_responses = len(self.response_pool)
256
+ k = self.neg_samples # Number of negatives to retrieve from FAISS
257
+ batch_size = 128
 
258
 
259
  while retry_count < self.max_retries:
260
  try:
261
+ # 1) Build query embeddings from the cache
 
262
  query_embeddings = []
263
  for i in range(0, len(queries), batch_size):
264
+ sub_queries = queries[i : i + batch_size]
265
+ sub_embeds = [self.query_embeddings_cache[q] for q in sub_queries]
266
+ sub_embeds = np.vstack(sub_embeds).astype(np.float32)
267
+ faiss.normalize_L2(sub_embeds) # If not already normalized
268
+ query_embeddings.append(sub_embeds)
 
 
269
 
270
+ query_embeddings = np.vstack(query_embeddings)
271
  query_embeddings = np.ascontiguousarray(query_embeddings)
272
 
273
+ # 2) Perform FAISS search
274
  distances, indices = self.index.search(query_embeddings, k)
275
 
276
  all_negatives = []
277
+ # For each query, find domain from the corresponding positive if possible
278
+ for query_indices, query_text, pos_text in zip(indices, queries, positives):
279
+ negative_list = []
280
+ seen = {pos_text.strip()}
281
+
282
+ # Attempt to detect the domain of the positive text
283
+ domain_of_positive = self._detect_domain_for_text(pos_text)
284
 
285
+ # Collect hard negatives from FAISS
286
  for idx in query_indices:
287
+ if 0 <= idx < total_responses:
288
+ candidate_dict = self.response_pool[idx] # e.g. {domain, text}
289
+ candidate_text = candidate_dict["text"].strip()
290
+ if candidate_text and candidate_text not in seen:
291
+ seen.add(candidate_text)
292
+ negative_list.append(candidate_text)
293
+ if len(negative_list) >= self.neg_samples:
294
  break
295
 
296
+ # If not enough negatives, fallback to random domain-based
297
+ if len(negative_list) < self.neg_samples:
298
+ needed = self.neg_samples - len(negative_list)
299
+ # Pass in domain_of_positive to your updated `_get_random_negatives(...)`
300
+ random_negatives = self._get_random_negatives(needed, seen, domain=domain_of_positive)
301
+ negative_list.extend(random_negatives)
302
 
303
+ all_negatives.append(negative_list)
304
 
305
  return all_negatives
306
 
 
309
  logger.warning(f"Hard negative search attempt {retry_count} failed due to missing embeddings: {ke}")
310
  if retry_count == self.max_retries:
311
  logger.error("Max retries reached for hard negative search due to missing embeddings.")
312
+ return self._fallback_negatives(queries, positives, reason="key_error")
 
313
  gc.collect()
314
  if tf.config.list_physical_devices('GPU'):
315
  tf.keras.backend.clear_session()
316
+
317
  except Exception as e:
318
  retry_count += 1
319
  logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
320
  if retry_count == self.max_retries:
321
  logger.error("Max retries reached for hard negative search.")
322
+ return self._fallback_negatives(queries, positives, reason="generic_error")
 
323
  gc.collect()
324
  if tf.config.list_physical_devices('GPU'):
325
  tf.keras.backend.clear_session()
326
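Roughly how this method is called once the query embedding cache is populated for the batch; the texts and the pipeline name are illustrative:

    # Illustrative call, not part of this commit.
    queries   = ["book a table for two", "what time do you open"]
    positives = ["I have booked a table for two at 7pm.", "We open at 9am."]
    hard_negs = pipeline._find_hard_negatives_batch(queries, positives)
    # hard_negs is a List[List[str]]: self.neg_samples candidates per query, taken from
    # FAISS neighbours first and topped up with domain-matched random picks.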
 
327
+ def _detect_domain_for_text(self, text: str) -> Optional[str]:
328
+ """
329
+ O(1) domain detection by looking up text in our dictionary.
330
+ Returns the domain if found, else None.
331
+ """
332
+ stripped_text = text.strip()
333
+ return self._text_domain_map.get(stripped_text, None)
334
+
335
+ def _get_random_negatives(self, needed: int, seen: set, domain: Optional[str] = None) -> List[str]:
336
+ """
337
+ Return a list of 'needed' random negative texts from the same domain if possible,
338
+ otherwise fallback to all-domain.
339
+ """
340
+ # 1) Filter response_pool for domain if provided
341
+ if domain:
342
+ domain_texts = [r["text"] for r in self.response_pool if r["domain"] == domain]
343
+ # fallback to entire set if insufficient domain_texts
344
+ if len(domain_texts) < needed * 2: # heuristic: require a 2x surplus before trusting the domain pool
345
+ domain_texts = [r["text"] for r in self.response_pool]
346
+ else:
347
+ domain_texts = [r["text"] for r in self.response_pool]
348
+
349
+ negatives = []
350
+ tries = 0
351
+ max_tries = needed * 10
352
+ while len(negatives) < needed and tries < max_tries:
353
+ tries += 1
354
+ candidate = random.choice(domain_texts).strip()
355
+ if candidate and candidate not in seen:
356
+ negatives.append(candidate)
357
+ seen.add(candidate)
358
+
359
+ # If still not enough, we do the best we can
360
+ if len(negatives) < needed:
361
+ logger.warning(f"Could not find enough domain-based random negatives; needed {needed}, got {len(negatives)}.")
362
+
363
+ return negatives
364
+
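A small hand-worked example of the domain-aware fallback, assuming the response pool mixes restaurant and hotel texts (values invented):

    # Illustrative only.
    pos_text = "Try the corner bistro on Main Street."
    domain   = pipeline._detect_domain_for_text(pos_text)   # "restaurant" if mapped, else None
    seen     = {pos_text}
    extras   = pipeline._get_random_negatives(needed=3, seen=seen, domain=domain)
    # extras holds up to 3 distinct same-domain responses; if the domain pool is smaller
    # than 2x the number needed, sampling falls back to the full response pool.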
365
+ def _fallback_negatives(self, queries: List[str], positives: List[str], reason: str) -> List[List[str]]:
366
+ """
367
+ Called if FAISS fails or embeddings are missing.
368
+ We use entirely random negatives for each query, ignoring FAISS,
369
+ but still attempt domain-based selection if possible.
370
  """
371
+ logger.error(f"Falling back to random negatives due to: {reason}")
372
+ all_negatives = []
373
+
374
+ for pos_text in positives:
375
+ # Build a 'seen' set with the positive
376
+ seen = {pos_text.strip()}
377
+
378
+ # Attempt to detect the domain of the positive text
379
+ domain_of_positive = self._detect_domain_for_text(pos_text)
380
+
381
+ # Use domain-based random negatives if available
382
+ negs = self._get_random_negatives(self.neg_samples, seen, domain=domain_of_positive)
383
+ all_negatives.append(negs)
384
 
385
+ return all_negatives
386
+
387
+ def build_text_to_domain_map(self):
388
+ """
389
+ Build an O(1) lookup dict: text -> domain,
390
+ so we don't have to scan the entire self.response_pool each time.
391
+ """
392
+ self._text_domain_map = {}
393
+
394
+ for item in self.response_pool:
395
+ # e.g., item = {"domain": "restaurant", "text": "some text..."}
396
+ stripped_text = item["text"].strip()
397
+ domain = item["domain"]
398
+
399
+ # The same text appearing again under the same domain is harmless.
400
+ # The same text under a different domain is a collision and needs a policy.
401
+ if stripped_text in self._text_domain_map:
402
+ existing_domain = self._text_domain_map[stripped_text]
403
+ if existing_domain != domain:
404
+ # Log a warning or decide on a policy:
405
+ logger.warning(
406
+ f"Collision detected: text '{stripped_text}' found with domains "
407
+ f"'{existing_domain}' and '{domain}'. Keeping the first."
408
+ )
409
+ # Policy: keep the first domain seen and skip overwriting.
410
+ continue
411
+ else:
412
+ # Insert into the dict
413
+ self._text_domain_map[stripped_text] = domain
414
+
415
+ logger.info(f"Built text->domain map with {len(self._text_domain_map)} unique text entries.")
416
+
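The resulting lookup is a flat dict keyed by stripped response text; on a collision the first domain seen wins, as the warning above notes. Illustrative contents:

    # self._text_domain_map (illustrative):
    # {
    #     "Try the corner bistro on Main Street.": "restaurant",
    #     "Checkout is at 11am.": "hotel",
    # }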
417
+ def encode_query(
418
+ self,
419
+ query: str,
420
+ context: Optional[List[Tuple[str, str]]] = None
421
+ ) -> np.ndarray:
422
+ """
423
+ Encode a user query (and optional conversation context) into an embedding vector.
424
+
425
  Args:
426
+ query: The user query.
427
+ context: Optional conversation history as a list of (user_text, assistant_text).
428
 
429
  Returns:
430
+ np.ndarray of shape [embedding_dim], typically L2-normalized already.
431
  """
432
+ # 1) Prepare context (if any) by concatenating user/assistant pairs
433
  if context:
434
+ # Take the last N turns
435
+ relevant_history = context[-self.config.max_context_turns:]
436
+ context_str_parts = []
437
+ for (u_text, a_text) in relevant_history:
438
+ context_str_parts.append(
439
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {u_text} "
440
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {a_text}"
441
+ )
442
+ context_str = " ".join(context_str_parts)
443
+
444
+ # Append the user's new query
445
+ full_query = (
446
+ f"{context_str} "
447
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
448
+ )
449
  else:
450
+ # Just a single user turn
451
+ full_query = (
452
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
453
+ )
454
+
455
+ # 2) Tokenize
456
  encodings = self.tokenizer(
457
+ [full_query],
458
  padding='max_length',
459
  truncation=True,
460
  max_length=self.max_length,
461
+ return_tensors='np' # to keep it compatible with FAISS
462
  )
463
  input_ids = encodings['input_ids']
464
 
465
+ # 3) Check for out-of-vocab IDs
466
  max_id = np.max(input_ids)
467
+ vocab_size = len(self.tokenizer)
468
+ if max_id >= vocab_size:
469
+ logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
 
470
  raise ValueError("Token ID exceeds vocabulary size.")
471
 
472
+ # 4) Get embeddings from the model
473
  embeddings = self.encoder(input_ids, training=False).numpy()
474
+ # The encoder is expected to L2-normalize its final embeddings, so no extra normalization is applied here.
475
+
476
+ # 5) Return the single embedding as 1D array
477
+ return embeddings[0]
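For clarity, the string handed to the tokenizer looks like this, assuming <USER> and <ASSISTANT> are registered as additional special tokens (which the index lookups above require); the texts are invented:

    # With one turn of context, full_query is roughly:
    #   "<USER> any cheap places nearby <ASSISTANT> Try the corner bistro. <USER> is it open late?"
    # Without context:
    #   "<USER> is it open late?"
    query_vec = pipeline.encode_query(
        "is it open late?",
        context=[("any cheap places nearby", "Try the corner bistro.")],
    )
    # query_vec.shape == (embedding_dim,)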
 
478
 
479
+ def encode_responses(
480
+ self,
481
+ responses: List[str],
482
+ context: Optional[List[Tuple[str, str]]] = None
483
+ ) -> np.ndarray:
484
  """
485
+ Encode multiple response texts into embedding vectors.
486
 
487
  Args:
488
+ responses: List of raw assistant responses.
489
+ context: Optional conversation context (last N turns).
490
 
491
  Returns:
492
+ np.ndarray of shape [num_responses, embedding_dim].
493
  """
494
+ # 1) Optionally fold conversation context into the response encoding.
495
+ # Plain retrieval usually skips this; when context is supplied, prepend it:
496
  if context:
497
+ relevant_history = context[-self.config.max_context_turns:]
498
+ prepared = []
499
+ for resp in responses:
500
+ context_str_parts = []
501
+ for (u_text, a_text) in relevant_history:
502
+ context_str_parts.append(
503
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {u_text} "
504
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {a_text}"
505
+ )
506
+ context_str = " ".join(context_str_parts)
507
+
508
+ # Now treat resp as an assistant turn
509
+ full_resp = (
510
+ f"{context_str} "
511
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {resp}"
512
+ )
513
+ prepared.append(full_resp)
514
  else:
515
+ # By default, just mark each response as from the assistant
516
+ prepared = [
517
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
518
+ for r in responses
519
  ]
520
+
521
+ # 2) Tokenize
522
  encodings = self.tokenizer(
523
+ prepared,
524
  padding='max_length',
525
  truncation=True,
526
  max_length=self.max_length,
527
+ return_tensors='np'
528
  )
529
  input_ids = encodings['input_ids']
530
+
531
+ # 3) Check for out-of-vocab
532
  max_id = np.max(input_ids)
533
+ vocab_size = len(self.tokenizer)
534
+ if max_id >= vocab_size:
535
+ logger.error(f"Token ID {max_id} exceeds tokenizer vocab size {vocab_size}.")
 
536
  raise ValueError("Token ID exceeds vocabulary size.")
537
+
538
+ # 4) Model forward
539
  embeddings = self.encoder(input_ids, training=False).numpy()
540
+ # Already L2-normalized, provided the encoder's final layer normalizes its output.
541
+
 
 
542
  return embeddings.astype('float32')
543
 
544
  def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
validate_model.py CHANGED
@@ -1,16 +1,17 @@
1
  import os
2
  import json
 
3
  from chatbot_model import ChatbotConfig, RetrievalChatbot
4
  from response_quality_checker import ResponseQualityChecker
5
  from chatbot_validator import ChatbotValidator
6
  from plotter import Plotter
7
  from environment_setup import EnvironmentSetup
8
-
9
  from logger_config import config_logger
 
10
  logger = config_logger(__name__)
11
 
12
  def run_interactive_chat(chatbot, quality_checker):
13
- """Separate function for interactive chat loop"""
14
  while True:
15
  try:
16
  user_input = input("You: ")
@@ -18,7 +19,7 @@ def run_interactive_chat(chatbot, quality_checker):
18
  print("\nAssistant: Goodbye!")
19
  break
20
 
21
- if user_input.lower() in ['quit', 'exit', 'bye']:
22
  print("Assistant: Goodbye!")
23
  break
24
 
@@ -26,69 +27,97 @@ def run_interactive_chat(chatbot, quality_checker):
26
  query=user_input,
27
  conversation_history=None,
28
  quality_checker=quality_checker,
29
- top_k=5
30
  )
31
 
32
  print(f"Assistant: {response}")
33
 
34
- if metrics.get('is_confident', False):
 
35
  print("\nAlternative responses:")
36
  for resp, score in candidates[1:4]:
37
  print(f"Score: {score:.4f} - {resp}")
38
  else:
39
  print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")
40
 
41
- # TODO:
42
  def validate_chatbot():
43
  # Initialize environment
44
  env = EnvironmentSetup()
45
  env.initialize()
46
 
47
- MODEL_DIR = 'models'
48
- FAISS_INDICES_DIR = os.path.join(MODEL_DIR, 'faiss_indices')
49
- FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
50
- FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_test.index')
51
- RESPONSE_POOL_PRODUCTION_PATH = FAISS_INDEX_PRODUCTION_PATH.replace('.index', '_responses.json')
52
- RESPONSE_POOL_TEST_PATH = FAISS_INDEX_TEST_PATH.replace('.index', '_responses.json')
53
- ENVIRONMENT = 'production' # or 'test'
54
- if ENVIRONMENT == 'test':
55
  FAISS_INDEX_PATH = FAISS_INDEX_TEST_PATH
56
- RESPONSE_POOL_PATH = RESPONSE_POOL_TEST_PATH
57
  else:
58
  FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
59
- RESPONSE_POOL_PATH = RESPONSE_POOL_PRODUCTION_PATH
60
-
61
- # Load config
62
- config = ChatbotConfig()
63
-
64
- # Initialize RetrievalChatbot in 'inference' mode
65
  try:
66
- chatbot = RetrievalChatbot(config=config, mode='inference')
67
- logger.info("RetrievalChatbot initialized in 'inference' mode.")
68
  except Exception as e:
69
- logger.error(f"Failed to initialize RetrievalChatbot: {e}")
70
  return
71
 
72
- # Ensure FAISS index and response pool are accessible, then load
73
  if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
74
  logger.error("FAISS index or response pool file is missing.")
75
  return
76
 
 
77
  try:
 
78
  chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
79
  logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
80
-
81
- with open(RESPONSE_POOL_PATH, 'r', encoding='utf-8') as f:
82
  chatbot.data_pipeline.response_pool = json.load(f)
83
  logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
84
-
85
  chatbot.data_pipeline.validate_faiss_index()
86
  logger.info("FAISS index and response pool validated successfully.")
 
87
  except Exception as e:
88
- logger.error(f"Failed to load FAISS index: {e}")
89
  return
90
 
91
- # Initialize ResponseQualityChecker and ChatbotValidator
92
  quality_checker = ResponseQualityChecker(data_pipeline=chatbot.data_pipeline)
93
  validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
94
  logger.info("ResponseQualityChecker and ChatbotValidator initialized.")
@@ -101,17 +130,17 @@ def validate_chatbot():
101
  logger.error(f"Validation process failed: {e}")
102
  return
103
 
104
- # Plot validation_metrics
105
- try:
106
- plotter = Plotter(save_dir=env.training_dirs['plots'])
107
- plotter.plot_validation_metrics(validation_metrics)
108
- logger.info("Validation metrics plotted successfully.")
109
- except Exception as e:
110
- logger.error(f"Failed to plot validation metrics: {e}")
111
 
112
- # Run interactive chat
113
- logger.info("\nStarting interactive chat session...")
114
- run_interactive_chat(chatbot, quality_checker)
115
 
116
- if __name__ == '__main__':
117
- validate_chatbot()
 
1
  import os
2
  import json
3
+
4
  from chatbot_model import ChatbotConfig, RetrievalChatbot
5
  from response_quality_checker import ResponseQualityChecker
6
  from chatbot_validator import ChatbotValidator
7
  from plotter import Plotter
8
  from environment_setup import EnvironmentSetup
 
9
  from logger_config import config_logger
10
+
11
  logger = config_logger(__name__)
12
 
13
  def run_interactive_chat(chatbot, quality_checker):
14
+ """Separate function for interactive chat loop."""
15
  while True:
16
  try:
17
  user_input = input("You: ")
 
19
  print("\nAssistant: Goodbye!")
20
  break
21
 
22
+ if user_input.lower() in ["quit", "exit", "bye"]:
23
  print("Assistant: Goodbye!")
24
  break
25
 
 
27
  query=user_input,
28
  conversation_history=None,
29
  quality_checker=quality_checker,
30
+ top_k=10
31
  )
32
 
33
  print(f"Assistant: {response}")
34
 
35
+ # Show alternative responses if confident
36
+ if metrics.get("is_confident", False):
37
  print("\nAlternative responses:")
38
  for resp, score in candidates[1:4]:
39
  print(f"Score: {score:.4f} - {resp}")
40
  else:
41
  print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")
42
 
 
43
  def validate_chatbot():
44
  # Initialize environment
45
  env = EnvironmentSetup()
46
  env.initialize()
47
 
48
+ MODEL_DIR = "new_iteration/data_prep_iterative_models"
49
+ FAISS_INDICES_DIR = os.path.join(MODEL_DIR, "faiss_indices")
50
+ FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_production.index")
51
+ FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_test.index")
52
+
53
+ # Toggle 'production' or 'test' env
54
+ ENVIRONMENT = "production"
55
+ if ENVIRONMENT == "test":
56
  FAISS_INDEX_PATH = FAISS_INDEX_TEST_PATH
57
+ RESPONSE_POOL_PATH = FAISS_INDEX_TEST_PATH.replace(".index", "_responses.json")
58
  else:
59
  FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
60
+ RESPONSE_POOL_PATH = FAISS_INDEX_PRODUCTION_PATH.replace(".index", "_responses.json")
61
+
62
+ # Load the config
63
+ config_path = os.path.join(MODEL_DIR, "config.json")
64
+ if os.path.exists(config_path):
65
+ with open(config_path, "r", encoding="utf-8") as f:
66
+ config_dict = json.load(f)
67
+ config = ChatbotConfig.from_dict(config_dict)
68
+ logger.info(f"Loaded ChatbotConfig from {config_path}")
69
+ else:
70
+ config = ChatbotConfig()
71
+ logger.warning("No config.json found. Using default ChatbotConfig.")
72
+
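For reference, a minimal config that this branch would accept might carry the fields referenced elsewhere in this commit; the values are illustrative and ChatbotConfig defines more options than shown:

    # Illustrative only -- field names taken from usages in this commit, values invented.
    config_dict = {
        "embedding_dim": 768,
        "max_context_token_limit": 512,
        "max_context_turns": 5,
    }
    config = ChatbotConfig.from_dict(config_dict)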
73
+ # Load RetrievalChatbot in 'inference' mode using the classmethod
74
+ # This:
75
+ # - Loads shared_encoder submodule
76
+ # - Loads encoder_custom_weights.weights.h5
77
+ # - Loads tokenizer
78
+ # - Prepares the model for inference
79
  try:
80
+ chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
81
+ logger.info("RetrievalChatbot loaded in 'inference' mode successfully.")
82
  except Exception as e:
83
+ logger.error(f"Failed to load RetrievalChatbot: {e}")
84
  return
85
 
86
+ # Confirm FAISS index & response pool exist
87
  if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
88
  logger.error("FAISS index or response pool file is missing.")
89
  return
90
 
91
+ # Load specific FAISS index and response pool
92
  try:
93
+ # Even though load_model might auto-load an index, we override here with the specific file
94
  chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
95
  logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
96
+
97
+ print("FAISS dimensions:", chatbot.data_pipeline.index.d)
98
+ print("FAISS index type:", type(chatbot.data_pipeline.index))
99
+ print("FAISS index total vectors:", chatbot.data_pipeline.index.ntotal)
100
+ print("FAISS is_trained:", chatbot.data_pipeline.index.is_trained)
101
+
102
+ with open(RESPONSE_POOL_PATH, "r", encoding="utf-8") as f:
103
  chatbot.data_pipeline.response_pool = json.load(f)
104
  logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
105
+
106
+ print("Sample from response pool (first 10):")
107
+ for i, response in enumerate(chatbot.data_pipeline.response_pool[:10]):
108
+ print(f"{i}: {response}")
109
+
110
+ print("\nTotal responses in pool:", len(chatbot.data_pipeline.response_pool))
111
+
112
+ # Validate dimension consistency
113
  chatbot.data_pipeline.validate_faiss_index()
114
  logger.info("FAISS index and response pool validated successfully.")
115
+
116
  except Exception as e:
117
+ logger.error(f"Failed to load or validate FAISS index: {e}")
118
  return
119
 
120
+ # Init QualityChecker and Validator
121
  quality_checker = ResponseQualityChecker(data_pipeline=chatbot.data_pipeline)
122
  validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
123
  logger.info("ResponseQualityChecker and ChatbotValidator initialized.")
 
130
  logger.error(f"Validation process failed: {e}")
131
  return
132
 
133
+ # Plot metrics
134
+ # try:
135
+ # plotter = Plotter(save_dir=env.training_dirs["plots"])
136
+ # plotter.plot_validation_metrics(validation_metrics)
137
+ # logger.info("Validation metrics plotted successfully.")
138
+ # except Exception as e:
139
+ # logger.error(f"Failed to plot validation metrics: {e}")
140
 
141
+ # Run interactive chat loop
142
+ # logger.info("\nStarting interactive chat session...")
143
+ # run_interactive_chat(chatbot, quality_checker)
144
 
145
+ if __name__ == "__main__":
146
+ validate_chatbot()
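With the paths above in place, the validation entry point is run directly, e.g. python validate_model.py.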