JoeArmani committed
Commit 5b413d1 · 1 Parent(s): ee0f664

training and inference updates

chatbot_model.py CHANGED
@@ -1,7 +1,6 @@
1
- import time
2
  from transformers import TFAutoModel, AutoTokenizer
3
  import tensorflow as tf
4
- import numpy as np
5
  from typing import List, Tuple, Dict, Optional, Union, Any
6
  import math
7
  from dataclasses import dataclass
@@ -66,23 +65,17 @@ class EncoderModel(tf.keras.Model):
66
  super().__init__(name=name, **kwargs)
67
  self.config = config
68
 
69
- # Load pretrained model
70
  self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)
71
-
72
- # Freeze layers based on config
73
  self._freeze_layers()
74
 
75
- # Pooling layer (Global Average Pooling)
76
  self.pooler = tf.keras.layers.GlobalAveragePooling1D()
77
-
78
- # Projection layer
79
  self.projection = tf.keras.layers.Dense(
80
  config.embedding_dim,
81
  activation='tanh',
82
  name="projection"
83
  )
84
-
85
- # Dropout and normalization
86
  self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
87
  self.normalize = tf.keras.layers.Lambda(
88
  lambda x: tf.nn.l2_normalize(x, axis=1),
@@ -110,13 +103,13 @@ class EncoderModel(tf.keras.Model):
110
  """Forward pass."""
111
  # Get pretrained embeddings
112
  pretrained_outputs = self.pretrained(inputs, training=training)
113
- x = pretrained_outputs.last_hidden_state # Shape: [batch_size, seq_len, embedding_dim]
114
 
115
  # Apply pooling, projection, dropout, and normalization
116
- x = self.pooler(x) # Shape: [batch_size, 768]
117
- x = self.projection(x) # Shape: [batch_size, 768]
118
- x = self.dropout(x, training=training) # Apply dropout
119
- x = self.normalize(x) # Shape: [batch_size, 768]
120
 
121
  return x
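For orientation, a minimal sketch of how this encoder is meant to be exercised end to end, using the ChatbotConfig and EncoderModel defined in this file (the default-constructed config, sequence length, and omission of special-token resizing are assumptions):

    import tensorflow as tf
    from transformers import AutoTokenizer

    config = ChatbotConfig()  # assumed to carry pretrained_model, embedding_dim, dropout_rate, etc.
    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
    encoder = EncoderModel(config, name="shared_encoder")

    # Tokenize a toy query and run the forward pass shown above.
    enc = tokenizer(["how do I reset my password?"], padding="max_length",
                    truncation=True, max_length=64, return_tensors="tf")
    emb = encoder(enc["input_ids"], training=False)  # [1, config.embedding_dim]

    # The output is L2-normalized, so dot products behave as cosine similarities.
    tf.debugging.assert_near(tf.norm(emb, axis=1), tf.ones([1]), atol=1e-5)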
122
 
@@ -134,12 +127,11 @@ class RetrievalChatbot(DeviceAwareModel):
134
  def __init__(
135
  self,
136
  config: ChatbotConfig,
137
- dialogues: List[dict] = [],
138
  device: str = None,
139
  strategy=None,
140
  reranker: Optional[CrossEncoderReranker] = None,
141
  summarizer: Optional[Summarizer] = None,
142
- mode: str = 'preparation'
143
  ):
144
  super().__init__()
145
  self.config = config
@@ -147,17 +139,37 @@ class RetrievalChatbot(DeviceAwareModel):
147
  self.device = device or self._setup_default_device()
148
  self.mode = mode.lower()
149
 
150
- # Initialize reranker, summarizer, tokenizer, and memory monitor
151
  self.reranker = reranker or self._initialize_reranker()
152
- self.summarizer = summarizer or self._initialize_summarizer()
153
  self.tokenizer = self._initialize_tokenizer()
 
 
154
  self.memory_monitor = GPUMemoryMonitor()
155
 
156
- # # Initialize models
157
- # self.min_batch_size = 8
158
- # self.max_batch_size = 128
159
- # self.current_batch_size = 32
 
161
  # Initialize training history
162
  self.history = {
163
  "train_loss": [],
@@ -165,15 +177,7 @@ class RetrievalChatbot(DeviceAwareModel):
165
  "train_metrics": {},
166
  "val_metrics": {}
167
  }
168
-
169
- # Collect unique responses from dialogues
170
- if self.mode == 'preparation':
171
- # Collect unique responses from dialogues only in preparation mode
172
- self.response_pool, self.unique_responses = self._collect_responses(dialogues)
173
- else:
174
- # In training mode, assume response_pool is handled via TFRecord
175
- self.response_pool = []
176
- self.unique_responses = []
177
 
178
  def _setup_default_device(self) -> str:
179
  """Set up default device if none is provided."""
@@ -189,8 +193,13 @@ class RetrievalChatbot(DeviceAwareModel):
189
 
190
  def _initialize_summarizer(self) -> Summarizer:
191
  """Initialize the Summarizer."""
192
- logger.info("Initializing default Summarizer...")
193
- return Summarizer(device=self.device)
 
194
 
195
  def _initialize_tokenizer(self) -> AutoTokenizer:
196
  """Initialize the tokenizer and add special tokens."""
@@ -207,559 +216,127 @@ class RetrievalChatbot(DeviceAwareModel):
207
  )
208
  return tokenizer
209
 
210
- def _collect_responses(self, dialogues: List[dict]) -> Tuple[List[str], List[str]]:
211
- """
212
- Collect unique responses from dialogues.
213
- Returns:
214
- response_pool: List of all possible responses.
215
- unique_responses: List of unique responses.
216
- """
217
- logger.info("Collecting unique responses from dialogues...")
218
- responses = set()
219
- for dialogue in dialogues:
220
- turns = dialogue.get('turns', [])
221
- for turn in turns:
222
- if turn.get('speaker') == 'assistant' and 'text' in turn:
223
- response = turn['text'].strip()
224
- if len(response) >= self.config.min_text_length:
225
- responses.add(response)
226
- response_pool = list(responses)
227
- unique_responses = list(responses) # Assuming uniqueness
228
- logger.info(f"Collected {len(response_pool)} unique responses.")
229
- return response_pool, unique_responses
230
-
231
- def build_models(self):
232
- """Initialize the shared encoder and FAISS index."""
233
- logger.info("Building encoder model...")
234
- tf.keras.backend.clear_session()
235
-
236
- # Shared encoder for both queries and responses
237
- self.encoder = EncoderModel(
238
  self.config,
239
  name="shared_encoder",
240
  )
241
 
242
- # Resize token embeddings after adding special tokens
243
  new_vocab_size = len(self.tokenizer)
244
- self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
245
  logger.info(f"Token embeddings resized to: {new_vocab_size}")
246
-
247
- if self.mode == 'preparation':
248
- # Initialize FAISS index only in preparation mode
249
- self._initialize_faiss()
250
- # Compute and index embeddings
251
- self._compute_and_index_embeddings()
252
- else:
253
- # In training mode, skip FAISS indexing from dialogues
254
- logger.info("Training mode: Skipping FAISS index initialization from dialogues.")
255
-
256
- # Retrieve embedding dimension from encoder
257
- embedding_dim = self.config.embedding_dim
258
- vocab_size = len(self.tokenizer)
259
-
260
- logger.info(f"Encoder Embedding Dimension: {embedding_dim}")
261
- logger.info(f"Encoder Embedding Vocabulary Size: {vocab_size}")
262
- if vocab_size >= embedding_dim:
263
- logger.info("Encoder model built and embeddings resized successfully.")
264
- else:
265
- logger.error("Vocabulary size is less than embedding dimension.")
266
- raise ValueError("Vocabulary size is less than embedding dimension.")
267
 
268
- def _adjust_batch_size(self) -> None:
269
- """Dynamically adjust batch size based on GPU memory usage."""
270
- if self.memory_monitor.should_reduce_batch_size():
271
- new_size = max(self.min_batch_size, self.current_batch_size // 2)
272
- if new_size != self.current_batch_size:
273
- logger.info(f"Reducing batch size to {new_size} due to high memory usage")
274
- self.current_batch_size = new_size
275
- gc.collect()
276
- if tf.config.list_physical_devices('GPU'):
277
- tf.keras.backend.clear_session()
278
- elif self.memory_monitor.can_increase_batch_size():
279
- new_size = min(self.max_batch_size, self.current_batch_size * 2)
280
- if new_size != self.current_batch_size:
281
- logger.info(f"Increasing batch size to {new_size}")
282
- self.current_batch_size = new_size
283
-
284
- def _initialize_faiss(self):
285
- """Initialize FAISS with safe GPU handling and memory monitoring."""
286
- logger.info("Initializing FAISS index...")
287
-
288
- # Detect if we have GPU-enabled FAISS
289
- self.faiss_gpu = False
290
- self.gpu_resources = []
291
-
292
- try:
293
- if hasattr(faiss, 'get_num_gpus'):
294
- ngpus = faiss.get_num_gpus()
295
- if ngpus > 0:
296
- # Configure GPU resources with memory limit
297
- for i in range(ngpus):
298
- res = faiss.StandardGpuResources()
299
- # Set temp memory to 1/4 of total memory to avoid OOM
300
- if self.memory_monitor.has_gpu:
301
- stats = self.memory_monitor.get_memory_stats()
302
- if stats:
303
- temp_memory = int(stats.total * 0.25) # 25% of total memory
304
- res.setTempMemory(temp_memory)
305
- self.gpu_resources.append(res)
306
- self.faiss_gpu = True
307
- logger.info(f"FAISS GPU resources initialized on {ngpus} GPUs")
308
- except Exception as e:
309
- logger.warning(f"Using CPU due to GPU initialization error: {e}")
310
-
311
  try:
312
- # Create appropriate index based on dataset size
313
- if len(self.unique_responses) < 1000:
314
- logger.info("Small dataset detected, using simple FlatIP index")
315
- self.index = faiss.IndexFlatIP(self.config.embedding_dim)
 
316
  else:
317
- # For larger datasets, consider using more efficient indices like IVF
318
- self.index = faiss.IndexFlatIP(self.config.embedding_dim)
 
 
 
 
319
 
320
- # Move to GPU(s) if available and needed
321
- if self.faiss_gpu and self.gpu_resources:
322
- try:
323
- if len(self.gpu_resources) > 1:
324
- self.index = faiss.index_cpu_to_gpus_list(self.index, self.gpu_resources)
325
- logger.info("FAISS index distributed across multiple GPUs")
326
- else:
327
- self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, self.index)
328
- logger.info("FAISS index moved to single GPU")
329
- except Exception as e:
330
- logger.warning(f"Failed to move index to GPU: {e}. Falling back to CPU")
331
- self.faiss_gpu = False
332
  except Exception as e:
333
- logger.error(f"Error initializing FAISS: {e}")
334
  raise
335
-
336
- def encode_responses(
337
- self,
338
- responses: List[str],
339
- batch_size: int = 64
340
- ) -> tf.Tensor:
341
- """
342
- Encodes responses with more conservative memory management.
343
  """
344
- if not responses:
345
- logger.info("No responses to encode. Returning empty tensor.")
346
- return tf.constant([], dtype=tf.float32)
347
-
348
- all_embeddings = []
349
- self.current_batch_size = batch_size
350
-
351
- if self.memory_monitor.has_gpu:
352
- batch_size = 128
353
-
354
- total_processed = 0
355
 
356
- with tqdm(total=len(responses), desc="Encoding responses") as pbar:
357
- while total_processed < len(responses):
358
- # Monitor memory and adjust batch size
359
- if self.memory_monitor.has_gpu:
360
- gpu_usage = self.memory_monitor.get_memory_usage()
361
- if gpu_usage > 0.8: # Over 80% usage
362
- self.current_batch_size = max(128, self.current_batch_size // 2)
363
- logger.info(f"High GPU memory usage ({gpu_usage:.1%}), reducing batch size to {self.current_batch_size}")
364
- gc.collect()
365
- tf.keras.backend.clear_session()
366
-
367
- # Get batch
368
- end_idx = min(total_processed + self.current_batch_size, len(responses))
369
- batch_texts = responses[total_processed:end_idx]
370
-
371
- try:
372
- # Tokenize
373
- encodings = self.tokenizer(
374
- batch_texts,
375
- padding='max_length',
376
- truncation=True,
377
- max_length=self.config.max_context_token_limit,
378
- return_tensors='tf'
379
- )
380
-
381
- # Encode
382
- embeddings_batch = self.encoder(encodings['input_ids'], training=False)
383
-
384
- # Cast to float32
385
- if embeddings_batch.dtype != tf.float32:
386
- embeddings_batch = tf.cast(embeddings_batch, tf.float32)
387
-
388
- # Store
389
- all_embeddings.append(embeddings_batch)
390
-
391
- # Update progress
392
- batch_processed = len(batch_texts)
393
- total_processed += batch_processed
394
-
395
- # Update progress bar
396
- if self.memory_monitor.has_gpu:
397
- gpu_usage = self.memory_monitor.get_memory_usage()
398
- pbar.set_postfix({
399
- 'GPU mem': f'{gpu_usage:.1%}',
400
- 'batch_size': self.current_batch_size
401
- })
402
- pbar.update(batch_processed)
403
-
404
- # Memory cleanup every 1000 samples
405
- if total_processed % 1000 == 0:
406
- gc.collect()
407
- if tf.config.list_physical_devices('GPU'):
408
- tf.keras.backend.clear_session()
409
-
410
- except tf.errors.ResourceExhaustedError:
411
- logger.warning("GPU memory exhausted during encoding, reducing batch size")
412
- self.current_batch_size = max(8, self.current_batch_size // 2)
413
- continue
414
-
415
- except Exception as e:
416
- logger.error(f"Error during encoding: {str(e)}")
417
- raise
418
-
419
- # Concatenate results
420
- if not all_embeddings:
421
- logger.info("No embeddings were encoded. Returning empty tensor.")
422
- return tf.constant([], dtype=tf.float32)
423
-
424
- if len(all_embeddings) == 1:
425
- final_embeddings = all_embeddings[0]
426
- else:
427
- final_embeddings = tf.concat(all_embeddings, axis=0)
428
-
429
- return final_embeddings
430
-
431
- def _train_faiss_index(self, response_embeddings: np.ndarray) -> None:
432
- """Train FAISS index with better memory management and robust fallback mechanisms."""
433
- if self.index.is_trained:
434
- logger.info("Index already trained, skipping training phase")
435
- return
436
-
437
- logger.info("Starting FAISS index training...")
438
 
439
- try:
440
- # First attempt: Try training with smaller subset
441
- subset_size = min(5000, len(response_embeddings)) # Reduced from 10000
442
- logger.info(f"Using {subset_size} samples for initial training attempt")
443
- subset_idx = np.random.choice(len(response_embeddings), subset_size, replace=False)
444
- training_embeddings = response_embeddings[subset_idx].copy() # Make a copy
445
-
446
- # Ensure contiguous memory layout
447
- training_embeddings = np.ascontiguousarray(training_embeddings)
448
-
449
- # Force cleanup before training
450
- gc.collect()
451
- if tf.config.list_physical_devices('GPU'):
452
- tf.keras.backend.clear_session()
453
-
454
- # Verify data properties
455
- logger.info(f"FAISS training data shape: {training_embeddings.shape}")
456
- logger.info(f"FAISS training data dtype: {training_embeddings.dtype}")
457
-
458
- logger.info("Starting initial training attempt...")
459
- self.index.train(training_embeddings)
460
- logger.info("Training completed successfully")
461
-
462
- except (RuntimeError, Exception) as e:
463
- logger.warning(f"Initial training attempt failed: {str(e)}")
464
- logger.info("Attempting fallback strategy...")
465
-
466
- try:
467
- # Move to CPU for more stable training
468
- if self.faiss_gpu:
469
- logger.info("Moving index to CPU for fallback training")
470
- cpu_index = faiss.index_gpu_to_cpu(self.index)
471
- else:
472
- cpu_index = self.index
473
-
474
- # Create simpler index type if needed
475
- if isinstance(cpu_index, faiss.IndexIVFFlat):
476
- logger.info("Creating simpler FlatL2 index for fallback")
477
- cpu_index = faiss.IndexFlatL2(self.config.embedding_dim)
478
-
479
- # Use even smaller subset for CPU training
480
- subset_size = min(2000, len(response_embeddings))
481
- subset_idx = np.random.choice(len(response_embeddings), subset_size, replace=False)
482
- fallback_embeddings = response_embeddings[subset_idx].copy()
483
-
484
- # Ensure data is properly formatted
485
- if not fallback_embeddings.flags['C_CONTIGUOUS']:
486
- fallback_embeddings = np.ascontiguousarray(fallback_embeddings)
487
- if fallback_embeddings.dtype != np.float32:
488
- fallback_embeddings = fallback_embeddings.astype(np.float32)
489
-
490
- # Train on CPU
491
- logger.info("Training fallback index on CPU...")
492
- cpu_index.train(fallback_embeddings)
493
-
494
- # Move back to GPU if needed
495
- if self.faiss_gpu:
496
- logger.info("Moving trained index back to GPU...")
497
- if len(self.gpu_resources) > 1:
498
- self.index = faiss.index_cpu_to_gpus_list(cpu_index, self.gpu_resources)
499
- else:
500
- self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, cpu_index)
501
- else:
502
- self.index = cpu_index
503
-
504
- logger.info("Fallback training completed successfully")
505
-
506
- except Exception as e2:
507
- logger.error(f"Fallback training also failed: {str(e2)}")
508
- logger.warning("Creating basic brute-force index as last resort")
509
-
510
- try:
511
- # Create basic brute-force index as last resort
512
- dim = response_embeddings.shape[1]
513
- basic_index = faiss.IndexFlatL2(dim)
514
-
515
- if self.faiss_gpu:
516
- if len(self.gpu_resources) > 1:
517
- self.index = faiss.index_cpu_to_gpus_list(basic_index, self.gpu_resources)
518
- else:
519
- self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, basic_index)
520
- else:
521
- self.index = basic_index
522
-
523
- logger.info("Basic index created as fallback")
524
-
525
- except Exception as e3:
526
- logger.error(f"All training attempts failed: {str(e3)}")
527
- raise RuntimeError("Unable to create working FAISS index")
528
-
529
- def _add_vectors_to_index(self, response_embeddings: np.ndarray) -> None:
530
- """Add vectors to FAISS index with enhanced memory management."""
531
- logger.info("Starting vector addition process...")
532
 
533
- # Even smaller batches
534
- initial_batch_size = 128
535
- min_batch_size = 32
536
- max_batch_size = 1024
537
 
538
- total_added = 0
539
- retry_count = 0
540
- max_retries = 5
 
 
541
 
542
- while total_added < len(response_embeddings):
543
- try:
544
- # Monitor memory
545
- if self.memory_monitor.has_gpu:
546
- gpu_usage = self.memory_monitor.get_memory_usage()
547
- #logger.info(f"GPU memory usage before batch: {gpu_usage:.1%}")
548
-
549
- # Force cleanup if memory usage is high
550
- if gpu_usage > 0.7: # Lower threshold to 70%
551
- logger.info("High memory usage detected, forcing cleanup")
552
- gc.collect()
553
- tf.keras.backend.clear_session()
554
-
555
- # Get batch
556
- end_idx = min(total_added + initial_batch_size, len(response_embeddings))
557
- batch = response_embeddings[total_added:end_idx]
558
-
559
- # Add batch
560
- self.index.add(batch)
561
-
562
- # Update progress
563
- batch_size = len(batch)
564
- total_added += batch_size
565
-
566
- # Memory cleanup every few batches
567
- if total_added % (initial_batch_size * 5) == 0:
568
- gc.collect()
569
- if tf.config.list_physical_devices('GPU'):
570
- tf.keras.backend.clear_session()
571
-
572
- # Gradually increase batch size
573
- if initial_batch_size < max_batch_size:
574
- initial_batch_size = min(initial_batch_size + 25, max_batch_size)
575
-
576
- except Exception as e:
577
- logger.warning(f"Error adding batch: {str(e)}")
578
- retry_count += 1
579
-
580
- if retry_count > max_retries:
581
- logger.error("Max retries exceeded.")
582
- raise
583
-
584
- # Reduce batch size
585
- initial_batch_size = max(min_batch_size, initial_batch_size // 2)
586
- logger.info(f"Reducing batch size to {initial_batch_size} and retrying...")
587
-
588
- # Cleanup
589
- gc.collect()
590
- if tf.config.list_physical_devices('GPU'):
591
- tf.keras.backend.clear_session()
592
-
593
- time.sleep(1) # Brief pause before retry
594
 
595
- logger.info(f"Successfully added all {total_added} vectors to index")
596
-
597
- def _add_vectors_cpu_fallback(self, remaining_embeddings: np.ndarray, already_added: int = 0) -> None:
598
- """CPU fallback with extra safeguards and progress tracking."""
599
- logger.info(f"CPU Fallback: Adding {len(remaining_embeddings)} remaining vectors...")
600
601
  try:
602
- # Move index to CPU
603
- if self.faiss_gpu:
604
- logger.info("Moving index to CPU...")
605
- cpu_index = faiss.index_gpu_to_cpu(self.index)
 
606
  else:
607
- cpu_index = self.index
608
-
609
- # Add remaining vectors on CPU with very small batches
610
- batch_size = 128
611
- total_added = already_added
612
-
613
- for i in range(0, len(remaining_embeddings), batch_size):
614
- end_idx = min(i + batch_size, len(remaining_embeddings))
615
- batch = remaining_embeddings[i:end_idx]
616
-
617
- # Add batch
618
- cpu_index.add(batch)
619
-
620
- # Update progress
621
- total_added += len(batch)
622
- if i % (batch_size * 10) == 0:
623
- logger.info(f"Added {total_added} vectors total "
624
- f"({i}/{len(remaining_embeddings)} in current phase)")
625
-
626
- # Periodic cleanup
627
- if i % (batch_size * 20) == 0:
628
- gc.collect()
629
 
630
- # Move back to GPU if needed
631
- if self.faiss_gpu:
632
- logger.info("Moving index back to GPU...")
633
- if len(self.gpu_resources) > 1:
634
- self.index = faiss.index_cpu_to_gpus_list(cpu_index, self.gpu_resources)
635
- else:
636
- self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, cpu_index)
637
  else:
638
- self.index = cpu_index
639
-
640
- logger.info("CPU fallback completed successfully")
641
-
642
- except Exception as e:
643
- logger.error(f"Error during CPU fallback: {str(e)}")
644
- raise
645
-
646
- def _compute_and_index_embeddings(self):
647
- """Compute embeddings and build FAISS index with simpler handling."""
648
- logger.info("Computing embeddings and indexing with FAISS...")
649
-
650
- try:
651
- # Encode responses with memory monitoring
652
- logger.info("Encoding unique responses")
653
- response_embeddings = self.encode_responses(self.unique_responses)
654
- response_embeddings = response_embeddings.numpy()
655
-
656
- # Memory cleanup after encoding
657
- gc.collect()
658
- if tf.config.list_physical_devices('GPU'):
659
- tf.keras.backend.clear_session()
660
-
661
- # Ensure float32 and memory contiguous
662
- response_embeddings = response_embeddings.astype('float32')
663
- response_embeddings = np.ascontiguousarray(response_embeddings)
664
-
665
- # Log memory state before normalization
666
- if self.memory_monitor.has_gpu:
667
- stats = self.memory_monitor.get_memory_stats()
668
- if stats:
669
- logger.info(f"GPU memory before normalization: {stats.used/1e9:.2f}GB used")
670
-
671
- # Normalize embeddings
672
- logger.info("Normalizing embeddings with FAISS")
673
- faiss.normalize_L2(response_embeddings)
674
-
675
- # Create and initialize simple FlatIP index
676
- dim = response_embeddings.shape[1]
677
- if self.faiss_gpu:
678
- cpu_index = faiss.IndexFlatIP(dim)
679
- if len(self.gpu_resources) > 1:
680
- self.index = faiss.index_cpu_to_gpus_list(cpu_index, self.gpu_resources)
681
- else:
682
- self.index = faiss.index_cpu_to_gpu(self.gpu_resources[0], 0, cpu_index)
683
- else:
684
- self.index = faiss.IndexFlatIP(dim)
685
-
686
- # Add vectors to index
687
- self._add_vectors_to_index(response_embeddings)
688
-
689
- # Store responses and embeddings
690
- self.response_pool = self.unique_responses
691
- self.response_embeddings = response_embeddings
692
-
693
- # Final memory cleanup
694
- gc.collect()
695
- if tf.config.list_physical_devices('GPU'):
696
- tf.keras.backend.clear_session()
697
-
698
- # Log final state
699
- logger.info(f"Successfully indexed {self.index.ntotal} responses")
700
- if self.memory_monitor.has_gpu:
701
- stats = self.memory_monitor.get_memory_stats()
702
- if stats:
703
- logger.info(f"Final GPU memory usage: {stats.used/1e9:.2f}GB used")
704
-
705
- logger.info("Indexing completed successfully")
706
707
  except Exception as e:
708
- logger.error(f"Error during indexing: {e}")
709
- # Ensure cleanup even on error
710
- gc.collect()
711
- if tf.config.list_physical_devices('GPU'):
712
- tf.keras.backend.clear_session()
713
  raise
714
-
715
- def verify_faiss_index(self):
716
- """Verify that FAISS index matches the response pool, if index exists."""
717
- if not hasattr(self, 'index') or self.index is None:
718
- logger.info("FAISS index not initialized. Skipping verification.")
719
- return
720
-
721
- indexed_size = self.index.ntotal
722
- pool_size = len(self.response_pool)
723
- logger.info(f"FAISS index size: {indexed_size}")
724
- logger.info(f"Response pool size: {pool_size}")
725
- if indexed_size != pool_size:
726
- logger.warning("Mismatch between FAISS index size and response pool size.")
727
- else:
728
- logger.info("FAISS index correctly matches the response pool.")
729
-
730
- def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
731
- """Encode a query with optional conversation context."""
732
- # Prepare query with context
733
- if context:
734
- context_str = ' '.join([
735
- f"{self.special_tokens['user']} {q} "
736
- f"{self.special_tokens['assistant']} {r}"
737
- for q, r in context[-self.config.max_context_turns:]
738
- ])
739
- query = f"{context_str} {self.special_tokens['user']} {query}"
740
- else:
741
- query = f"{self.special_tokens['user']} {query}"
742
 
743
- # Tokenize and encode
744
- encodings = self.tokenizer(
745
- [query],
746
- padding='max_length',
747
- truncation=True,
748
- max_length=self.config.max_context_token_limit,
749
- return_tensors='tf'
750
- )
751
- input_ids = encodings['input_ids']
752
 
753
- # Verify token IDs
754
- max_id = tf.reduce_max(input_ids).numpy()
755
- new_vocab_size = len(self.tokenizer)
756
 
757
- if max_id >= new_vocab_size:
758
- logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
759
- raise ValueError("Token ID exceeds vocabulary size.")
760
 
761
- # Get embeddings from the shared encoder
762
- return self.encoder(input_ids, training=False)
763
 
764
  def retrieve_responses_cross_encoder(
765
  self,
@@ -786,7 +363,7 @@ class RetrievalChatbot(DeviceAwareModel):
786
 
787
  # 2) Dense retrieval
788
  dense_topk = self.retrieve_responses_faiss(query, top_k=top_k) # [(resp, dense_score), ...]
789
-
790
  if not dense_topk:
791
  return []
792
 
@@ -800,75 +377,228 @@ class RetrievalChatbot(DeviceAwareModel):
800
  combined.sort(key=lambda x: x[1], reverse=True)
801
 
802
  return combined
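A standalone sketch of the retrieve-then-re-rank combination used above, with dummy scores standing in for the FAISS and cross-encoder outputs (the reranker is assumed to return one score per candidate, in candidate order):

    from typing import List, Tuple

    def combine_with_cross_scores(dense_topk: List[Tuple[str, float]],
                                  cross_scores: List[float]) -> List[Tuple[str, float]]:
        # Keep the candidate text, swap in the cross-encoder score, sort descending.
        combined = [(text, score) for (text, _), score in zip(dense_topk, cross_scores)]
        combined.sort(key=lambda x: x[1], reverse=True)
        return combined

    dense_topk = [("reset it from the settings page", 0.71), ("please contact support", 0.69)]
    print(combine_with_cross_scores(dense_topk, cross_scores=[0.12, 0.88]))
    # [('please contact support', 0.88), ('reset it from the settings page', 0.12)]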
 
803
 
804
  def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
805
  """Retrieve top-k responses using FAISS."""
806
- if not hasattr(self, 'index') or self.index is None:
807
  logger.warning("FAISS index not initialized. Cannot retrieve responses.")
808
  return []
809
 
810
- # Encode the query
811
- q_emb = self.encode_query(query) # Shape: [1, embedding_dim]
812
  q_emb_np = q_emb.numpy().astype('float32') # Ensure type match
813
 
814
  # Normalize the query embedding for cosine similarity
815
  faiss.normalize_L2(q_emb_np)
816
 
817
  # Search the FAISS index
818
- distances, indices = self.index.search(q_emb_np, top_k)
819
 
820
  # Map indices to responses and distances to similarities
821
  top_responses = []
822
  for i, idx in enumerate(indices[0]):
823
- if idx < len(self.response_pool):
824
- top_responses.append((self.response_pool[idx], float(distances[0][i])))
825
  else:
826
  logger.warning(f"FAISS returned invalid index {idx}. Skipping.")
827
 
828
  return top_responses
829
-
830
- def save_models(self, save_dir: Union[str, Path]):
831
- """Save models and configuration."""
832
- save_dir = Path(save_dir)
833
- save_dir.mkdir(parents=True, exist_ok=True)
834
 
835
- # Save config
836
- with open(save_dir / "config.json", "w") as f:
837
- json.dump(self.config.to_dict(), f, indent=2)
838
-
839
- # Save models
840
- self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
841
 
842
- # Save tokenizer
843
- self.tokenizer.save_pretrained(save_dir / "tokenizer")
844
 
845
- logger.info(f"Models and tokenizer saved to {save_dir}.")
846
-
847
- @classmethod
848
- def load_models(cls, load_dir: Union[str, Path]) -> 'RetrievalChatbot':
849
- """Load saved models and configuration."""
850
- load_dir = Path(load_dir)
851
 
852
- # Load config
853
- with open(load_dir / "config.json", "r") as f:
854
- config = ChatbotConfig.from_dict(json.load(f))
855
-
856
- # Initialize chatbot
857
- chatbot = cls(config)
 
858
 
859
- # Load models
860
- chatbot.encoder.pretrained = TFAutoModel.from_pretrained(
861
- load_dir / "shared_encoder",
862
- config=config
863
- )
 
864
 
865
- # Load tokenizer
866
- chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
 
867
 
868
- logger.info(f"Models and tokenizer loaded from {load_dir}.")
869
- return chatbot
 
870
 
871
- def train_streaming(
872
  self,
873
  tfrecord_file_path: str,
874
  epochs: int = 20,
@@ -876,10 +606,12 @@ class RetrievalChatbot(DeviceAwareModel):
876
  validation_split: float = 0.2,
877
  checkpoint_dir: str = "checkpoints/",
878
  use_lr_schedule: bool = True,
879
- peak_lr: float = 2e-5,
880
  warmup_steps_ratio: float = 0.1,
881
  early_stopping_patience: int = 3,
882
  min_delta: float = 1e-4,
 
 
883
  ) -> None:
884
  """Training using a pre-prepared TFRecord dataset."""
885
  logger.info("Starting training with pre-prepared TFRecord dataset...")
@@ -908,8 +640,8 @@ class RetrievalChatbot(DeviceAwareModel):
908
  negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
909
  negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
910
 
911
- return query_ids, positive_ids, negative_ids
912
-
913
  # Calculate total steps by counting the number of records in the TFRecord
914
  raw_dataset = tf.data.TFRecordDataset(tfrecord_file_path)
915
  total_pairs = sum(1 for _ in raw_dataset)
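For context, a sketch of what parse_fn is assumed to look like for this TFRecord layout; the feature names and the [neg_samples, max_length] reshape come from the hunk above, while the fixed-length feature spec and the concrete sizes are assumptions:

    max_length = 512   # assumed: config.max_context_token_limit
    neg_samples = 1    # assumed: negatives stored per training pair

    feature_spec = {
        'query_ids':    tf.io.FixedLenFeature([max_length], tf.int64),
        'positive_ids': tf.io.FixedLenFeature([max_length], tf.int64),
        'negative_ids': tf.io.FixedLenFeature([neg_samples * max_length], tf.int64),
    }

    def parse_fn(example_proto):
        parsed = tf.io.parse_single_example(example_proto, feature_spec)
        query_ids    = tf.cast(parsed['query_ids'], tf.int32)
        positive_ids = tf.cast(parsed['positive_ids'], tf.int32)
        negative_ids = tf.reshape(tf.cast(parsed['negative_ids'], tf.int32),
                                  [neg_samples, max_length])
        return query_ids, positive_ids, negative_ids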
@@ -920,6 +652,7 @@ class RetrievalChatbot(DeviceAwareModel):
920
  steps_per_epoch = math.ceil(train_size / batch_size)
921
  val_steps = math.ceil(val_size / batch_size)
922
  total_steps = steps_per_epoch * epochs
 
923
 
924
  logger.info(f"Training pairs: {train_size}")
925
  logger.info(f"Validation pairs: {val_size}")
@@ -942,9 +675,42 @@ class RetrievalChatbot(DeviceAwareModel):
942
  logger.info("Using fixed learning rate.")
943
 
944
  # Initialize checkpoint manager
945
- checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
946
- manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
 
948
  # Setup TensorBoard
949
  log_dir = Path(checkpoint_dir) / "tensorboard_logs"
950
  log_dir.mkdir(parents=True, exist_ok=True)
@@ -960,20 +726,47 @@ class RetrievalChatbot(DeviceAwareModel):
960
 
961
  # Create the full dataset
962
  dataset = tf.data.TFRecordDataset(tfrecord_file_path)
 
 
963
  dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
964
- dataset = dataset.shuffle(buffer_size=10000) # Adjust buffer size as needed
965
- dataset = dataset.batch(batch_size, drop_remainder=True)
966
- dataset = dataset.prefetch(tf.data.AUTOTUNE)
967
-
968
- # Split into training and validation
969
  train_dataset = dataset.take(train_size)
970
  val_dataset = dataset.skip(train_size).take(val_size)
971
 
 
 
 
972
  # Training loop
973
  best_val_loss = float("inf")
974
  epochs_no_improve = 0
975
 
976
- for epoch in range(1, epochs + 1):
977
  # --- Training Phase ---
978
  epoch_loss_avg = tf.keras.metrics.Mean()
979
  batches_processed = 0
@@ -987,13 +780,28 @@ class RetrievalChatbot(DeviceAwareModel):
987
  logger.info("Training progress bar disabled")
988
 
989
  for q_batch, p_batch, n_batch in train_dataset:
990
- loss = self.train_step(q_batch, p_batch, n_batch)
 
 
 
991
  epoch_loss_avg(loss)
992
  batches_processed += 1
993
 
994
  # Log to TensorBoard
995
  with train_summary_writer.as_default():
996
- tf.summary.scalar("loss", loss, step=(epoch - 1) * steps_per_epoch + batches_processed)
 
 
 
997
 
998
  # Update progress bar
999
  if use_lr_schedule:
@@ -1005,6 +813,8 @@ class RetrievalChatbot(DeviceAwareModel):
1005
  train_pbar.update(1)
1006
  train_pbar.set_postfix({
1007
  "loss": f"{loss.numpy():.4f}",
 
 
1008
  "lr": f"{current_lr:.2e}",
1009
  "batches": f"{batches_processed}/{steps_per_epoch}"
1010
  })
@@ -1064,6 +874,11 @@ class RetrievalChatbot(DeviceAwareModel):
1064
 
1065
  # Save checkpoint
1066
  manager.save()
 
 
 
 
 
1067
 
1068
  # Store metrics in history
1069
  self.history['train_loss'].append(train_loss)
@@ -1074,8 +889,14 @@ class RetrievalChatbot(DeviceAwareModel):
1074
  else:
1075
  current_lr = float(self.optimizer.learning_rate.numpy())
1076
 
 
1077
  self.history.setdefault('learning_rate', []).append(current_lr)
1078
 
 
 
 
 
 
1079
  # Early stopping logic
1080
  if val_loss < best_val_loss - min_delta:
1081
  best_val_loss = val_loss
@@ -1144,10 +965,19 @@ class RetrievalChatbot(DeviceAwareModel):
1144
  )
1145
  loss = tf.reduce_mean(loss)
1146
 
1147
- # Apply gradients
1148
  gradients = tape.gradient(loss, self.encoder.trainable_variables)
 
 
1149
  self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
1150
- return loss
 
1151
 
1152
  @tf.function
1153
  def validation_step(
@@ -1185,316 +1015,6 @@ class RetrievalChatbot(DeviceAwareModel):
1185
  loss = tf.reduce_mean(loss)
1186
 
1187
  return loss
1188
- # def train_streaming(
1189
- # self,
1190
- # dialogues: List[dict],
1191
- # epochs: int = 20,
1192
- # batch_size: int = 16,
1193
- # validation_split: float = 0.2,
1194
- # checkpoint_dir: str = "checkpoints/",
1195
- # use_lr_schedule: bool = True,
1196
- # peak_lr: float = 2e-5,
1197
- # warmup_steps_ratio: float = 0.1,
1198
- # early_stopping_patience: int = 3,
1199
- # min_delta: float = 1e-4,
1200
- # neg_samples: int = 1
1201
- # ) -> None:
1202
- # """Streaming training with tf.data pipeline."""
1203
- # logger.info("Starting streaming training pipeline with tf.data...")
1204
-
1205
- # # Initialize TFDataPipeline (replaces StreamingDataPipeline)
1206
- # dataset_preparer = TFDataPipeline(
1207
- # embedding_batch_size=self.config.embedding_batch_size,
1208
- # tokenizer=self.tokenizer,
1209
- # encoder=self.encoder,
1210
- # index=self.index, # Pass CPU version of FAISS index
1211
- # response_pool=self.response_pool,
1212
- # max_length=self.config.max_context_token_limit,
1213
- # neg_samples=neg_samples
1214
- # )
1215
-
1216
- # # Calculate total steps for learning rate schedule
1217
- # total_pairs = dataset_preparer.estimate_total_pairs(dialogues)
1218
- # train_size = int(total_pairs * (1 - validation_split))
1219
- # val_size = int(total_pairs * validation_split)
1220
- # steps_per_epoch = int(math.ceil(train_size / batch_size))
1221
- # val_steps = int(math.ceil(val_size / batch_size))
1222
- # total_steps = steps_per_epoch * epochs
1223
-
1224
- # logger.info(f"Total pairs: {total_pairs}")
1225
- # logger.info(f"Training pairs: {train_size}")
1226
- # logger.info(f"Validation pairs: {val_size}")
1227
- # logger.info(f"Steps per epoch: {steps_per_epoch}")
1228
- # logger.info(f"Validation steps: {val_steps}")
1229
- # logger.info(f"Total steps: {total_steps}")
1230
-
1231
- # # Set up optimizer with learning rate schedule
1232
- # if use_lr_schedule:
1233
- # warmup_steps = int(total_steps * warmup_steps_ratio)
1234
- # lr_schedule = self._get_lr_schedule(
1235
- # total_steps=total_steps,
1236
- # peak_lr=peak_lr,
1237
- # warmup_steps=warmup_steps
1238
- # )
1239
- # self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
1240
- # logger.info("Using custom learning rate schedule.")
1241
- # else:
1242
- # self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
1243
- # logger.info("Using fixed learning rate.")
1244
-
1245
- # # Initialize checkpoint manager
1246
- # checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
1247
- # manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
1248
-
1249
- # # Setup TensorBoard
1250
- # log_dir = Path(checkpoint_dir) / "tensorboard_logs"
1251
- # log_dir.mkdir(parents=True, exist_ok=True)
1252
- # current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
1253
- # train_log_dir = str(log_dir / f"train_{current_time}")
1254
- # val_log_dir = str(log_dir / f"val_{current_time}")
1255
- # train_summary_writer = tf.summary.create_file_writer(train_log_dir)
1256
- # val_summary_writer = tf.summary.create_file_writer(val_log_dir)
1257
- # logger.info(f"TensorBoard logs will be saved in {log_dir}")
1258
-
1259
- # # Create training and validation datasets
1260
- # train_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).take(train_size)
1261
- # val_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).skip(train_size).take(val_size)
1262
-
1263
- # # Training loop
1264
- # best_val_loss = float("inf")
1265
- # epochs_no_improve = 0
1266
-
1267
- # for epoch in range(1, epochs + 1):
1268
- # # --- Training Phase ---
1269
- # epoch_loss_avg = tf.keras.metrics.Mean()
1270
- # batches_processed = 0
1271
-
1272
- # try:
1273
- # train_pbar = tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}", unit="batch")
1274
- # is_tqdm_train = True
1275
- # except ImportError:
1276
- # train_pbar = None
1277
- # is_tqdm_train = False
1278
- # logger.info("Training progress bar disabled")
1279
-
1280
- # for q_batch, p_batch, n_batch in train_dataset:
1281
- # #p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
1282
- # loss = self.train_step(q_batch, p_batch, n_batch)
1283
- # epoch_loss_avg(loss)
1284
- # batches_processed += 1
1285
-
1286
- # # Log to TensorBoard
1287
- # with train_summary_writer.as_default():
1288
- # tf.summary.scalar("loss", loss, step=(epoch - 1) * steps_per_epoch + batches_processed)
1289
-
1290
- # # Update progress bar
1291
- # if use_lr_schedule:
1292
- # current_lr = float(lr_schedule(self.optimizer.iterations))
1293
- # else:
1294
- # current_lr = float(self.optimizer.learning_rate.numpy())
1295
-
1296
- # if is_tqdm_train:
1297
- # train_pbar.update(1)
1298
- # train_pbar.set_postfix({
1299
- # "loss": f"{loss.numpy():.4f}",
1300
- # "lr": f"{current_lr:.2e}",
1301
- # "batches": f"{batches_processed}/{steps_per_epoch}"
1302
- # })
1303
-
1304
- # # Memory cleanup
1305
- # gc.collect()
1306
-
1307
- # if batches_processed >= steps_per_epoch:
1308
- # break
1309
-
1310
- # if is_tqdm_train and train_pbar:
1311
- # train_pbar.close()
1312
-
1313
- # # --- Validation Phase ---
1314
- # val_loss_avg = tf.keras.metrics.Mean()
1315
- # val_batches_processed = 0
1316
-
1317
- # try:
1318
- # val_pbar = tqdm(total=val_steps, desc="Validation", unit="batch")
1319
- # is_tqdm_val = True
1320
- # except ImportError:
1321
- # val_pbar = None
1322
- # is_tqdm_val = False
1323
- # logger.info("Validation progress bar disabled")
1324
-
1325
- # for q_batch, p_batch, n_batch in val_dataset:
1326
- # #p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
1327
- # val_loss = self.validation_step(q_batch, p_batch, n_batch)
1328
- # val_loss_avg(val_loss)
1329
- # val_batches_processed += 1
1330
-
1331
- # if is_tqdm_val:
1332
- # val_pbar.update(1)
1333
- # val_pbar.set_postfix({
1334
- # "val_loss": f"{val_loss.numpy():.4f}",
1335
- # "batches": f"{val_batches_processed}/{val_steps}"
1336
- # })
1337
-
1338
- # # Memory cleanup
1339
- # gc.collect()
1340
-
1341
-
1342
- # if val_batches_processed >= val_steps:
1343
- # break
1344
-
1345
- # if is_tqdm_val and val_pbar:
1346
- # val_pbar.close()
1347
-
1348
- # # End of epoch: compute final epoch stats, log, and save checkpoint
1349
- # train_loss = epoch_loss_avg.result().numpy()
1350
- # val_loss = val_loss_avg.result().numpy()
1351
- # logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
1352
-
1353
- # # Log epoch metrics
1354
- # with train_summary_writer.as_default():
1355
- # tf.summary.scalar("epoch_loss", train_loss, step=epoch)
1356
- # with val_summary_writer.as_default():
1357
- # tf.summary.scalar("val_loss", val_loss, step=epoch)
1358
-
1359
- # # Save checkpoint
1360
- # manager.save()
1361
-
1362
- # # Store metrics in history
1363
- # self.history['train_loss'].append(train_loss)
1364
- # self.history['val_loss'].append(val_loss)
1365
-
1366
- # if use_lr_schedule:
1367
- # current_lr = float(lr_schedule(self.optimizer.iterations))
1368
- # else:
1369
- # current_lr = float(self.optimizer.learning_rate.numpy())
1370
-
1371
- # self.history.setdefault('learning_rate', []).append(current_lr)
1372
-
1373
- # # Early stopping logic
1374
- # if val_loss < best_val_loss - min_delta:
1375
- # best_val_loss = val_loss
1376
- # epochs_no_improve = 0
1377
- # logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
1378
- # else:
1379
- # epochs_no_improve += 1
1380
- # logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
1381
- # if epochs_no_improve >= early_stopping_patience:
1382
- # logger.info("Early stopping triggered.")
1383
- # break
1384
-
1385
- # logger.info("Streaming training completed!")
1386
-
1387
-
1388
- # @tf.function
1389
- # def train_step(
1390
- # self,
1391
- # q_batch: tf.Tensor,
1392
- # p_batch: tf.Tensor,
1393
- # n_batch: tf.Tensor,
1394
- # attention_mask: Optional[tf.Tensor] = None
1395
- # ) -> tf.Tensor:
1396
- # """
1397
- # Single training step that uses queries, positives, and negatives in a
1398
- # contrastive/InfoNCE style. The label is always 0 (the positive) vs.
1399
- # the negative alternatives.
1400
- # """
1401
- # with tf.GradientTape() as tape:
1402
- # # Encode queries
1403
- # q_enc = self.encoder(q_batch, training=True) # [batch_size, embed_dim]
1404
-
1405
- # # Encode positives
1406
- # p_enc = self.encoder(p_batch, training=True) # [batch_size, embed_dim]
1407
-
1408
- # # Encode negatives
1409
- # # n_batch: [batch_size, neg_samples, max_length]
1410
- # shape = tf.shape(n_batch)
1411
- # bs = shape[0]
1412
- # neg_samples = shape[1]
1413
-
1414
- # # Flatten negatives to feed them in one pass:
1415
- # # => [batch_size * neg_samples, max_length]
1416
- # n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
1417
- # n_enc_flat = self.encoder(n_batch_flat, training=True) # [bs*neg_samples, embed_dim]
1418
-
1419
- # # Reshape back => [batch_size, neg_samples, embed_dim]
1420
- # n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])
1421
-
1422
- # # Combine the positive embedding and negative embeddings along dim=1
1423
- # # => shape [batch_size, 1 + neg_samples, embed_dim]
1424
- # # The first column is the positive; subsequent columns are negatives
1425
- # combined_p_n = tf.concat(
1426
- # [tf.expand_dims(p_enc, axis=1), n_enc],
1427
- # axis=1
1428
- # ) # [bs, (1+neg_samples), embed_dim]
1429
-
1430
- # # Now compute scores: dot product of q_enc with each column in combined_p_n
1431
- # # We'll use `tf.einsum` to handle the batch dimension properly
1432
- # # dot_products => shape [batch_size, (1+neg_samples)]
1433
- # dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
1434
-
1435
- # # The label for each row is 0 (the first column is the correct/positive)
1436
- # labels = tf.zeros([bs], dtype=tf.int32)
1437
-
1438
- # # Cross-entropy over the [batch_size, 1+neg_samples] scores
1439
- # loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
1440
- # labels=labels,
1441
- # logits=dot_products
1442
- # )
1443
- # loss = tf.reduce_mean(loss)
1444
-
1445
- # # If there's an attention_mask you want to apply (less common in this scenario),
1446
- # # you could do something like:
1447
- # if attention_mask is not None:
1448
- # loss = loss * attention_mask
1449
- # loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
1450
-
1451
- # # Apply gradients
1452
- # gradients = tape.gradient(loss, self.encoder.trainable_variables)
1453
- # self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
1454
- # return loss
1455
-
1456
- # @tf.function
1457
- # def validation_step(
1458
- # self,
1459
- # q_batch: tf.Tensor,
1460
- # p_batch: tf.Tensor,
1461
- # n_batch: tf.Tensor,
1462
- # attention_mask: Optional[tf.Tensor] = None
1463
- # ) -> tf.Tensor:
1464
- # """
1465
- # Single validation step with queries, positives, and negatives.
1466
- # Uses the same loss calculation as train_step, but `training=False`.
1467
- # """
1468
- # q_enc = self.encoder(q_batch, training=False)
1469
- # p_enc = self.encoder(p_batch, training=False)
1470
-
1471
- # shape = tf.shape(n_batch)
1472
- # bs = shape[0]
1473
- # neg_samples = shape[1]
1474
-
1475
- # n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
1476
- # n_enc_flat = self.encoder(n_batch_flat, training=False)
1477
- # n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])
1478
-
1479
- # combined_p_n = tf.concat(
1480
- # [tf.expand_dims(p_enc, axis=1), n_enc],
1481
- # axis=1
1482
- # )
1483
-
1484
- # dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
1485
- # labels = tf.zeros([bs], dtype=tf.int32)
1486
-
1487
- # loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
1488
- # labels=labels,
1489
- # logits=dot_products
1490
- # )
1491
- # loss = tf.reduce_mean(loss)
1492
-
1493
- # if attention_mask is not None:
1494
- # loss = loss * attention_mask
1495
- # loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
1496
-
1497
- # return loss
1498
 
1499
  def _get_lr_schedule(
1500
  self,
@@ -1561,75 +1081,3 @@ class RetrievalChatbot(DeviceAwareModel):
1561
  }
1562
 
1563
  return CustomSchedule(total_steps, peak_lr, warmup_steps)
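The body of CustomSchedule is not shown in this hunk; one common shape that matches its constructor signature is linear warmup to peak_lr followed by cosine decay, sketched below (illustrative only, not necessarily the committed implementation):

    class WarmupCosineSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        """Linear warmup to peak_lr, then cosine decay toward zero."""
        def __init__(self, total_steps: int, peak_lr: float, warmup_steps: int):
            super().__init__()
            self.total_steps = float(total_steps)
            self.peak_lr = peak_lr
            self.warmup_steps = float(max(1, warmup_steps))

        def __call__(self, step):
            step = tf.cast(step, tf.float32)
            warmup_lr = self.peak_lr * (step / self.warmup_steps)
            progress = tf.minimum(1.0, (step - self.warmup_steps) /
                                  tf.maximum(1.0, self.total_steps - self.warmup_steps))
            cosine_lr = self.peak_lr * 0.5 * (1.0 + tf.cos(math.pi * progress))
            return tf.where(step < self.warmup_steps, warmup_lr, cosine_lr)

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=WarmupCosineSchedule(total_steps=10_000, peak_lr=2e-5, warmup_steps=1_000))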
1564
-
1565
- def _cosine_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> np.ndarray:
1566
- """Compute cosine similarity between two numpy arrays."""
1567
- normalized_emb1 = emb1 / np.linalg.norm(emb1, axis=1, keepdims=True)
1568
- normalized_emb2 = emb2 / np.linalg.norm(emb2, axis=1, keepdims=True)
1569
- return np.dot(normalized_emb1, normalized_emb2.T)
1570
-
1571
- def chat(
1572
- self,
1573
- query: str,
1574
- conversation_history: Optional[List[Tuple[str, str]]] = None,
1575
- quality_checker: Optional['ResponseQualityChecker'] = None,
1576
- top_k: int = 5,
1577
- ) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
1578
- """
1579
- Example chat method that always uses cross-encoder re-ranking
1580
- if self.reranker is available.
1581
- """
1582
- @self.run_on_device
1583
- def get_response(self_arg, query_arg): # Add parameters that match decorator's expectations
1584
- # 1) Build conversation context string
1585
- conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
1586
-
1587
- # 2) Retrieve + cross-encoder re-rank
1588
- results = self_arg.retrieve_responses_cross_encoder(
1589
- query=conversation_str,
1590
- top_k=top_k,
1591
- reranker=self_arg.reranker,
1592
- summarizer=self_arg.summarizer,
1593
- summarize_threshold=512
1594
- )
1595
-
1596
- # 3) Handle empty or confidence
1597
- if not results:
1598
- return (
1599
- "I'm sorry, but I couldn't find a relevant response.",
1600
- [],
1601
- {}
1602
- )
1603
-
1604
- if quality_checker:
1605
- metrics = quality_checker.check_response_quality(query_arg, results)
1606
- if not metrics.get('is_confident', False):
1607
- return (
1608
- "I need more information to provide a good answer. Could you please clarify?",
1609
- results,
1610
- metrics
1611
- )
1612
- return results[0][0], results, metrics
1613
-
1614
- return results[0][0], results, {}
1615
-
1616
- return get_response(self, query)
1617
-
1618
- def _build_conversation_context(
1619
- self,
1620
- query: str,
1621
- conversation_history: Optional[List[Tuple[str, str]]]
1622
- ) -> str:
1623
- """Build conversation context with better memory management."""
1624
- if not conversation_history:
1625
- return f"{self.special_tokens['user']} {query}"
1626
-
1627
- conversation_parts = []
1628
- for user_txt, assistant_txt in conversation_history:
1629
- conversation_parts.extend([
1630
- f"{self.special_tokens['user']} {user_txt}",
1631
- f"{self.special_tokens['assistant']} {assistant_txt}"
1632
- ])
1633
-
1634
- conversation_parts.append(f"{self.special_tokens['user']} {query}")
1635
- return "\n".join(conversation_parts)
 
1
+ import os
2
  from transformers import TFAutoModel, AutoTokenizer
3
  import tensorflow as tf
 
4
  from typing import List, Tuple, Dict, Optional, Union, Any
5
  import math
6
  from dataclasses import dataclass
 
65
  super().__init__(name=name, **kwargs)
66
  self.config = config
67
 
68
+ # Load pretrained model and freeze layers based on config
69
  self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)
 
 
70
  self._freeze_layers()
71
 
72
+ # Add Pooling layer (Global Average Pooling), Projection layer, Dropout, and Normalization
73
  self.pooler = tf.keras.layers.GlobalAveragePooling1D()
 
 
74
  self.projection = tf.keras.layers.Dense(
75
  config.embedding_dim,
76
  activation='tanh',
77
  name="projection"
78
  )
 
 
79
  self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
80
  self.normalize = tf.keras.layers.Lambda(
81
  lambda x: tf.nn.l2_normalize(x, axis=1),
 
103
  """Forward pass."""
104
  # Get pretrained embeddings
105
  pretrained_outputs = self.pretrained(inputs, training=training)
106
+ x = pretrained_outputs.last_hidden_state # Shape: [batch_size, seq_len, embedding_dim]
107
 
108
  # Apply pooling, projection, dropout, and normalization
109
+ x = self.pooler(x) # Shape: [batch_size, 768]
110
+ x = self.projection(x) # Shape: [batch_size, 768]
111
+ x = self.dropout(x, training=training)
112
+ x = self.normalize(x) # Shape: [batch_size, 768]
113
 
114
  return x
115
 
 
127
  def __init__(
128
  self,
129
  config: ChatbotConfig,
 
130
  device: str = None,
131
  strategy=None,
132
  reranker: Optional[CrossEncoderReranker] = None,
133
  summarizer: Optional[Summarizer] = None,
134
+ mode: str = 'training'
135
  ):
136
  super().__init__()
137
  self.config = config
 
139
  self.device = device or self._setup_default_device()
140
  self.mode = mode.lower()
141
 
142
+ # Initialize reranker, summarizer, tokenizer, encoder, and memory monitor
143
  self.reranker = reranker or self._initialize_reranker()
 
144
  self.tokenizer = self._initialize_tokenizer()
145
+ self.encoder = self._initialize_encoder()
146
+ self.summarizer = summarizer or self._initialize_summarizer()
147
  self.memory_monitor = GPUMemoryMonitor()
148
 
149
+ # Initialize data pipeline
150
+ logger.info("Initializing TFDataPipeline.")
151
+ self.data_pipeline = TFDataPipeline(
152
+ config=self.config,
153
+ tokenizer=self.tokenizer,
154
+ encoder=self.encoder,
155
+ index_file_path='path/to/index', # TODO: update this placeholder path
156
+ response_pool=[],
157
+ max_length=self.config.max_context_token_limit,
158
+ query_embeddings_cache={},
159
+ neg_samples=self.config.neg_samples,
160
+ index_type='IndexFlatIP',
161
+ nlist=100, # Not used with IndexFlatIP
162
+ max_retries=self.config.max_retries
163
+ )
164
 
165
+ # Collect unique responses from dialogues
166
+ if self.mode == 'inference':
167
+ logger.info("Mode set to 'inference'. Loading FAISS index and response pool.")
168
+ self._load_faiss_index_and_responses()
169
+ elif self.mode != 'training':
170
+ logger.error(f"Unsupported mode in RetrievalChatbot init: {self.mode}")
171
+ raise ValueError(f"Unsupported mode in RetrievalChatbot init: {self.mode}")
172
+
173
  # Initialize training history
174
  self.history = {
175
  "train_loss": [],
 
177
  "train_metrics": {},
178
  "val_metrics": {}
179
  }
180
+
 
181
 
182
  def _setup_default_device(self) -> str:
183
  """Set up default device if none is provided."""
 
193
 
194
  def _initialize_summarizer(self) -> Summarizer:
195
  """Initialize the Summarizer."""
196
+ return Summarizer(
197
+ tokenizer=self.tokenizer,
198
+ model_name="t5-small",
199
+ max_summary_length=self.config.max_context_token_limit // 4,
200
+ device=self.device,
201
+ max_summary_rounds=2
202
+ )
203
 
204
  def _initialize_tokenizer(self) -> AutoTokenizer:
205
  """Initialize the tokenizer and add special tokens."""
 
216
  )
217
  return tokenizer
218
 
219
+ def _initialize_encoder(self) -> EncoderModel:
220
+ """Initialize the EncoderModel and resize token embeddings."""
221
+ logger.info("Initializing encoder model...")
222
+ encoder = EncoderModel(
 
 
223
  self.config,
224
  name="shared_encoder",
225
  )
226
 
 
227
  new_vocab_size = len(self.tokenizer)
228
+ encoder.pretrained.resize_token_embeddings(new_vocab_size)
229
  logger.info(f"Token embeddings resized to: {new_vocab_size}")
230
+ return encoder
 
231
 
232
+ def _load_faiss_index_and_responses(self) -> None:
233
+ """Load FAISS index and response pool for inference."""
 
 
234
  try:
235
+ logger.info(f"Loading FAISS index from {self.data_pipeline.index_file_path}...")
236
+ self.data_pipeline.load_faiss_index(self.data_pipeline.index_file_path)
237
+ logger.info("FAISS index loaded successfully.")
238
+
239
+ # Load response pool associated with the FAISS index
240
+ response_pool_path = self.data_pipeline.index_file_path.replace('.index', '_responses.json')
241
+ if os.path.exists(response_pool_path):
242
+ with open(response_pool_path, 'r', encoding='utf-8') as f:
243
+ self.data_pipeline.response_pool = json.load(f)
244
+ logger.info(f"Loaded {len(self.data_pipeline.response_pool)} responses from {response_pool_path}.")
245
  else:
246
+ logger.error(f"Response pool file not found at {response_pool_path}.")
247
+ raise FileNotFoundError(f"Response pool file not found at {response_pool_path}.")
248
+
249
+ # Validate FAISS index and response pool
250
+ self.data_pipeline.validate_faiss_index()
251
+ logger.info("FAISS index and response pool validated successfully.")
252
 
 
 
253
  except Exception as e:
254
+ logger.error(f"Failed to load FAISS index and response pool: {e}")
255
  raise
256
+
257
+ @classmethod
258
+ def load_model(cls, load_dir: Union[str, Path], mode: str = 'training') -> 'RetrievalChatbot':
 
 
 
 
 
259
  """
260
+ Load saved models and configuration.
 
261
 
262
+ Args:
263
+ load_dir (Union[str, Path]): Directory containing saved model files
264
+ mode (str): Either 'training' or 'inference'. In inference mode,
265
+ also loads FAISS index and response pool.
266
+ """
267
+ load_dir = Path(load_dir)
 
268
 
269
+ # Load config
270
+ with open(load_dir / "config.json", "r") as f:
271
+ config = ChatbotConfig.from_dict(json.load(f))
 
272
 
273
+ # Initialize chatbot with appropriate mode
274
+ chatbot = cls(config, mode=mode)
 
 
275
 
276
+ # Load models
277
+ chatbot.encoder.pretrained = TFAutoModel.from_pretrained(
278
+ load_dir / "shared_encoder",
279
+ config=config
280
+ )
281
 
282
+ # Load tokenizer
283
+ chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
284
+ logger.info(f"Models and tokenizer loaded from {load_dir}")
 
285
 
286
+ # If in inference mode, load additional components
287
+ if mode == 'inference':
288
+ cls._prepare_model_for_inference(chatbot, load_dir)
 
 
289
 
290
+ return chatbot
291
+
292
+ @classmethod
293
+ def _prepare_model_for_inference(cls, chatbot: 'RetrievalChatbot', load_dir: Path) -> None:
294
+ """Internal method to load inference components."""
295
  try:
296
+ # Load FAISS index
297
+ faiss_path = load_dir / 'faiss_index.bin'
298
+ if faiss_path.exists():
299
+ chatbot.index = faiss.read_index(str(faiss_path))
300
+ logger.info("FAISS index loaded successfully")
301
  else:
302
+ raise FileNotFoundError(f"FAISS index not found at {faiss_path}")
 
 
 
303
 
304
+ # Load response pool
305
+ response_pool_path = load_dir / 'response_pool.json'
306
+ if response_pool_path.exists():
307
+ with open(response_pool_path, 'r') as f:
308
+ chatbot.response_pool = json.load(f)
309
+ logger.info(f"Loaded {len(chatbot.response_pool)} responses")
 
310
  else:
311
+ raise FileNotFoundError(f"Response pool not found at {response_pool_path}")
 
 
 
312
 
313
+ # Verify dimensions match
314
+ if chatbot.index.d != chatbot.config.embedding_dim:
315
+ raise ValueError(
316
+ f"FAISS index dimension {chatbot.index.d} doesn't match "
317
+ f"model dimension {chatbot.config.embedding_dim}"
318
+ )
319
+
320
  except Exception as e:
321
+ logger.error(f"Error loading inference components: {e}")
 
 
 
 
322
  raise
323
+
324
+ def save_models(self, save_dir: Union[str, Path]):
325
+ """Save models and configuration."""
326
+ save_dir = Path(save_dir)
327
+ save_dir.mkdir(parents=True, exist_ok=True)
 
 
 
328
 
329
+ # Save config
330
+ with open(save_dir / "config.json", "w") as f:
331
+ json.dump(self.config.to_dict(), f, indent=2)
 
 
 
 
 
 
332
 
333
+ # Save models
334
+ self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")
 
335
 
336
+ # Save tokenizer
337
+ self.tokenizer.save_pretrained(save_dir / "tokenizer")
 
338
 
339
+ logger.info(f"Models and tokenizer saved to {save_dir}.")
 
340
 
341
  def retrieve_responses_cross_encoder(
342
  self,
 
363
 
364
  # 2) Dense retrieval
365
  dense_topk = self.retrieve_responses_faiss(query, top_k=top_k) # [(resp, dense_score), ...]
366
+
367
  if not dense_topk:
368
  return []
369
 
 
377
  combined.sort(key=lambda x: x[1], reverse=True)
378
 
379
  return combined
380
+ # def retrieve_responses_cross_encoder(
381
+ # self,
382
+ # query: str,
383
+ # top_k: int,
384
+ # reranker: Optional[CrossEncoderReranker] = None,
385
+ # summarizer: Optional[Summarizer] = None,
386
+ # summarize_threshold: int = 512 # Summarize over 512 tokens
387
+ # ) -> List[Tuple[str, float]]:
388
+ # """
389
+ # Retrieve top-k from FAISS, then re-rank them with a cross-encoder.
390
+ # Optionally summarize the user query if it's too long.
391
+ # """
392
+ # if reranker is None:
393
+ # reranker = self.reranker
394
+ # if summarizer is None:
395
+ # summarizer = self.summarizer
396
+
397
+ # # Optional summarization
398
+ # if summarizer and len(query.split()) > summarize_threshold:
399
+ # logger.info(f"Query is long. Summarizing before cross-encoder. Original length: {len(query.split())}")
400
+ # query = summarizer.summarize_text(query)
401
+ # logger.info(f"Summarized query: {query}")
402
+
403
+ # # 2) Dense retrieval
404
+ # dense_topk = self.retrieve_responses_faiss(query, top_k=top_k) # [(resp, dense_score), ...]
405
+
406
+ # if not dense_topk:
407
+ # return []
408
+
409
+ # # 3) Cross-encoder rerank
410
+ # candidate_texts = [pair[0] for pair in dense_topk]
411
+ # cross_scores = reranker.rerank(query, candidate_texts, max_length=256)
412
+
413
+ # # Combine
414
+ # combined = [(text, score) for (text, _), score in zip(dense_topk, cross_scores)]
415
+ # # Sort descending by cross-encoder score
416
+ # combined.sort(key=lambda x: x[1], reverse=True)
417
+
418
+ # return combined
419
 
420
  def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
421
  """Retrieve top-k responses using FAISS."""
422
+ if not hasattr(self.data_pipeline, 'index') or self.data_pipeline.index is None:
423
  logger.warning("FAISS index not initialized. Cannot retrieve responses.")
424
  return []
425
 
426
+ # Encode the query using TFDataPipeline's method
427
+ q_emb = self.data_pipeline.encode_query(query)
428
  q_emb_np = q_emb.numpy().astype('float32') # Ensure type match
429
 
430
  # Normalize the query embedding for cosine similarity
431
  faiss.normalize_L2(q_emb_np)
432
 
433
  # Search the FAISS index
434
+ distances, indices = self.data_pipeline.index.search(q_emb_np, top_k)
435
 
436
  # Map indices to responses and distances to similarities
437
  top_responses = []
438
  for i, idx in enumerate(indices[0]):
439
+ if idx < len(self.data_pipeline.response_pool):
440
+ top_responses.append((self.data_pipeline.response_pool[idx], float(distances[0][i])))
441
  else:
442
  logger.warning(f"FAISS returned invalid index {idx}. Skipping.")
443
 
444
  return top_responses
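The normalize-then-inner-product pattern above is what makes IndexFlatIP scores behave as cosine similarities; a standalone sketch of the same idea (dimensions and values are illustrative):

import numpy as np
import faiss

dim = 768                                   # e.g. config.embedding_dim
index = faiss.IndexFlatIP(dim)              # exact inner-product search
vecs = np.random.rand(10, dim).astype('float32')
faiss.normalize_L2(vecs)                    # unit-length rows: inner product == cosine
index.add(vecs)

q = np.random.rand(1, dim).astype('float32')
faiss.normalize_L2(q)
scores, ids = index.search(q, 5)            # top-5 cosine scores and row indices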
445
+ # def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
446
+ # """Retrieve top-k responses using FAISS."""
447
+ # if not hasattr(self, 'index') or self.index is None:
448
+ # logger.warning("FAISS index not initialized. Cannot retrieve responses.")
449
+ # return []
450
 
451
+ # # Encode the query
452
+ # q_emb = self.encode_query(query) # Shape: [1, embedding_dim]
453
+ # q_emb_np = q_emb.numpy().astype('float32') # Ensure type match
 
 
 
454
 
455
+ # # Normalize the query embedding for cosine similarity
456
+ # faiss.normalize_L2(q_emb_np)
457
 
458
+ # # Search the FAISS index
459
+ # distances, indices = self.index.search(q_emb_np, top_k)
 
 
 
 
460
 
461
+ # # Map indices to responses and distances to similarities
462
+ # top_responses = []
463
+ # for i, idx in enumerate(indices[0]):
464
+ # if idx < len(self.response_pool):
465
+ # top_responses.append((self.response_pool[idx], float(distances[0][i])))
466
+ # else:
467
+ # logger.warning(f"FAISS returned invalid index {idx}. Skipping.")
468
 
469
+ # return top_responses
470
+
471
+ def chat(
472
+ self,
473
+ query: str,
474
+ conversation_history: Optional[List[Tuple[str, str]]] = None,
475
+ quality_checker: Optional['ResponseQualityChecker'] = None,
476
+ top_k: int = 5,
477
+ ) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
478
+ """
479
+ Example chat method that always uses cross-encoder re-ranking
480
+ if self.reranker is available.
481
+ """
482
+ @self.run_on_device
483
+ def get_response(self_arg, query_arg):
484
+ # 1) Build conversation context string
485
+ conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
486
+
487
+ # 2) Retrieve + cross-encoder re-rank
488
+ results = self_arg.retrieve_responses_cross_encoder(
489
+ query=conversation_str,
490
+ top_k=top_k,
491
+ reranker=self_arg.reranker,
492
+ summarizer=self_arg.summarizer,
493
+ summarize_threshold=512
494
+ )
495
+
496
+ # 3) Handle empty or confidence
497
+ if not results:
498
+ return (
499
+ "I'm sorry, but I couldn't find a relevant response.",
500
+ [],
501
+ {}
502
+ )
503
+
504
+ if quality_checker:
505
+ metrics = quality_checker.check_response_quality(query_arg, results)
506
+ if not metrics.get('is_confident', False):
507
+ return (
508
+ "I need more information to provide a good answer. Could you please clarify?",
509
+ results,
510
+ metrics
511
+ )
512
+ return results[0][0], results, metrics
513
+
514
+ return results[0][0], results, {}
515
 
516
+ return get_response(self, query)
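A minimal usage sketch for chat() with a short history (texts are illustrative; quality_checker is optional):

history = [("I need a table for two tonight", "Sure, which restaurant do you prefer?")]
response, candidates, metrics = chatbot.chat(
    query="Somewhere downtown around 7pm",
    conversation_history=history,
    quality_checker=quality_checker,   # ResponseQualityChecker instance, optional
    top_k=5,
)
print(response)
for text, score in candidates[:3]:
    print(f"{score:.3f}  {text}")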
517
+ # def chat(
518
+ # self,
519
+ # query: str,
520
+ # conversation_history: Optional[List[Tuple[str, str]]] = None,
521
+ # quality_checker: Optional['ResponseQualityChecker'] = None,
522
+ # top_k: int = 5,
523
+ # ) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
524
+ # """
525
+ # Example chat method that always uses cross-encoder re-ranking
526
+ # if self.reranker is available.
527
+ # """
528
+ # @self.run_on_device
529
+ # def get_response(self_arg, query_arg): # Add parameters that match decorator's expectations
530
+ # # 1) Build conversation context string
531
+ # conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
532
+
533
+ # # 2) Retrieve + cross-encoder re-rank
534
+ # results = self_arg.retrieve_responses_cross_encoder(
535
+ # query=conversation_str,
536
+ # top_k=top_k,
537
+ # reranker=self_arg.reranker,
538
+ # summarizer=self_arg.summarizer,
539
+ # summarize_threshold=512
540
+ # )
541
+
542
+ # # 3) Handle empty or confidence
543
+ # if not results:
544
+ # return (
545
+ # "I'm sorry, but I couldn't find a relevant response.",
546
+ # [],
547
+ # {}
548
+ # )
549
+
550
+ # if quality_checker:
551
+ # metrics = quality_checker.check_response_quality(query_arg, results)
552
+ # if not metrics.get('is_confident', False):
553
+ # return (
554
+ # "I need more information to provide a good answer. Could you please clarify?",
555
+ # results,
556
+ # metrics
557
+ # )
558
+ # return results[0][0], results, metrics
559
+
560
+ # return results[0][0], results, {}
561
 
562
+ # return get_response(self, query)
563
+
564
+ def _build_conversation_context(
565
+ self,
566
+ query: str,
567
+ conversation_history: Optional[List[Tuple[str, str]]]
568
+ ) -> str:
569
+ """Build conversation context with better memory management."""
570
+ if not conversation_history:
571
+ return f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
572
+
573
+ conversation_parts = []
574
+ for user_txt, assistant_txt in conversation_history:
575
+ conversation_parts.extend([
576
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {user_txt}",
577
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {assistant_txt}"
578
+ ])
579
+
580
+ conversation_parts.append(f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}")
581
+ return "\n".join(conversation_parts)
582
+ # def _build_conversation_context(
583
+ # self,
584
+ # query: str,
585
+ # conversation_history: Optional[List[Tuple[str, str]]]
586
+ # ) -> str:
587
+ # """Build conversation context with better memory management."""
588
+ # if not conversation_history:
589
+ # return f"{self.special_tokens['user']} {query}"
590
+
591
+ # conversation_parts = []
592
+ # for user_txt, assistant_txt in conversation_history:
593
+ # conversation_parts.extend([
594
+ # f"{self.special_tokens['user']} {user_txt}",
595
+ # f"{self.special_tokens['assistant']} {assistant_txt}"
596
+ # ])
597
+
598
+ # conversation_parts.append(f"{self.special_tokens['user']} {query}")
599
+ # return "\n".join(conversation_parts)
600
 
601
+ def train_model(
602
  self,
603
  tfrecord_file_path: str,
604
  epochs: int = 20,
 
606
  validation_split: float = 0.2,
607
  checkpoint_dir: str = "checkpoints/",
608
  use_lr_schedule: bool = True,
609
+ peak_lr: float = 1e-5,
610
  warmup_steps_ratio: float = 0.1,
611
  early_stopping_patience: int = 3,
612
  min_delta: float = 1e-4,
613
+ test_mode: bool = False,
614
+ initial_epoch: int = 0
615
  ) -> None:
616
  """Training using a pre-prepared TFRecord dataset."""
617
  logger.info("Starting training with pre-prepared TFRecord dataset...")
 
640
  negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
641
  negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
642
 
643
+ return query_ids, positive_ids, negative_ids
644
+
645
  # Calculate total steps by counting the number of records in the TFRecord
646
  raw_dataset = tf.data.TFRecordDataset(tfrecord_file_path)
647
  total_pairs = sum(1 for _ in raw_dataset)
 
652
  steps_per_epoch = math.ceil(train_size / batch_size)
653
  val_steps = math.ceil(val_size / batch_size)
654
  total_steps = steps_per_epoch * epochs
655
+ buffer_size = total_pairs // 10 # 10% of the dataset
656
 
657
  logger.info(f"Training pairs: {train_size}")
658
  logger.info(f"Validation pairs: {val_size}")
 
675
  logger.info("Using fixed learning rate.")
676
 
677
  # Initialize checkpoint manager
678
+ checkpoint = tf.train.Checkpoint(
679
+ epoch=tf.Variable(0),
680
+ optimizer=self.optimizer,
681
+ model=self.encoder,
682
+ variables=self.encoder.variables
683
+ )
684
+ manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3, checkpoint_name='ckpt')
685
 
686
+ # Restore from checkpoint if available
687
+ latest_checkpoint = manager.latest_checkpoint
688
+ if latest_checkpoint:
689
+ history_path = Path(checkpoint_dir) / 'training_history.json'
690
+ if history_path.exists():
691
+ try:
692
+ with open(history_path, 'r') as f:
693
+ self.history = json.load(f)
694
+ logger.info(f"Loaded previous training history from {history_path}")
695
+ except Exception as e:
696
+ logger.warning(f"Could not load history, starting fresh: {e}")
697
+ self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
698
+ else:
699
+ self.history = {'train_loss': [], 'val_loss': [], 'learning_rate': []}
700
+
701
+ status = checkpoint.restore(latest_checkpoint)
702
+ status.expect_partial()
703
+
704
+ logger.info(f"Restored from checkpoint: {latest_checkpoint}")
705
+ # Get the checkpoint number to validate initial_epoch
706
+ ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
707
+ if initial_epoch == 0:
708
+ initial_epoch = ckpt_number
709
+ logger.info(f"Resuming from epoch {initial_epoch}")
710
+ else:
711
+ logger.info("Starting training from scratch")
712
+ initial_epoch = 0
713
+
714
  # Setup TensorBoard
715
  log_dir = Path(checkpoint_dir) / "tensorboard_logs"
716
  log_dir.mkdir(parents=True, exist_ok=True)
 
726
 
727
  # Create the full dataset
728
  dataset = tf.data.TFRecordDataset(tfrecord_file_path)
729
+
730
+ # Test mode for debugging
731
+ if test_mode:
732
+ subset_size = 200
733
+ dataset = dataset.take(subset_size)
734
+ logger.info(f"TEST MODE: Using only {subset_size} examples")
735
+ # Recalculate sizes
736
+ total_pairs = subset_size
737
+ train_size = int(total_pairs * (1 - validation_split))
738
+ val_size = total_pairs - train_size
739
+ steps_per_epoch = math.ceil(train_size / batch_size)
740
+ val_steps = math.ceil(val_size / batch_size)
741
+ total_steps = steps_per_epoch * epochs
742
+ buffer_size = total_pairs // 10 # 10% of the dataset
743
+ epochs = min(epochs, 5) # Limit epochs in test mode
744
+ early_stopping_patience = 2
745
+ logger.info(f"New training pairs: {train_size}")
746
+ logger.info(f"New validation pairs: {val_size}")
747
+
748
  dataset = dataset.map(parse_fn, num_parallel_calls=tf.data.AUTOTUNE)
749
+
750
+ # Split into training and validation sets
 
 
 
751
  train_dataset = dataset.take(train_size)
752
  val_dataset = dataset.skip(train_size).take(val_size)
753
 
754
+ # Shuffle the training data
755
+ train_dataset = train_dataset.shuffle(buffer_size=buffer_size)
756
+
757
+ # Batch both datasets
758
+ train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
759
+ train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
760
+
761
+ val_dataset = val_dataset.batch(batch_size, drop_remainder=True)
762
+ val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
763
+ val_dataset = val_dataset.cache()
764
+
765
  # Training loop
766
  best_val_loss = float("inf")
767
  epochs_no_improve = 0
768
 
769
+ for epoch in range(initial_epoch + 1, epochs + 1):
770
  # --- Training Phase ---
771
  epoch_loss_avg = tf.keras.metrics.Mean()
772
  batches_processed = 0
 
780
  logger.info("Training progress bar disabled")
781
 
782
  for q_batch, p_batch, n_batch in train_dataset:
783
+ loss, grad_norm, post_clip_norm = self.train_step(q_batch, p_batch, n_batch)
784
+
785
+ # Check for gradient issues
786
+ grad_norm_value = float(grad_norm.numpy())
787
+ post_clip_value = float(post_clip_norm.numpy())
788
+ if grad_norm_value < 1e-7:
789
+ logger.warning(f"Potential vanishing gradient detected: norm = {grad_norm_value:.2e}")
790
+ elif grad_norm_value > 100:
791
+ logger.warning(f"Potential exploding gradient detected: norm = {grad_norm_value:.2e}")
792
+
793
+ if grad_norm_value != post_clip_value:
794
+ logger.info(f"Gradient clipped: {grad_norm_value:.2e} -> {post_clip_value:.2e}")
795
+
796
  epoch_loss_avg(loss)
797
  batches_processed += 1
798
 
799
  # Log to TensorBoard
800
  with train_summary_writer.as_default():
801
+ step = (epoch - 1) * steps_per_epoch + batches_processed
802
+ tf.summary.scalar("loss", loss, step=step)
803
+ tf.summary.scalar("gradient_norm_pre_clip", grad_norm, step=step)
804
+ tf.summary.scalar("gradient_norm_post_clip", post_clip_norm, step=step)
805
 
806
  # Update progress bar
807
  if use_lr_schedule:
 
813
  train_pbar.update(1)
814
  train_pbar.set_postfix({
815
  "loss": f"{loss.numpy():.4f}",
816
+ "pre_clip": f"{grad_norm_value:.2e}",
817
+ "post_clip": f"{post_clip_value:.2e}",
818
  "lr": f"{current_lr:.2e}",
819
  "batches": f"{batches_processed}/{steps_per_epoch}"
820
  })
 
874
 
875
  # Save checkpoint
876
  manager.save()
877
+
878
+ # Save model after each epoch for testing/inference
879
+ model_save_path = Path(checkpoint_dir) / f"model_epoch_{epoch}"
880
+ self.save_models(model_save_path)
881
+ logger.info(f"Saved model for epoch {epoch} at {model_save_path}")
882
 
883
  # Store metrics in history
884
  self.history['train_loss'].append(train_loss)
 
889
  else:
890
  current_lr = float(self.optimizer.learning_rate.numpy())
891
 
892
+ # Log learning rate
893
  self.history.setdefault('learning_rate', []).append(current_lr)
894
 
895
+ # Save history to file
896
+ with open(history_path, 'w') as f:
897
+ json.dump(self.history, f)
898
+ logger.info(f"Saved training history to {history_path}")
899
+
900
  # Early stopping logic
901
  if val_loss < best_val_loss - min_delta:
902
  best_val_loss = val_loss
 
965
  )
966
  loss = tf.reduce_mean(loss)
967
 
968
+ # Calculate gradients
969
  gradients = tape.gradient(loss, self.encoder.trainable_variables)
970
+ gradients_norm = tf.linalg.global_norm(gradients)
971
+
972
+ # Clip gradients if norm exceeds threshold
973
+ max_grad_norm = 1.0
974
+ gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm, gradients_norm)
975
+ post_clip_norm = tf.linalg.global_norm(gradients)
976
+
977
+ # Apply gradients
978
  self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
979
+
980
+ return loss, gradients_norm, post_clip_norm
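For intuition, tf.clip_by_global_norm rescales every tensor by clip_norm / max(global_norm, clip_norm), so gradient directions are preserved; a tiny numeric sketch:

import tensorflow as tf

grads = [tf.constant([3.0, 4.0])]               # global norm = 5.0
clipped, norm = tf.clip_by_global_norm(grads, 1.0)
# norm -> 5.0 (pre-clip), clipped[0] -> [0.6, 0.8], post-clip global norm -> 1.0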
981
 
982
  @tf.function
983
  def validation_step(
 
1015
  loss = tf.reduce_mean(loss)
1016
 
1017
  return loss
 
1018
 
1019
  def _get_lr_schedule(
1020
  self,
 
1081
  }
1082
 
1083
  return CustomSchedule(total_steps, peak_lr, warmup_steps)
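CustomSchedule itself is not shown in this hunk; a sketch of a schedule with the same constructor shape, assuming linear warmup to peak_lr followed by cosine decay (not the actual implementation):

import math
import tensorflow as tf

class WarmupCosineSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to peak_lr, then cosine decay towards zero by total_steps."""
    def __init__(self, total_steps: int, peak_lr: float, warmup_steps: int):
        super().__init__()
        self.total_steps = float(total_steps)
        self.peak_lr = peak_lr
        self.warmup_steps = float(max(warmup_steps, 1))

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_lr = self.peak_lr * step / self.warmup_steps
        progress = (step - self.warmup_steps) / tf.maximum(self.total_steps - self.warmup_steps, 1.0)
        cosine_lr = self.peak_lr * 0.5 * (1.0 + tf.cos(math.pi * progress))
        return tf.where(step < self.warmup_steps, warmup_lr, cosine_lr)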
 
chatbot_validator.py CHANGED
@@ -1,23 +1,23 @@
1
  from typing import Dict, List, Tuple, Any, Optional
2
  import numpy as np
3
- from logger_config import config_logger
4
 
 
5
  logger = config_logger(__name__)
6
 
7
  class ChatbotValidator:
8
  """Handles automated validation and performance analysis for the chatbot."""
9
-
10
  def __init__(self, chatbot, quality_checker):
11
  """
12
  Initialize the validator.
13
-
14
  Args:
15
  chatbot: RetrievalChatbot instance
16
  quality_checker: ResponseQualityChecker instance
17
  """
18
  self.chatbot = chatbot
19
  self.quality_checker = quality_checker
20
-
21
  # Domain-specific test queries aligned with Taskmaster-1 and Schema-Guided
22
  self.domain_queries = {
23
  'restaurant': [
@@ -59,50 +59,50 @@ class ChatbotValidator:
59
 
60
  def run_validation(
61
  self,
62
- num_examples: int = 10,
63
  top_k: int = 10,
64
  domains: Optional[List[str]] = None
65
  ) -> Dict[str, Any]:
66
  """
67
  Run comprehensive validation across specified domains.
68
-
69
  Args:
70
  num_examples: Number of test queries per domain
71
  top_k: Number of responses to retrieve for each query
72
  domains: Optional list of specific domains to test
73
-
74
  Returns:
75
  Dict containing detailed validation metrics and domain-specific performance
76
  """
77
  logger.info("\n=== Running Enhanced Automatic Validation ===")
78
-
79
  # Select domains to test
80
  test_domains = domains if domains else list(self.domain_queries.keys())
81
  metrics_history = []
82
  domain_metrics = {}
83
-
84
  # Run validation for each domain
85
  for domain in test_domains:
86
  domain_metrics[domain] = []
87
  queries = self.domain_queries[domain][:num_examples]
88
-
89
  logger.info(f"\n=== Testing {domain.title()} Domain ===")
90
-
91
  for i, query in enumerate(queries, 1):
92
  logger.info(f"\nTest Case {i}:")
93
  logger.info(f"Query: {query}")
94
-
95
  # Get responses with increased top_k
96
  responses = self.chatbot.retrieve_responses_cross_encoder(query, top_k=top_k)
97
-
98
- # Enhanced quality checking with context
99
  quality_metrics = self.quality_checker.check_response_quality(query, responses)
100
-
101
  # Add domain info
102
  quality_metrics['domain'] = domain
103
  metrics_history.append(quality_metrics)
104
  domain_metrics[domain].append(quality_metrics)
105
-
106
  # Detailed logging
107
  self._log_validation_results(query, responses, quality_metrics, i)
108
 
@@ -110,12 +110,12 @@ class ChatbotValidator:
110
  aggregate_metrics = self._calculate_aggregate_metrics(metrics_history)
111
  domain_analysis = self._analyze_domain_performance(domain_metrics)
112
  confidence_analysis = self._analyze_confidence_distribution(metrics_history)
113
-
114
  aggregate_metrics.update({
115
  'domain_performance': domain_analysis,
116
  'confidence_analysis': confidence_analysis
117
  })
118
-
119
  self._log_validation_summary(aggregate_metrics)
120
  return aggregate_metrics
121
 
@@ -129,7 +129,7 @@ class ChatbotValidator:
129
  'avg_length_score': np.mean([m.get('response_length_score', 0) for m in metrics_history]),
130
  'avg_score_gap': np.mean([m.get('top_3_score_gap', 0) for m in metrics_history]),
131
  'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics_history]),
132
-
133
  # Additional statistical metrics
134
  'median_top_score': np.median([m.get('top_score', 0) for m in metrics_history]),
135
  'score_std': np.std([m.get('top_score', 0) for m in metrics_history]),
@@ -141,7 +141,7 @@ class ChatbotValidator:
141
  def _analyze_domain_performance(self, domain_metrics: Dict[str, List[Dict]]) -> Dict[str, Dict]:
142
  """Analyze performance by domain."""
143
  domain_analysis = {}
144
-
145
  for domain, metrics in domain_metrics.items():
146
  domain_analysis[domain] = {
147
  'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics]),
@@ -150,13 +150,13 @@ class ChatbotValidator:
150
  'avg_top_score': np.mean([m.get('top_score', 0) for m in metrics]),
151
  'num_samples': len(metrics)
152
  }
153
-
154
  return domain_analysis
155
 
156
  def _analyze_confidence_distribution(self, metrics_history: List[Dict]) -> Dict[str, float]:
157
  """Analyze the distribution of confidence scores."""
158
  scores = [m.get('top_score', 0) for m in metrics_history]
159
-
160
  return {
161
  'percentile_25': np.percentile(scores, 25),
162
  'percentile_50': np.percentile(scores, 50),
@@ -180,7 +180,7 @@ class ChatbotValidator:
180
  for metric, value in metrics.items():
181
  if isinstance(value, (int, float)):
182
  logger.info(f" {metric}: {value:.4f}")
183
-
184
  logger.info("\nTop Responses:")
185
  for i, (response, score) in enumerate(responses[:3], 1):
186
  logger.info(f"{i}. Score: {score:.4f}. Response: {response}")
@@ -190,18 +190,18 @@ class ChatbotValidator:
190
  def _log_validation_summary(self, metrics: Dict[str, Any]):
191
  """Log comprehensive validation summary."""
192
  logger.info("\n=== Validation Summary ===")
193
-
194
  logger.info("\nOverall Metrics:")
195
  for metric, value in metrics.items():
196
  if isinstance(value, (int, float)):
197
  logger.info(f"{metric}: {value:.4f}")
198
-
199
  logger.info("\nDomain Performance:")
200
  for domain, domain_metrics in metrics['domain_performance'].items():
201
  logger.info(f"\n{domain.title()}:")
202
  for metric, value in domain_metrics.items():
203
  logger.info(f" {metric}: {value:.4f}")
204
-
205
  logger.info("\nConfidence Distribution:")
206
  for percentile, value in metrics['confidence_analysis'].items():
207
  logger.info(f"{percentile}: {value:.4f}")
 
1
  from typing import Dict, List, Tuple, Any, Optional
2
  import numpy as np
 
3
 
4
+ from logger_config import config_logger
5
  logger = config_logger(__name__)
6
 
7
  class ChatbotValidator:
8
  """Handles automated validation and performance analysis for the chatbot."""
9
+
10
  def __init__(self, chatbot, quality_checker):
11
  """
12
  Initialize the validator.
13
+
14
  Args:
15
  chatbot: RetrievalChatbot instance
16
  quality_checker: ResponseQualityChecker instance
17
  """
18
  self.chatbot = chatbot
19
  self.quality_checker = quality_checker
20
+
21
  # Domain-specific test queries aligned with Taskmaster-1 and Schema-Guided
22
  self.domain_queries = {
23
  'restaurant': [
 
59
 
60
  def run_validation(
61
  self,
62
+ num_examples: int = 5,
63
  top_k: int = 10,
64
  domains: Optional[List[str]] = None
65
  ) -> Dict[str, Any]:
66
  """
67
  Run comprehensive validation across specified domains.
68
+
69
  Args:
70
  num_examples: Number of test queries per domain
71
  top_k: Number of responses to retrieve for each query
72
  domains: Optional list of specific domains to test
73
+
74
  Returns:
75
  Dict containing detailed validation metrics and domain-specific performance
76
  """
77
  logger.info("\n=== Running Enhanced Automatic Validation ===")
78
+
79
  # Select domains to test
80
  test_domains = domains if domains else list(self.domain_queries.keys())
81
  metrics_history = []
82
  domain_metrics = {}
83
+
84
  # Run validation for each domain
85
  for domain in test_domains:
86
  domain_metrics[domain] = []
87
  queries = self.domain_queries[domain][:num_examples]
88
+
89
  logger.info(f"\n=== Testing {domain.title()} Domain ===")
90
+
91
  for i, query in enumerate(queries, 1):
92
  logger.info(f"\nTest Case {i}:")
93
  logger.info(f"Query: {query}")
94
+
95
  # Get responses with increased top_k
96
  responses = self.chatbot.retrieve_responses_cross_encoder(query, top_k=top_k)
97
+
98
+ # Quality check the retrieved responses (validation queries carry no conversation context)
99
  quality_metrics = self.quality_checker.check_response_quality(query, responses)
100
+
101
  # Add domain info
102
  quality_metrics['domain'] = domain
103
  metrics_history.append(quality_metrics)
104
  domain_metrics[domain].append(quality_metrics)
105
+
106
  # Detailed logging
107
  self._log_validation_results(query, responses, quality_metrics, i)
108
 
 
110
  aggregate_metrics = self._calculate_aggregate_metrics(metrics_history)
111
  domain_analysis = self._analyze_domain_performance(domain_metrics)
112
  confidence_analysis = self._analyze_confidence_distribution(metrics_history)
113
+
114
  aggregate_metrics.update({
115
  'domain_performance': domain_analysis,
116
  'confidence_analysis': confidence_analysis
117
  })
118
+
119
  self._log_validation_summary(aggregate_metrics)
120
  return aggregate_metrics
121
 
 
129
  'avg_length_score': np.mean([m.get('response_length_score', 0) for m in metrics_history]),
130
  'avg_score_gap': np.mean([m.get('top_3_score_gap', 0) for m in metrics_history]),
131
  'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics_history]),
132
+
133
  # Additional statistical metrics
134
  'median_top_score': np.median([m.get('top_score', 0) for m in metrics_history]),
135
  'score_std': np.std([m.get('top_score', 0) for m in metrics_history]),
 
141
  def _analyze_domain_performance(self, domain_metrics: Dict[str, List[Dict]]) -> Dict[str, Dict]:
142
  """Analyze performance by domain."""
143
  domain_analysis = {}
144
+
145
  for domain, metrics in domain_metrics.items():
146
  domain_analysis[domain] = {
147
  'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics]),
 
150
  'avg_top_score': np.mean([m.get('top_score', 0) for m in metrics]),
151
  'num_samples': len(metrics)
152
  }
153
+
154
  return domain_analysis
155
 
156
  def _analyze_confidence_distribution(self, metrics_history: List[Dict]) -> Dict[str, float]:
157
  """Analyze the distribution of confidence scores."""
158
  scores = [m.get('top_score', 0) for m in metrics_history]
159
+
160
  return {
161
  'percentile_25': np.percentile(scores, 25),
162
  'percentile_50': np.percentile(scores, 50),
 
180
  for metric, value in metrics.items():
181
  if isinstance(value, (int, float)):
182
  logger.info(f" {metric}: {value:.4f}")
183
+
184
  logger.info("\nTop Responses:")
185
  for i, (response, score) in enumerate(responses[:3], 1):
186
  logger.info(f"{i}. Score: {score:.4f}. Response: {response}")
 
190
  def _log_validation_summary(self, metrics: Dict[str, Any]):
191
  """Log comprehensive validation summary."""
192
  logger.info("\n=== Validation Summary ===")
193
+
194
  logger.info("\nOverall Metrics:")
195
  for metric, value in metrics.items():
196
  if isinstance(value, (int, float)):
197
  logger.info(f"{metric}: {value:.4f}")
198
+
199
  logger.info("\nDomain Performance:")
200
  for domain, domain_metrics in metrics['domain_performance'].items():
201
  logger.info(f"\n{domain.title()}:")
202
  for metric, value in domain_metrics.items():
203
  logger.info(f" {metric}: {value:.4f}")
204
+
205
  logger.info("\nConfidence Distribution:")
206
  for percentile, value in metrics['confidence_analysis'].items():
207
  logger.info(f"{percentile}: {value:.4f}")
conversation_summarizer.py CHANGED
@@ -49,7 +49,15 @@ class Summarizer(DeviceAwareModel):
49
  Handles long conversations by intelligent chunking and progressive summarization.
50
  """
51
 
52
- def __init__(self, model_name="t5-small", max_summary_length=128, device=None, max_summary_rounds=2):
 
 
 
53
  self.setup_device(device)
54
 
55
  # Initialize model within strategy scope if using distribution
@@ -63,12 +71,11 @@ class Summarizer(DeviceAwareModel):
63
  self.max_summary_rounds = max_summary_rounds
64
 
65
  def _setup_model(self, model_name):
66
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
67
  self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
68
 
69
  # Optimize model for inference
70
- self.model.predict = tf.function(
71
- self.model.predict,
72
  input_signature=[
73
  {
74
  'input_ids': tf.TensorSpec(shape=[None, None], dtype=tf.int32),
 
49
  Handles long conversations by intelligent chunking and progressive summarization.
50
  """
51
 
52
+ def __init__(
53
+ self,
54
+ tokenizer: AutoTokenizer,
55
+ model_name="t5-small",
56
+ max_summary_length=128,
57
+ device=None,
58
+ max_summary_rounds=2
59
+ ):
60
+ self.tokenizer = tokenizer # Injected tokenizer
61
  self.setup_device(device)
62
 
63
  # Initialize model within strategy scope if using distribution
 
71
  self.max_summary_rounds = max_summary_rounds
72
 
73
  def _setup_model(self, model_name):
 
74
  self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
75
 
76
  # Optimize model for inference
77
+ self.model.generate = tf.function(
78
+ self.model.generate,
79
  input_signature=[
80
  {
81
  'input_ids': tf.TensorSpec(shape=[None, None], dtype=tf.int32),
environment_setup.py CHANGED
@@ -122,15 +122,6 @@ class EnvironmentSetup:
122
  except (subprocess.SubprocessError, FileNotFoundError):
123
  logger.warning("Could not detect specific GPU model")
124
 
125
- # # Enable XLA
126
- # tf.config.optimizer.set_jit(True)
127
- # logger.info("XLA compilation enabled for Colab GPU")
128
-
129
- # # Set mixed precision policy
130
- # policy = tf.keras.mixed_precision.Policy('mixed_float16')
131
- # tf.keras.mixed_precision.set_global_policy(policy)
132
- # logger.info("Mixed precision training enabled (float16)")
133
-
134
  strategy = tf.distribute.OneDeviceStrategy("/GPU:0")
135
  return "GPU", strategy
136
 
 
122
  except (subprocess.SubprocessError, FileNotFoundError):
123
  logger.warning("Could not detect specific GPU model")
124
 
 
 
 
 
 
 
 
 
 
125
  strategy = tf.distribute.OneDeviceStrategy("/GPU:0")
126
  return "GPU", strategy
127
 
run_data_preparer.py → prepare_data.py RENAMED
@@ -1,6 +1,7 @@
1
  import os
2
  import sys
3
  import faiss
 
4
  import pickle
5
  from transformers import AutoTokenizer
6
  from tqdm.auto import tqdm
@@ -52,36 +53,24 @@ def main():
52
  config = ChatbotConfig()
53
  logger.info(f"Chatbot Configuration: {config}")
54
 
55
- # Initialize tokenizer
56
  try:
57
  tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
58
  logger.info(f"Tokenizer '{config.pretrained_model}' loaded successfully.")
59
- except Exception as e:
60
- logger.error(f"Failed to load tokenizer: {e}")
61
- sys.exit(1)
62
-
63
- # Add special tokens
64
- try:
65
  tokenizer.add_special_tokens({'additional_special_tokens': ['<EMPTY_NEGATIVE>']})
66
  logger.info("Added special tokens to tokenizer.")
67
  except Exception as e:
68
- logger.error(f"Failed to add special tokens: {e}")
69
  sys.exit(1)
70
 
71
- # Initialize encoder model
72
  try:
73
  encoder = EncoderModel(config=config)
74
  logger.info("EncoderModel initialized successfully.")
75
- except Exception as e:
76
- logger.error(f"Failed to initialize EncoderModel: {e}")
77
- sys.exit(1)
78
-
79
- # Resize token embeddings in encoder to match tokenizer
80
- try:
81
  encoder.pretrained.resize_token_embeddings(len(tokenizer))
82
  logger.info(f"Token embeddings resized to: {len(tokenizer)}")
83
  except Exception as e:
84
- logger.error(f"Failed to resize token embeddings: {e}")
85
  sys.exit(1)
86
 
87
  # Load JSON dialogues
@@ -116,6 +105,8 @@ def main():
116
  max_length=config.max_context_token_limit,
117
  neg_samples=config.neg_samples,
118
  query_embeddings_cache=query_embeddings_cache,
 
 
119
  max_retries=config.max_retries
120
  )
121
  logger.info("TFDataPipeline initialized successfully.")
@@ -135,17 +126,22 @@ def main():
135
  # Compute and add response embeddings to FAISS index
136
  try:
137
  logger.info("Computing and adding response embeddings to FAISS index...")
138
- data_pipeline._compute_and_index_response_embeddings()
139
  logger.info("Response embeddings computed and added to FAISS index.")
140
  except Exception as e:
141
  logger.error(f"Failed to compute or add response embeddings: {e}")
142
  sys.exit(1)
143
 
144
- # Save FAISS index
145
  try:
146
  logger.info(f"Saving FAISS index to {FAISS_INDEX_PATH}...")
147
  faiss.write_index(data_pipeline.index, FAISS_INDEX_PATH)
148
  logger.info("FAISS index saved successfully.")
 
 
 
 
 
149
  except Exception as e:
150
  logger.error(f"Failed to save FAISS index: {e}")
151
  sys.exit(1)
 
1
  import os
2
  import sys
3
  import faiss
4
+ import json
5
  import pickle
6
  from transformers import AutoTokenizer
7
  from tqdm.auto import tqdm
 
53
  config = ChatbotConfig()
54
  logger.info(f"Chatbot Configuration: {config}")
55
 
56
+ # Initialize tokenizer and add special tokens
57
  try:
58
  tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
59
  logger.info(f"Tokenizer '{config.pretrained_model}' loaded successfully.")
 
 
 
 
 
 
60
  tokenizer.add_special_tokens({'additional_special_tokens': ['<EMPTY_NEGATIVE>']})
61
  logger.info("Added special tokens to tokenizer.")
62
  except Exception as e:
63
+ logger.error(f"Failed to load tokenizer: {e}")
64
  sys.exit(1)
65
 
66
+ # Initialize encoder model and resize token embeddings
67
  try:
68
  encoder = EncoderModel(config=config)
69
  logger.info("EncoderModel initialized successfully.")
 
 
 
 
 
 
70
  encoder.pretrained.resize_token_embeddings(len(tokenizer))
71
  logger.info(f"Token embeddings resized to: {len(tokenizer)}")
72
  except Exception as e:
73
+ logger.error(f"Failed to initialize EncoderModel: {e}")
74
  sys.exit(1)
75
 
76
  # Load JSON dialogues
 
105
  max_length=config.max_context_token_limit,
106
  neg_samples=config.neg_samples,
107
  query_embeddings_cache=query_embeddings_cache,
108
+ index_type='IndexFlatIP',
109
+ nlist=100,
110
  max_retries=config.max_retries
111
  )
112
  logger.info("TFDataPipeline initialized successfully.")
 
126
  # Compute and add response embeddings to FAISS index
127
  try:
128
  logger.info("Computing and adding response embeddings to FAISS index...")
129
+ data_pipeline.compute_and_index_response_embeddings()
130
  logger.info("Response embeddings computed and added to FAISS index.")
131
  except Exception as e:
132
  logger.error(f"Failed to compute or add response embeddings: {e}")
133
  sys.exit(1)
134
 
135
+ # Save FAISS index and response pool
136
  try:
137
  logger.info(f"Saving FAISS index to {FAISS_INDEX_PATH}...")
138
  faiss.write_index(data_pipeline.index, FAISS_INDEX_PATH)
139
  logger.info("FAISS index saved successfully.")
140
+
141
+ response_pool_path = FAISS_INDEX_PATH.replace('.index', '_responses.json')
142
+ with open(response_pool_path, 'w', encoding='utf-8') as f:
143
+ json.dump(data_pipeline.response_pool, f, indent=2)
144
+ logger.info(f"Response pool saved to {response_pool_path}.")
145
  except Exception as e:
146
  logger.error(f"Failed to save FAISS index: {e}")
147
  sys.exit(1)
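As a quick sanity check after this step, the saved artifacts can be reloaded and cross-checked (a sketch; assumes FAISS_INDEX_PATH ends in '.index', matching the replace() above):

index = faiss.read_index(FAISS_INDEX_PATH)
with open(FAISS_INDEX_PATH.replace('.index', '_responses.json'), 'r', encoding='utf-8') as f:
    responses = json.load(f)
assert index.ntotal == len(responses), "index size and response pool size should match"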
response_quality_checker.py CHANGED
@@ -6,14 +6,14 @@ from logger_config import config_logger
6
  logger = config_logger(__name__)
7
 
8
  if TYPE_CHECKING:
9
- from chatbot_model import RetrievalChatbot
10
 
11
  class ResponseQualityChecker:
12
  """Enhanced quality checking with dynamic thresholds."""
13
-
14
  def __init__(
15
  self,
16
- chatbot: 'RetrievalChatbot',
17
  confidence_threshold: float = 0.6,
18
  diversity_threshold: float = 0.15,
19
  min_response_length: int = 5,
@@ -23,15 +23,15 @@ class ResponseQualityChecker:
23
  self.diversity_threshold = diversity_threshold
24
  self.min_response_length = min_response_length
25
  self.similarity_cap = similarity_cap
26
- self.chatbot = chatbot
27
-
28
  # Dynamic thresholds based on response patterns
29
  self.thresholds = {
30
  'relevance': 0.35,
31
  'length_score': 0.85,
32
  'score_gap': 0.07
33
  }
34
-
35
  def check_response_quality(
36
  self,
37
  query: str,
@@ -39,11 +39,11 @@ class ResponseQualityChecker:
39
  ) -> Dict[str, Any]:
40
  """
41
  Evaluate the quality of responses based on various metrics.
42
-
43
  Args:
44
  query: The user's query
45
  responses: List of (response_text, score) tuples
46
-
47
  Returns:
48
  Dict containing quality metrics and confidence assessment
49
  """
@@ -56,7 +56,7 @@ class ResponseQualityChecker:
56
  'response_length_score': 0.0,
57
  'top_3_score_gap': 0.0
58
  }
59
-
60
  # Calculate core metrics
61
  metrics = {
62
  'response_diversity': self.calculate_diversity(responses),
@@ -67,10 +67,10 @@ class ResponseQualityChecker:
67
  'top_score': responses[0][1],
68
  'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
69
  }
70
-
71
  # Determine confidence using thresholds
72
  metrics['is_confident'] = self._determine_confidence(metrics)
73
-
74
  logger.info(f"Quality metrics: {metrics}")
75
  return metrics
76
 
@@ -78,44 +78,45 @@ class ResponseQualityChecker:
78
  """Calculate relevance as weighted similarity between query and responses."""
79
  if not responses:
80
  return 0.0
81
-
82
  # Get embeddings
83
- query_embedding = self.encode_query(query)
84
- response_embeddings = [self.encode_text(response) for response, _ in responses]
85
-
86
- # Compute similarities with decreasing weights for later responses
 
87
  similarities = cosine_similarity([query_embedding], response_embeddings)[0]
 
 
88
  weights = np.array([1.0 / (i + 1) for i in range(len(similarities))])
89
-
90
  return np.average(similarities, weights=weights)
91
 
92
  def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
93
  """Calculate diversity with length normalization and similarity capping."""
94
  if not responses:
95
  return 0.0
96
-
97
- embeddings = [self.encode_text(response) for response, _ in responses]
 
98
  if len(embeddings) < 2:
99
  return 1.0
100
-
101
- # Calculate similarities and apply cap
102
  similarity_matrix = cosine_similarity(embeddings)
 
 
 
103
  similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)
104
-
105
- # Apply length normalization
106
- lengths = [len(resp[0].split()) for resp in responses]
107
- length_ratios = np.array([min(a, b) / max(a, b) for a in lengths for b in lengths])
108
- length_ratios = length_ratios.reshape(len(responses), len(responses))
109
-
110
- # Combine factors with weights
111
- adjusted_similarity = (similarity_matrix * 0.7 + length_ratios * 0.3)
112
-
113
- # Calculate final score
114
- sum_similarities = np.sum(adjusted_similarity) - len(responses)
115
- num_pairs = len(responses) * (len(responses) - 1)
116
  avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0
117
-
118
- return 1 - avg_similarity
 
 
119
 
120
  def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
121
  """Determine confidence using primary and secondary conditions."""
@@ -125,20 +126,20 @@ class ResponseQualityChecker:
125
  metrics['response_diversity'] >= self.diversity_threshold,
126
  metrics['response_length_score'] >= self.thresholds['length_score']
127
  ]
128
-
129
  # Secondary conditions (majority must be met)
130
  secondary_conditions = [
131
  metrics['query_response_relevance'] >= self.thresholds['relevance'],
132
  metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
133
  metrics['top_score'] >= (self.confidence_threshold * 1.1) # Extra confidence boost
134
  ]
135
-
136
  return all(primary_conditions) and sum(secondary_conditions) >= 2
137
 
138
  def _calculate_length_score(self, response: str) -> float:
139
  """Calculate length score with penalty for very short or long responses."""
140
  words = len(response.split())
141
-
142
  if words < self.min_response_length:
143
  return words / self.min_response_length
144
  elif words > 50: # Penalty for very long responses
@@ -150,21 +151,4 @@ class ResponseQualityChecker:
150
  if len(scores) < top_n + 1:
151
  return 0.0
152
  gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
153
- return np.mean(gaps)
154
-
155
- def encode_text(self, text: str) -> np.ndarray:
156
- """Encode response text to embedding."""
157
- embedding_tensor = self.chatbot.encode_responses([text])
158
- embedding = embedding_tensor.numpy()[0].astype('float32')
159
- return self._normalize_embedding(embedding)
160
-
161
- def encode_query(self, query: str) -> np.ndarray:
162
- """Encode query text to embedding."""
163
- embedding_tensor = self.chatbot.encode_query(query)
164
- embedding = embedding_tensor.numpy()[0].astype('float32')
165
- return self._normalize_embedding(embedding)
166
-
167
- def _normalize_embedding(self, embedding: np.ndarray) -> np.ndarray:
168
- """Normalize embedding vector."""
169
- norm = np.linalg.norm(embedding)
170
- return embedding / norm if norm > 0 else embedding
 
6
  logger = config_logger(__name__)
7
 
8
  if TYPE_CHECKING:
9
+ from tf_data_pipeline import TFDataPipeline
10
 
11
  class ResponseQualityChecker:
12
  """Enhanced quality checking with dynamic thresholds."""
13
+
14
  def __init__(
15
  self,
16
+ data_pipeline: 'TFDataPipeline',
17
  confidence_threshold: float = 0.6,
18
  diversity_threshold: float = 0.15,
19
  min_response_length: int = 5,
 
23
  self.diversity_threshold = diversity_threshold
24
  self.min_response_length = min_response_length
25
  self.similarity_cap = similarity_cap
26
+ self.data_pipeline = data_pipeline # Reference to TFDataPipeline
27
+
28
  # Dynamic thresholds based on response patterns
29
  self.thresholds = {
30
  'relevance': 0.35,
31
  'length_score': 0.85,
32
  'score_gap': 0.07
33
  }
34
+
35
  def check_response_quality(
36
  self,
37
  query: str,
 
39
  ) -> Dict[str, Any]:
40
  """
41
  Evaluate the quality of responses based on various metrics.
42
+
43
  Args:
44
  query: The user's query
45
  responses: List of (response_text, score) tuples
46
+
47
  Returns:
48
  Dict containing quality metrics and confidence assessment
49
  """
 
56
  'response_length_score': 0.0,
57
  'top_3_score_gap': 0.0
58
  }
59
+
60
  # Calculate core metrics
61
  metrics = {
62
  'response_diversity': self.calculate_diversity(responses),
 
67
  'top_score': responses[0][1],
68
  'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
69
  }
70
+
71
  # Determine confidence using thresholds
72
  metrics['is_confident'] = self._determine_confidence(metrics)
73
+
74
  logger.info(f"Quality metrics: {metrics}")
75
  return metrics
76
 
 
78
  """Calculate relevance as weighted similarity between query and responses."""
79
  if not responses:
80
  return 0.0
81
+
82
  # Get embeddings
83
+ query_embedding = self.data_pipeline.encode_query(query)
84
+ response_texts = [resp for resp, _ in responses]
85
+ response_embeddings = self.data_pipeline.encode_responses(response_texts)
86
+
87
+ # Compute similarities
88
  similarities = cosine_similarity([query_embedding], response_embeddings)[0]
89
+
90
+ # Apply decreasing weights for later responses
91
  weights = np.array([1.0 / (i + 1) for i in range(len(similarities))])
92
+
93
  return np.average(similarities, weights=weights)
94
 
95
  def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
96
  """Calculate diversity with length normalization and similarity capping."""
97
  if not responses:
98
  return 0.0
99
+
100
+ response_texts = [resp for resp, _ in responses]
101
+ embeddings = self.data_pipeline.encode_responses(response_texts)
102
  if len(embeddings) < 2:
103
  return 1.0
104
+
105
+ # Calculate pairwise cosine similarities
106
  similarity_matrix = cosine_similarity(embeddings)
107
+ np.fill_diagonal(similarity_matrix, 0) # Exclude self-similarity
108
+
109
+ # Apply similarity cap
110
  similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)
111
+
112
+ # Calculate average similarity
113
+ sum_similarities = np.sum(similarity_matrix)
114
+ num_pairs = len(embeddings) * (len(embeddings) - 1)
 
 
 
115
  avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0
116
+
117
+ # Diversity is inversely related to average similarity
118
+ diversity_score = 1 - avg_similarity
119
+ return diversity_score
120
 
121
  def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
122
  """Determine confidence using primary and secondary conditions."""
 
126
  metrics['response_diversity'] >= self.diversity_threshold,
127
  metrics['response_length_score'] >= self.thresholds['length_score']
128
  ]
129
+
130
  # Secondary conditions (majority must be met)
131
  secondary_conditions = [
132
  metrics['query_response_relevance'] >= self.thresholds['relevance'],
133
  metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
134
  metrics['top_score'] >= (self.confidence_threshold * 1.1) # Extra confidence boost
135
  ]
136
+
137
  return all(primary_conditions) and sum(secondary_conditions) >= 2
138
 
139
  def _calculate_length_score(self, response: str) -> float:
140
  """Calculate length score with penalty for very short or long responses."""
141
  words = len(response.split())
142
+
143
  if words < self.min_response_length:
144
  return words / self.min_response_length
145
  elif words > 50: # Penalty for very long responses
 
151
  if len(scores) < top_n + 1:
152
  return 0.0
153
  gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
154
+ return np.mean(gaps)
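A minimal usage sketch for the refactored checker, which now depends on the data pipeline rather than the chatbot (texts and scores are illustrative):

checker = ResponseQualityChecker(data_pipeline=data_pipeline)
responses = [("We have a table at 7pm, does that work?", 0.82),
             ("Could you tell me the party size?", 0.74)]
metrics = checker.check_response_quality("Book a table for two tonight", responses)
if metrics['is_confident']:
    print(responses[0][0])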
 
 
 
 
 
test_trained_model.py DELETED
File without changes
tf_data_pipeline.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
11
  from typing import Union, Optional, List, Tuple, Generator
12
  from transformers import AutoTokenizer
13
  from typing import List, Tuple, Generator
 
14
  from gpu_monitor import GPUMemoryMonitor
15
 
16
  from logger_config import config_logger
@@ -31,7 +32,6 @@ class TFDataPipeline:
31
  nlist: int = 100,
32
  max_retries: int = 3
33
  ):
34
- #self.embedding_batch_size = embedding_batch_size
35
  self.config = config
36
  self.tokenizer = tokenizer
37
  self.encoder = encoder
@@ -64,14 +64,6 @@ class TFDataPipeline:
64
  dimension = self.query_embeddings_cache[next(iter(self.query_embeddings_cache))].shape[0]
65
  self.index.train(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
66
  self.index.add(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
67
-
68
- def validate_faiss_index(self):
69
- """Validates that the FAISS index has the correct dimensionality."""
70
- expected_dim = self.encoder.config.embedding_dim
71
- if self.index.d != expected_dim:
72
- logger.error(f"FAISS index dimension {self.index.d} does not match encoder embedding dimension {expected_dim}.")
73
- raise ValueError("FAISS index dimensionality mismatch.")
74
- logger.info("FAISS index dimension validated successfully.")
75
 
76
  def save_embeddings_cache_hdf5(self, cache_file_path: str):
77
  """Save the embeddings cache to an HDF5 file."""
@@ -92,8 +84,21 @@ class TFDataPipeline:
92
  logger.info(f"FAISS index saved to {index_file_path}")
93
 
94
  def load_faiss_index(self, index_file_path: str):
95
- self.index = faiss.read_index(index_file_path)
96
- logger.info(f"FAISS index loaded from {index_file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  def save_tokenizer(self, tokenizer_dir: str):
99
  self.tokenizer.save_pretrained(tokenizer_dir)
@@ -102,19 +107,6 @@ class TFDataPipeline:
102
  def load_tokenizer(self, tokenizer_dir: str):
103
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
104
  logger.info(f"Tokenizer loaded from {tokenizer_dir}")
105
-
106
- def estimate_total_pairs(self, dialogues: List[dict]) -> int:
107
- """Estimate total number of training pairs including hard negatives."""
108
- base_pairs = sum(
109
- len([
110
- 1 for i in range(len(d.get('turns', [])) - 1)
111
- if (d['turns'][i].get('speaker') == 'user' and
112
- d['turns'][i+1].get('speaker') == 'assistant')
113
- ])
114
- for d in dialogues
115
- )
116
- # Account for hard negatives
117
- return base_pairs * (1 + self.neg_samples)
118
 
119
  @staticmethod
120
  def load_json_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
@@ -179,7 +171,7 @@ class TFDataPipeline:
179
 
180
  return pairs
181
 
182
- def _compute_and_index_response_embeddings(self):
183
  """
184
  Computes embeddings for the response pool and adds them to the FAISS index with progress bars.
185
  """
@@ -239,49 +231,6 @@ class TFDataPipeline:
239
 
240
  # **Sanity Check:** Verify the number of embeddings in FAISS index
241
  logger.info(f"Total embeddings in FAISS index after addition: {self.index.ntotal}")
242
- # def _compute_and_index_response_embeddings(self):
243
- # """
244
- # Computes embeddings for the response pool and adds them to the FAISS index.
245
- # """
246
- # logger.info("Computing embeddings for the response pool...")
247
-
248
- # # Ensure all responses are strings
249
- # if not all(isinstance(response, str) for response in self.response_pool):
250
- # logger.error("All elements in response_pool must be strings.")
251
- # raise ValueError("Invalid data type in response_pool.")
252
-
253
- # # Proceed with tokenization
254
- # encoded_responses = self.tokenizer(
255
- # self.response_pool,
256
- # padding=True,
257
- # truncation=True,
258
- # max_length=self.max_length,
259
- # return_tensors='tf'
260
- # )
261
- # response_ids = encoded_responses['input_ids']
262
-
263
- # # Compute embeddings in batches
264
- # batch_size = getattr(self, 'embedding_batch_size', 64) # Default to 64 if not set
265
- # embeddings = []
266
- # for i in range(0, len(response_ids), batch_size):
267
- # batch_ids = response_ids[i:i+batch_size]
268
- # # Compute embeddings
269
- # batch_embeddings = self.encoder(batch_ids, training=False).numpy()
270
- # # Normalize embeddings if using inner product or cosine similarity
271
- # faiss.normalize_L2(batch_embeddings)
272
- # embeddings.append(batch_embeddings)
273
-
274
- # if embeddings:
275
- # embeddings = np.vstack(embeddings).astype(np.float32)
276
- # # Add embeddings to FAISS index
277
- # logger.info(f"Adding {len(embeddings)} response embeddings to FAISS index...")
278
- # self.index.add(embeddings)
279
- # logger.info("Response embeddings added to FAISS index.")
280
- # else:
281
- # logger.warning("No embeddings to add to FAISS index.")
282
-
283
- # # **Sanity Check:** Verify the number of embeddings in FAISS index
284
- # logger.info(f"Total embeddings in FAISS index after addition: {self.index.ntotal}")
285
 
286
  def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
287
  """Find hard negatives for a batch of queries with error handling and retries."""
@@ -355,58 +304,109 @@ class TFDataPipeline:
355
  if tf.config.list_physical_devices('GPU'):
356
  tf.keras.backend.clear_session()
357
 
358
- def _tokenize_and_encode(self, queries: List[str], positives: List[str], negatives: List[List[str]]) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
359
  """
360
- Tokenize and encode the queries, positives, and negatives.
 
 
361
  Returns:
362
- query_ids: [batch_size, max_length]
363
- positive_ids: [batch_size, max_length]
364
- negative_ids: [batch_size, neg_samples, max_length]
365
  """
366
- # Tokenize queries
367
- q_enc = self.tokenizer(
368
- queries,
369
- padding="max_length",
370
- truncation=True,
371
- max_length=self.max_length,
372
- return_tensors="np"
373
- )
374
- # Tokenize positives
375
- p_enc = self.tokenizer(
376
- positives,
377
- padding="max_length",
 
 
 
 
378
  truncation=True,
379
  max_length=self.max_length,
380
- return_tensors="np"
381
  )
382
- # Tokenize negatives
383
- # Flatten negatives
384
- flattened_negatives = [neg for sublist in negatives for neg in sublist]
385
- if len(flattened_negatives) == 0:
386
- # No negatives at all: return a zero array
387
- n_ids = np.zeros((len(queries), self.neg_samples, self.max_length), dtype=np.int32)
388
- else:
389
- n_enc = self.tokenizer(
390
- flattened_negatives,
391
- padding="max_length",
392
- truncation=True,
393
- max_length=self.max_length,
394
- return_tensors="np"
395
- )
396
- n_input_ids = n_enc["input_ids"]
397
-
398
- # Reshape to [batch_size, neg_samples, max_length]
399
- batch_size = len(queries)
400
- n_ids = n_input_ids.reshape(batch_size, self.neg_samples, self.max_length)
401
 
402
- # Convert to int32
403
- query_ids = q_enc["input_ids"].astype(np.int32)
404
- positive_ids = p_enc["input_ids"].astype(np.int32)
405
- negative_ids = n_ids.astype(np.int32)
406
 
407
- return query_ids, positive_ids, negative_ids
 
 
408
 
409
- # Testing updated batch tokenization
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
411
  """
412
  Processes dialogues in batches and saves to a TFRecord file using optimized batch tokenization and encoding.
@@ -522,83 +522,6 @@ class TFDataPipeline:
522
  pbar.update(1)
523
 
524
  logger.info(f"Data preparation complete. TFRecord saved.")
525
- # def prepare_and_save_data(self, dialogues: List[dict], tfrecord_file_path: str, batch_size: int = 32):
526
- # """Processes dialogues in batches and saves to a TFRecord file."""
527
- # with tf.io.TFRecordWriter(tfrecord_file_path) as writer:
528
- # total_dialogues = len(dialogues)
529
- # logger.debug(f"Total dialogues to process: {total_dialogues}")
530
-
531
- # with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
532
- # for i in range(0, total_dialogues, batch_size):
533
- # batch_dialogues = dialogues[i:i+batch_size]
534
- # # Process each batch_dialogues
535
- # # Extract pairs, find negatives, tokenize, and serialize
536
- # # Example:
537
- # for dialogue in batch_dialogues:
538
- # pairs = self._extract_pairs_from_dialogue(dialogue)
539
- # queries = []
540
- # positives = []
541
-
542
- # for query, positive in pairs:
543
- # queries.append(query)
544
- # positives.append(positive)
545
-
546
- # if queries:
547
- # # **Compute and cache query embeddings before searching**
548
- # self._compute_embeddings(queries)
549
-
550
- # # Find hard negatives
551
- # hard_negatives = self._find_hard_negatives_batch(queries, positives)
552
-
553
- # # for idx, negatives in enumerate(hard_negatives[:5]): # Log first 5 examples
554
- # # logger.debug(f"Query: {queries[idx]}")
555
- # # logger.debug(f"Positive: {positives[idx]}")
556
- # # logger.debug(f"Hard Negatives: {negatives}")
557
- # # Tokenize and encode
558
- # query_ids, positive_ids, negative_ids = self._tokenize_and_encode(queries, positives, hard_negatives)
559
-
560
- # # Serialize each example and write to TFRecord
561
- # for q_id, p_id, n_id in zip(query_ids, positive_ids, negative_ids):
562
- # feature = {
563
- # 'query_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=q_id)),
564
- # 'positive_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=p_id)),
565
- # 'negative_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=n_id.flatten())),
566
- # }
567
- # example = tf.train.Example(features=tf.train.Features(feature=feature))
568
- # writer.write(example.SerializeToString())
569
-
570
- # pbar.update(len(batch_dialogues))
571
- # logger.info(f"Data preparation complete. TFRecord saved at {tfrecord_file_path}")
572
-
573
- def _tokenize_negatives_tf(self, negatives):
574
- """Tokenizes negatives using tf.py_function."""
575
- # Handle the case where negatives is an empty tensor
576
- if tf.size(negatives) == 0:
577
- return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)
578
-
579
- # Convert EagerTensor to a list of strings
580
- negatives_list = []
581
- for neg_list in negatives.numpy():
582
- decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg] # Filter out empty strings
583
- negatives_list.append(decoded_negs)
584
-
585
- # Flatten the list of lists
586
- flattened_negatives = [neg for sublist in negatives_list for neg in sublist]
587
-
588
- # Tokenize the flattened negatives
589
- if flattened_negatives:
590
- n_tokens = self.tokenizer(
591
- flattened_negatives,
592
- padding='max_length',
593
- truncation=True,
594
- max_length=self.max_length,
595
- return_tensors='tf'
596
- )
597
- # Reshape the tokens
598
- n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [-1, self.neg_samples, self.max_length])
599
- return n_tokens_reshaped
600
- else:
601
- return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)
602
 
603
  def _compute_embeddings(self, queries: List[str]) -> None:
604
  new_queries = [q for q in queries if q not in self.query_embeddings_cache]
@@ -642,51 +565,6 @@ class TFDataPipeline:
642
  hard_negatives = self._find_hard_negatives_batch([query], [positive])[0]
643
  yield (query, positive, hard_negatives)
644
  pbar.update(1)
645
-
646
- def _prepare_batch(self, queries: tf.Tensor, positives: tf.Tensor, negatives: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
647
- """Prepares a batch of data for training."""
648
-
649
- # Convert EagerTensors to lists of strings
650
- queries_list = [query.decode("utf-8") for query in queries.numpy()]
651
- positives_list = [pos.decode("utf-8") for pos in positives.numpy()]
652
-
653
- # Tokenize queries and positives
654
- q_tokens = self.tokenizer(queries_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
655
- p_tokens = self.tokenizer(positives_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
656
-
657
- # Decode negatives and ensure they are lists of strings
658
- negatives_list = []
659
- for neg_list in negatives.numpy():
660
- decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg] # Filter out empty strings
661
- negatives_list.append(decoded_negs)
662
-
663
- # Flatten negatives for tokenization if there are any valid negatives
664
- flattened_negatives = [neg for sublist in negatives_list for neg in sublist if neg]
665
-
666
- # Tokenize negatives if there are any
667
- n_tokens_reshaped = None
668
- if flattened_negatives:
669
- n_tokens = self.tokenizer(flattened_negatives, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
670
-
671
- # Reshape n_tokens to match the expected shape based on the number of negatives per query
672
- # This part may need adjustment if the number of negatives varies per query
673
- n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [len(queries_list), -1, self.max_length])
674
- else:
675
- # Create a placeholder tensor for the case where there are no negatives
676
- n_tokens_reshaped = tf.zeros([len(queries_list), 0, self.max_length], dtype=tf.int32)
677
-
678
- # Ensure n_tokens_reshaped has a consistent shape even when there are no negatives
679
- # Adjust shape to [batch_size, num_neg_samples, max_length]
680
- if n_tokens_reshaped.shape[1] != self.neg_samples:
681
- # Pad or truncate the second dimension to match neg_samples
682
- padding = tf.zeros([len(queries_list), tf.maximum(0, self.neg_samples - n_tokens_reshaped.shape[1]), self.max_length], dtype=tf.int32)
683
- n_tokens_reshaped = tf.concat([n_tokens_reshaped, padding], axis=1)
684
- n_tokens_reshaped = n_tokens_reshaped[:, :self.neg_samples, :]
685
-
686
- # Concatenate the positive and negative examples along the 'neg_samples' dimension
687
- combined_p_n_tokens = tf.concat([tf.expand_dims(p_tokens['input_ids'], axis=1), n_tokens_reshaped], axis=1)
688
-
689
- return q_tokens['input_ids'], combined_p_n_tokens
690
 
691
  def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
692
  """
@@ -714,32 +592,6 @@ class TFDataPipeline:
714
 
715
  dataset = dataset.prefetch(tf.data.AUTOTUNE)
716
  return dataset
717
- # def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
718
- # """
719
- # Creates a tf.data.Dataset for streaming training that yields
720
- # (input_ids_query, input_ids_positive, input_ids_negatives).
721
- # """
722
- # # 1) Start with a generator dataset
723
- # dataset = tf.data.Dataset.from_generator(
724
- # lambda: self.data_generator(dialogues),
725
- # output_signature=(
726
- # tf.TensorSpec(shape=(), dtype=tf.string), # Query (single string)
727
- # tf.TensorSpec(shape=(), dtype=tf.string), # Positive (single string)
728
- # tf.TensorSpec(shape=(None,), dtype=tf.string) # Hard Negatives (list of strings)
729
- # )
730
- # )
731
-
732
- # # 2) Batch the raw strings
733
- # dataset = dataset.batch(batch_size)
734
-
735
- # # 3) Now map them through a tokenize step (via py_function)
736
- # dataset = dataset.map(
737
- # lambda q, p, n: self._tokenize_triple(q, p, n),
738
- # num_parallel_calls=1 #tf.data.AUTOTUNE
739
- # )
740
-
741
- # dataset = dataset.prefetch(tf.data.AUTOTUNE)
742
- # return dataset
743
 
744
  def _tokenize_triple(
745
  self,
@@ -861,71 +713,3 @@ class TFDataPipeline:
861
  n_ids = n_ids.astype(np.int32) # shape [batch_size, neg_samples, max_len]
862
 
863
  return q_ids, p_ids, n_ids
864
-
865
- # def parse_tfrecord_fn(example_proto, max_length, neg_samples):
866
- # """
867
- # Parses a single TFRecord example.
868
-
869
- # Args:
870
- # example_proto: A serialized TFRecord example.
871
- # max_length: The maximum sequence length for tokenization.
872
- # neg_samples: The number of hard negatives per query.
873
-
874
- # Returns:
875
- # A tuple of (query_ids, positive_ids, negative_ids).
876
- # """
877
- # feature_description = {
878
- # 'query_ids': tf.io.FixedLenFeature([max_length], tf.int64),
879
- # 'positive_ids': tf.io.FixedLenFeature([max_length], tf.int64),
880
- # 'negative_ids': tf.io.FixedLenFeature([neg_samples * max_length], tf.int64),
881
- # }
882
- # parsed_features = tf.io.parse_single_example(example_proto, feature_description)
883
-
884
- # query_ids = tf.cast(parsed_features['query_ids'], tf.int32)
885
- # positive_ids = tf.cast(parsed_features['positive_ids'], tf.int32)
886
- # negative_ids = tf.cast(parsed_features['negative_ids'], tf.int32)
887
- # negative_ids = tf.reshape(negative_ids, [neg_samples, max_length])
888
-
889
- # return query_ids, positive_ids, negative_ids
890
-
891
- # def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
892
- # """Find hard negatives for a batch of queries with error handling and retries."""
893
- # retry_count = 0
894
- # total_responses = len(self.response_pool)
895
-
896
- # while retry_count < self.max_retries:
897
- # try:
898
- # query_embeddings = np.vstack([
899
- # self.query_embeddings_cache[q] for q in queries
900
- # ]).astype(np.float32)
901
-
902
- # query_embeddings = np.ascontiguousarray(query_embeddings)
903
- # faiss.normalize_L2(query_embeddings)
904
-
905
- # k = 1 # TODO: try higher k for better results
906
- # #logger.debug(f"Searching with k={k} among {total_responses} responses")
907
-
908
- # distances, indices = self.index.search(query_embeddings, k)
909
-
910
- # all_negatives = []
911
- # for query_indices, query, positive in zip(indices, queries, positives):
912
- # negatives = []
913
- # positive_strip = positive.strip()
914
- # seen = {positive_strip}
915
-
916
- # for idx in query_indices:
917
- # if idx >= 0 and idx < total_responses:
918
- # candidate = self.response_pool[idx].strip()
919
- # if candidate and candidate not in seen:
920
- # seen.add(candidate)
921
- # negatives.append(candidate)
922
- # if len(negatives) >= self.neg_samples:
923
- # break
924
-
925
- # # Pad with a special empty negative if necessary
926
- # while len(negatives) < self.neg_samples:
927
- # negatives.append("<EMPTY_NEGATIVE>") # Use a special token
928
-
929
- # all_negatives.append(negatives)
930
-
931
- # return all_negatives
 
11
  from typing import Union, Optional, List, Tuple, Generator
12
  from transformers import AutoTokenizer
13
  from typing import List, Tuple, Generator
14
+ from transformers import AutoTokenizer
15
  from gpu_monitor import GPUMemoryMonitor
16
 
17
  from logger_config import config_logger
 
32
  nlist: int = 100,
33
  max_retries: int = 3
34
  ):
 
35
  self.config = config
36
  self.tokenizer = tokenizer
37
  self.encoder = encoder
 
64
  dimension = self.query_embeddings_cache[next(iter(self.query_embeddings_cache))].shape[0]
65
  self.index.train(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
66
  self.index.add(np.array(list(self.query_embeddings_cache.values())).astype(np.float32))
 
67
 
68
  def save_embeddings_cache_hdf5(self, cache_file_path: str):
69
  """Save the embeddings cache to an HDF5 file."""
 
84
  logger.info(f"FAISS index saved to {index_file_path}")
85
 
86
  def load_faiss_index(self, index_file_path: str):
87
+ """Load the FAISS index from the specified file path."""
88
+ if os.path.exists(index_file_path):
89
+ self.index = faiss.read_index(index_file_path)
90
+ logger.info(f"FAISS index loaded from {index_file_path}.")
91
+ else:
92
+ logger.error(f"FAISS index file not found at {index_file_path}.")
93
+ raise FileNotFoundError(f"FAISS index file not found at {index_file_path}.")
94
+
95
+ def validate_faiss_index(self):
96
+ """Validates that the FAISS index has the correct dimensionality."""
97
+ expected_dim = self.encoder.config.embedding_dim
98
+ if self.index.d != expected_dim:
99
+ logger.error(f"FAISS index dimension {self.index.d} does not match encoder embedding dimension {expected_dim}.")
100
+ raise ValueError("FAISS index dimensionality mismatch.")
101
+ logger.info("FAISS index dimension validated successfully.")
102
 
103
  def save_tokenizer(self, tokenizer_dir: str):
104
  self.tokenizer.save_pretrained(tokenizer_dir)
 
107
  def load_tokenizer(self, tokenizer_dir: str):
108
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
109
  logger.info(f"Tokenizer loaded from {tokenizer_dir}")
 
110
 
111
  @staticmethod
112
  def load_json_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
 
171
 
172
  return pairs
173
 
174
+ def compute_and_index_response_embeddings(self):
175
  """
176
  Computes embeddings for the response pool and adds them to the FAISS index with progress bars.
177
  """
 
231
 
232
  # **Sanity Check:** Verify the number of embeddings in FAISS index
233
  logger.info(f"Total embeddings in FAISS index after addition: {self.index.ntotal}")
 
234
 
235
  def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
236
  """Find hard negatives for a batch of queries with error handling and retries."""
 
304
  if tf.config.list_physical_devices('GPU'):
305
  tf.keras.backend.clear_session()
306
 
307
+ def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> np.ndarray:
308
  """
309
+ Encode a query with optional conversation context into an embedding vector.
310
+
311
+ Args:
312
+ query (str): The user query.
313
+ context (Optional[List[Tuple[str, str]]]): Optional conversation history as a list of (user, assistant) tuples.
314
+
315
  Returns:
316
+ np.ndarray: The normalized embedding vector for the query.
 
 
317
  """
318
+ # Prepare query with context
319
+ if context:
320
+ context_str = ' '.join([
321
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {q} "
322
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
323
+ for q, r in context[-self.config.max_context_turns:]
324
+ ])
325
+ query = f"{context_str} {self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]}" \
326
+ f" {query}"
327
+ else:
328
+ query = f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
329
+
330
+ # Tokenize and encode
331
+ encodings = self.tokenizer(
332
+ [query],
333
+ padding='max_length',
334
  truncation=True,
335
  max_length=self.max_length,
336
+ return_tensors='np' # Use NumPy arrays for compatibility with FAISS
337
  )
338
+ input_ids = encodings['input_ids']
339
+
340
+ # Verify token IDs
341
+ max_id = np.max(input_ids)
342
+ new_vocab_size = len(self.tokenizer)
343
+
344
+ if max_id >= new_vocab_size:
345
+ logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
346
+ raise ValueError("Token ID exceeds vocabulary size.")
347
+
348
+ # Get embeddings from the shared encoder
349
+ embeddings = self.encoder(input_ids, training=False).numpy()
350
+
351
+ # Normalize embeddings for cosine similarity
352
+ faiss.normalize_L2(embeddings)
353
+
354
+ return embeddings[0] # Return as a 1D array
 
 
355
 
356
+ def encode_responses(self, responses: List[str], context: Optional[List[Tuple[str, str]]] = None) -> np.ndarray:
357
+ """
358
+ Encode a list of responses into embedding vectors.
 
359
 
360
+ Args:
361
+ responses (List[str]): List of response texts.
362
+ context (Optional[List[Tuple[str, str]]]): Optional conversation history as a list of (user, assistant) tuples.
363
 
364
+ Returns:
365
+ np.ndarray: Array of normalized embedding vectors.
366
+ """
367
+ # Prepare responses with context if provided
368
+ if context:
369
+ prepared_responses = []
370
+ for response in responses:
371
+ context_str = ' '.join([
372
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {q} "
373
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {r}"
374
+ for q, r in context[-self.config.max_context_turns:]
375
+ ])
376
+ full_response = f"{context_str} {self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {response}"
377
+ prepared_responses.append(full_response)
378
+ else:
379
+ prepared_responses = [
380
+ f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {resp}"
381
+ for resp in responses
382
+ ]
383
+
384
+ # Tokenize and encode
385
+ encodings = self.tokenizer(
386
+ prepared_responses,
387
+ padding='max_length',
388
+ truncation=True,
389
+ max_length=self.max_length,
390
+ return_tensors='np' # Use NumPy arrays for compatibility with FAISS
391
+ )
392
+ input_ids = encodings['input_ids']
393
+
394
+ # Verify token IDs
395
+ max_id = np.max(input_ids)
396
+ new_vocab_size = len(self.tokenizer)
397
+
398
+ if max_id >= new_vocab_size:
399
+ logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
400
+ raise ValueError("Token ID exceeds vocabulary size.")
401
+
402
+ # Get embeddings from the shared encoder
403
+ embeddings = self.encoder(input_ids, training=False).numpy()
404
+
405
+ # Normalize embeddings for cosine similarity
406
+ faiss.normalize_L2(embeddings)
407
+
408
+ return embeddings.astype('float32')
409
+
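A short retrieval sketch built on the new encoders; it assumes a TFDataPipeline instance (e.g. chatbot.data_pipeline) whose FAISS index and response_pool are already loaded, and the query text and top_k value are purely illustrative:

query_vec = pipeline.encode_query(
    "my package never arrived",
    context=[("hi", "hello, how can I help?")]
)
# FAISS expects a 2D float32 batch; encode_query returns a single normalized vector.
distances, indices = pipeline.index.search(query_vec.reshape(1, -1).astype('float32'), 5)
candidates = [pipeline.response_pool[i] for i in indices[0] if i >= 0]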
410
  def prepare_and_save_data(self, dialogues: List[dict], tf_record_path: str, batch_size: int = 32):
411
  """
412
  Processes dialogues in batches and saves to a TFRecord file using optimized batch tokenization and encoding.
 
522
  pbar.update(1)
523
 
524
  logger.info(f"Data preparation complete. TFRecord saved.")
525
 
526
  def _compute_embeddings(self, queries: List[str]) -> None:
527
  new_queries = [q for q in queries if q not in self.query_embeddings_cache]
 
565
  hard_negatives = self._find_hard_negatives_batch([query], [positive])[0]
566
  yield (query, positive, hard_negatives)
567
  pbar.update(1)
568
 
569
  def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
570
  """
 
592
 
593
  dataset = dataset.prefetch(tf.data.AUTOTUNE)
594
  return dataset
595
 
596
  def _tokenize_triple(
597
  self,
 
713
  n_ids = n_ids.astype(np.int32) # shape [batch_size, neg_samples, max_len]
714
 
715
  return q_ids, p_ids, n_ids
 
run_model_train.py → train_model.py RENAMED
@@ -1,36 +1,11 @@
1
  import tensorflow as tf
2
  from chatbot_model import RetrievalChatbot, ChatbotConfig
3
  from environment_setup import EnvironmentSetup
4
- from response_quality_checker import ResponseQualityChecker
5
- from chatbot_validator import ChatbotValidator
6
  from training_plotter import TrainingPlotter
7
 
8
- # Configure logging
9
  from logger_config import config_logger
10
  logger = config_logger(__name__)
11
 
12
- def run_interactive_chat(chatbot, quality_checker):
13
- """Separate function for interactive chat loop"""
14
- while True:
15
- user_input = input("You: ")
16
- if user_input.lower() in ['quit', 'exit', 'bye']:
17
- print("Assistant: Goodbye!")
18
- break
19
-
20
- response, candidates, metrics = chatbot.chat(
21
- query=user_input,
22
- conversation_history=None,
23
- quality_checker=quality_checker,
24
- top_k=5
25
- )
26
-
27
- print(f"Assistant: {response}")
28
-
29
- if metrics.get('is_confident', False):
30
- print("\nAlternative responses:")
31
- for resp, score in candidates[1:4]:
32
- print(f"Score: {score:.4f} - {resp}")
33
-
34
  def inspect_tfrecord(tfrecord_file_path, num_examples=3):
35
  def parse_example(example_proto):
36
  feature_description = {
@@ -53,7 +28,7 @@ def inspect_tfrecord(tfrecord_file_path, num_examples=3):
53
  def main():
54
 
55
  # Quick test to inspect TFRecord
56
- #inspect_tfrecord('training_data/training_data.tfrecord', num_examples=3)
57
 
58
  # Initialize environment
59
  tf.keras.backend.clear_session()
@@ -65,49 +40,40 @@ def main():
65
  TF_RECORD_FILE_PATH = 'training_data/training_data.tfrecord'
66
 
67
  # Optimize batch size for Colab
68
- batch_size = env.optimize_batch_size(base_batch_size=16)
69
-
70
 
71
- # Initialize configuration
72
- config = ChatbotConfig(
73
- embedding_dim=768, # DistilBERT
74
- max_context_token_limit=512,
75
- freeze_embeddings=False,
76
- )
77
 
78
  # Initialize chatbot
79
- #with env.strategy.scope():
80
  chatbot = RetrievalChatbot(config, mode='training')
81
- chatbot.build_models()
82
 
83
- if chatbot.mode == 'preparation':
84
- chatbot.verify_faiss_index()
85
-
86
- chatbot.train_streaming(
87
  tfrecord_file_path=TF_RECORD_FILE_PATH,
88
  epochs=EPOCHS,
89
  batch_size=batch_size,
90
  use_lr_schedule=True,
 
 
91
  )
92
 
93
  # Save final model
94
  model_save_path = env.training_dirs['base'] / 'final_model'
95
  chatbot.save_models(model_save_path)
96
 
97
- # Run automatic validation
98
- quality_checker = ResponseQualityChecker(chatbot=chatbot)
99
- validator = ChatbotValidator(chatbot, quality_checker)
100
- validation_metrics = validator.run_validation(num_examples=5)
101
- logger.info(f"Validation Metrics: {validation_metrics}")
102
-
103
  # Plot and save training history
104
  plotter = TrainingPlotter(save_dir=env.training_dirs['plots'])
105
  plotter.plot_training_history(chatbot.history)
106
- plotter.plot_validation_metrics(validation_metrics)
107
-
108
- # Run interactive chat
109
- logger.info("\nStarting interactive chat session...")
110
- run_interactive_chat(chatbot, quality_checker)
111
 
112
  if __name__ == "__main__":
113
  main()
 
1
  import tensorflow as tf
2
  from chatbot_model import RetrievalChatbot, ChatbotConfig
3
  from environment_setup import EnvironmentSetup
 
 
4
  from training_plotter import TrainingPlotter
5
 
 
6
  from logger_config import config_logger
7
  logger = config_logger(__name__)
8
 
9
  def inspect_tfrecord(tfrecord_file_path, num_examples=3):
10
  def parse_example(example_proto):
11
  feature_description = {
 
28
  def main():
29
 
30
  # Quick test to inspect TFRecord
31
+ # inspect_tfrecord('training_data/training_data.tfrecord', num_examples=3)
32
 
33
  # Initialize environment
34
  tf.keras.backend.clear_session()
 
40
  TF_RECORD_FILE_PATH = 'training_data/training_data.tfrecord'
41
 
42
  # Optimize batch size for Colab
43
+ batch_size = 32 # env.optimize_batch_size(base_batch_size=16)
 
44
 
45
+ # Initialize config
46
+ config = ChatbotConfig()
47
 
48
  # Initialize chatbot
 
49
  chatbot = RetrievalChatbot(config, mode='training')
 
50
 
51
+ # Load from a checkpoint
52
+ checkpoint_dir = 'checkpoints/'
53
+ latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
54
+ initial_epoch = 0
55
+ if latest_checkpoint:
56
+ ckpt_number = int(latest_checkpoint.split('ckpt-')[-1])
57
+ initial_epoch = ckpt_number
58
+ logger.info(f"Found checkpoint {latest_checkpoint}, resuming from epoch {initial_epoch}")
59
+
60
+ # Train the model
61
+ chatbot.train_model(
62
  tfrecord_file_path=TF_RECORD_FILE_PATH,
63
  epochs=EPOCHS,
64
  batch_size=batch_size,
65
  use_lr_schedule=True,
66
+ test_mode=False,
67
+ initial_epoch=initial_epoch
68
  )
69
 
70
  # Save final model
71
  model_save_path = env.training_dirs['base'] / 'final_model'
72
  chatbot.save_models(model_save_path)
73
 
74
  # Plot and save training history
75
  plotter = TrainingPlotter(save_dir=env.training_dirs['plots'])
76
  plotter.plot_training_history(chatbot.history)
 
 
 
 
 
77
 
78
  if __name__ == "__main__":
79
  main()
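The resume logic above assumes TensorFlow's default ckpt-N checkpoint naming; a small worked example of the parsing step (the checkpoint path is a hypothetical value, not from this commit):

# Hypothetical value returned by tf.train.latest_checkpoint('checkpoints/')
latest_checkpoint = 'checkpoints/ckpt-7'
initial_epoch = int(latest_checkpoint.split('ckpt-')[-1])   # -> 7, passed to train_model
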
validate_model.py ADDED
@@ -0,0 +1,117 @@
1
+ import os
2
+ import json
3
+ from chatbot_model import ChatbotConfig, RetrievalChatbot
4
+ from response_quality_checker import ResponseQualityChecker
5
+ from chatbot_validator import ChatbotValidator
6
+ from training_plotter import TrainingPlotter
7
+ from environment_setup import EnvironmentSetup
8
+
9
+ from logger_config import config_logger
10
+ logger = config_logger(__name__)
11
+
12
+ def run_interactive_chat(chatbot, quality_checker):
13
+ """Separate function for interactive chat loop"""
14
+ while True:
15
+ try:
16
+ user_input = input("You: ")
17
+ except (KeyboardInterrupt, EOFError):
18
+ print("\nAssistant: Goodbye!")
19
+ break
20
+
21
+ if user_input.lower() in ['quit', 'exit', 'bye']:
22
+ print("Assistant: Goodbye!")
23
+ break
24
+
25
+ response, candidates, metrics = chatbot.chat(
26
+ query=user_input,
27
+ conversation_history=None,
28
+ quality_checker=quality_checker,
29
+ top_k=5
30
+ )
31
+
32
+ print(f"Assistant: {response}")
33
+
34
+ if metrics.get('is_confident', False):
35
+ print("\nAlternative responses:")
36
+ for resp, score in candidates[1:4]:
37
+ print(f"Score: {score:.4f} - {resp}")
38
+ else:
39
+ print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")
40
+
41
+ # TODO:
42
+ def validate_chatbot():
43
+ # Initialize environment
44
+ env = EnvironmentSetup()
45
+ env.initialize()
46
+
47
+ MODEL_DIR = 'models'
48
+ FAISS_INDICES_DIR = os.path.join(MODEL_DIR, 'faiss_indices')
49
+ FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
50
+ FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_test.index')
51
+ RESPONSE_POOL_PRODUCTION_PATH = FAISS_INDEX_PRODUCTION_PATH.replace('.index', '_responses.json')
52
+ RESPONSE_POOL_TEST_PATH = FAISS_INDEX_TEST_PATH.replace('.index', '_responses.json')
53
+ ENVIRONMENT = 'production' # or 'test'
54
+ if ENVIRONMENT == 'test':
55
+ FAISS_INDEX_PATH = FAISS_INDEX_TEST_PATH
56
+ RESPONSE_POOL_PATH = RESPONSE_POOL_TEST_PATH
57
+ else:
58
+ FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
59
+ RESPONSE_POOL_PATH = RESPONSE_POOL_PRODUCTION_PATH
60
+
61
+ # Load config
62
+ config = ChatbotConfig()
63
+
64
+ # Initialize RetrievalChatbot in 'inference' mode
65
+ try:
66
+ chatbot = RetrievalChatbot(config=config, mode='inference')
67
+ logger.info("RetrievalChatbot initialized in 'inference' mode.")
68
+ except Exception as e:
69
+ logger.error(f"Failed to initialize RetrievalChatbot: {e}")
70
+ return
71
+
72
+ # Ensure FAISS index and response pool are accessible, then load
73
+ if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
74
+ logger.error("FAISS index or response pool file is missing.")
75
+ return
76
+
77
+ try:
78
+ chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
79
+ logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
80
+
81
+ with open(RESPONSE_POOL_PATH, 'r', encoding='utf-8') as f:
82
+ chatbot.data_pipeline.response_pool = json.load(f)
83
+ logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
84
+
85
+ chatbot.data_pipeline.validate_faiss_index()
86
+ logger.info("FAISS index and response pool validated successfully.")
87
+ except Exception as e:
88
+ logger.error(f"Failed to load FAISS index: {e}")
89
+ return
90
+
91
+ # Initialize ResponseQualityChecker and ChatbotValidator
92
+ quality_checker = ResponseQualityChecker(data_pipeline=chatbot.data_pipeline)
93
+ validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
94
+ logger.info("ResponseQualityChecker and ChatbotValidator initialized.")
95
+
96
+ # Run validation
97
+ try:
98
+ validation_metrics = validator.run_validation(num_examples=5)
99
+ logger.info(f"Validation Metrics: {validation_metrics}")
100
+ except Exception as e:
101
+ logger.error(f"Validation process failed: {e}")
102
+ return
103
+
104
+ # Plot validation_metrics
105
+ try:
106
+ plotter = TrainingPlotter(save_dir=env.training_dirs['plots'])
107
+ plotter.plot_validation_metrics(validation_metrics)
108
+ logger.info("Validation metrics plotted successfully.")
109
+ except Exception as e:
110
+ logger.error(f"Failed to plot validation metrics: {e}")
111
+
112
+ # Run interactive chat
113
+ logger.info("\nStarting interactive chat session...")
114
+ run_interactive_chat(chatbot, quality_checker)
115
+
116
+ if __name__ == '__main__':
117
+ validate_chatbot()