JoeArmani committed
Commit f7b283c · 1 Parent(s): 300fe5d
summarization, reranker, environment setup, and response quality checker
Browse files
- chatbot.py +0 -261
- chatbot2.py +0 -839
- chatbot3.py +0 -824
- chatbot4.py → chatbot_model.py +595 -629
- chatbot_validator.py +207 -0
- conversation_summarizer.py +147 -0
- cross_encoder_reranker.py +51 -0
- dialogue_augmenter.py +0 -1
- environment_setup.py +207 -0
- logger_config.py +10 -0
- requirements.txt +13 -1
- response_quality_checker.py +125 -119
- run_model.py +0 -162
- run_model2.py +0 -340
- run_model3.py +0 -434
- run_model4.py +0 -237
- run_model_train.py +96 -0
- setup.py +40 -4
- training_plotter.py +127 -0
chatbot.py
DELETED
@@ -1,261 +0,0 @@
import numpy as np
import tensorflow as tf
import keras
print(tf.__version__)
print(keras.__version__)
import spacy
import random
from tqdm import trange

class RetrievalChatbot:
    def __init__(
        self,
        vocab_size: int = 10000,
        max_sequence_length: int = 80,
        embedding_dim: int = 256,
        lstm_units: int = 256,
        num_attention_heads: int = 8,
        margin: float = 0.3
    ):
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.num_attention_heads = num_attention_heads
        self.margin = margin

        self.nlp = spacy.load('en_core_web_md')
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=vocab_size,
            oov_token="<OOV>"
        )

        self.query_encoder_model, self.response_encoder_model = self._build_encoders()

    def _positional_encoding(self, position: int, d_model: int) -> tf.Tensor:
        angles = np.arange(position)[:, np.newaxis] / np.power(
            10000,
            (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / d_model
        )
        sines = np.sin(angles[:, 0::2])
        cosines = np.cos(angles[:, 1::2])
        pos_encoding = np.concatenate([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def _build_single_encoder(self, name_prefix: str):
        input_layer = tf.keras.Input(shape=(self.max_sequence_length,), name=f"{name_prefix}_input")
        embedding = tf.keras.layers.Embedding(
            self.vocab_size,
            self.embedding_dim,
            mask_zero=True,
            name=f"{name_prefix}_embedding"
        )(input_layer)

        pos_encoding = self._positional_encoding(self.max_sequence_length, self.embedding_dim)
        x = embedding + pos_encoding

        # # Multi-head attention
        # attention_output = tf.keras.layers.MultiHeadAttention(
        #     num_heads=self.num_attention_heads,
        #     key_dim=self.embedding_dim // self.num_attention_heads
        # )(x, x)
        # x = tf.keras.layers.LayerNormalization()(x + attention_output)

        for i in range(2):
            lstm_out = tf.keras.layers.LSTM(
                self.lstm_units,
                return_sequences=True,
                kernel_regularizer=tf.keras.regularizers.l2(0.01),
                name=f"{name_prefix}_lstm_{i}"
            )(x)
            x = tf.keras.layers.LayerNormalization()(x + lstm_out)

        encoder_output = tf.keras.layers.LSTM(
            self.lstm_units,
            name=f"{name_prefix}_final_lstm"
        )(x)
        encoder_output = tf.keras.layers.Dropout(0.2)(encoder_output)
        encoder_output = tf.keras.layers.Lambda(lambda t: tf.nn.l2_normalize(t, axis=1))(encoder_output)

        return tf.keras.Model(input_layer, encoder_output, name=f"{name_prefix}_encoder")

    def _build_encoders(self):
        query_encoder = self._build_single_encoder("query")
        response_encoder = self._build_single_encoder("response")
        return query_encoder, response_encoder

    def _spacy_similarity(self, text1: str, text2: str) -> float:
        doc1 = self.nlp(text1)
        doc2 = self.nlp(text2)
        print('doc1:', doc1)
        print('doc2:', doc2)
        print('doc1.similarity(doc2):', doc1.similarity(doc2))
        return doc1.similarity(doc2)

    def prepare_dataset(self, dialogues: list, neg_samples_per_pos=3):
        # Create triplets: (query, positive, negative)
        response_pool = [
            turn['text'] for d in dialogues for turn in d['turns'] if turn['speaker'] == 'assistant'
        ]
        queries, positives, negatives = [], [], []

        for dialogue in dialogues:
            turns = dialogue['turns']
            for i in range(0, len(turns)-1):
                if turns[i]['speaker'] == 'user' and turns[i+1]['speaker'] == 'assistant':
                    q = turns[i]['text']
                    p = turns[i+1]['text']

                    # Find negatives using spaCy similarity
                    neg_candidates = []
                    attempts = 0
                    while len(neg_candidates) < neg_samples_per_pos and attempts < 200:
                        cand = random.choice(response_pool)
                        if cand != p:
                            sim = self._spacy_similarity(cand, p)
                            # Choose thresholds that produce hard negatives
                            if 0.4 < sim < 0.9:
                                neg_candidates.append(cand)
                        attempts += 1

                    if len(neg_candidates) == neg_samples_per_pos:
                        for neg in neg_candidates:
                            queries.append(q)
                            positives.append(p)
                            negatives.append(neg)

        # Fit tokenizer
        all_text = queries + positives + negatives
        self.tokenizer.fit_on_texts(all_text)

        def seq_pad(txts):
            seq = self.tokenizer.texts_to_sequences(txts)
            return tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=self.max_sequence_length, padding='post')

        q_pad = seq_pad(queries)
        p_pad = seq_pad(positives)
        n_pad = seq_pad(negatives)

        return q_pad, p_pad, n_pad

    def triplet_loss(self, q_emb, p_emb, n_emb):
        pos_dist = tf.reduce_sum(tf.square(q_emb - p_emb), axis=1)
        neg_dist = tf.reduce_sum(tf.square(q_emb - n_emb), axis=1)
        loss = tf.maximum(0.0, self.margin + pos_dist - neg_dist)
        return tf.reduce_mean(loss)

    def train_with_triplet_loss(
        self, q_pad, p_pad, n_pad,
        epochs=3,
        batch_size=16,
        validation_split=0.2,
        early_stopping_patience=3,
        use_tqdm=True
    ):
        train_losses = []
        val_losses = []

        total_samples = len(q_pad)
        idxs = np.arange(total_samples)
        np.random.shuffle(idxs)
        train_size = int((1 - validation_split) * total_samples)

        train_idxs = idxs[:train_size]
        val_idxs = idxs[train_size:]

        q_train, p_train, n_train = q_pad[train_idxs], p_pad[train_idxs], n_pad[train_idxs]
        q_val, p_val, n_val = q_pad[val_idxs], p_pad[val_idxs], n_pad[val_idxs]

        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        best_val_loss = float('inf')
        wait = 0

        for epoch in range(epochs):
            # Shuffle training data each epoch
            perm = np.random.permutation(len(q_train))
            q_train, p_train, n_train = q_train[perm], p_train[perm], n_train[perm]

            num_batches = len(q_train) // batch_size
            epoch_train_loss = 0.0

            batch_iter = range(num_batches)
            if use_tqdm:
                batch_iter = trange(num_batches, desc=f"Epoch {epoch+1}/{epochs}")

            for i in batch_iter:
                q_batch = q_train[i*batch_size:(i+1)*batch_size]
                p_batch = p_train[i*batch_size:(i+1)*batch_size]
                n_batch = n_train[i*batch_size:(i+1)*batch_size]

                with tf.GradientTape() as tape:
                    q_emb = self.query_encoder_model(q_batch, training=True)
                    p_emb = self.response_encoder_model(p_batch, training=True)
                    n_emb = self.response_encoder_model(n_batch, training=True)
                    loss = self.triplet_loss(q_emb, p_emb, n_emb)

                grads = tape.gradient(
                    loss,
                    self.query_encoder_model.trainable_variables +
                    self.response_encoder_model.trainable_variables
                )
                optimizer.apply_gradients(zip(
                    grads,
                    self.query_encoder_model.trainable_variables +
                    self.response_encoder_model.trainable_variables
                ))
                epoch_train_loss += loss.numpy()

            epoch_train_loss /= num_batches

            # Validation loss
            val_batches = len(q_val) // batch_size
            epoch_val_loss = 0.0
            for i in range(val_batches):
                q_batch = q_val[i*batch_size:(i+1)*batch_size]
                p_batch = p_val[i*batch_size:(i+1)*batch_size]
                n_batch = n_val[i*batch_size:(i+1)*batch_size]

                q_emb = self.query_encoder_model(q_batch, training=False)
                p_emb = self.response_encoder_model(p_batch, training=False)
                n_emb = self.response_encoder_model(n_batch, training=False)
                v_loss = self.triplet_loss(q_emb, p_emb, n_emb)
                epoch_val_loss += v_loss.numpy()

            if val_batches > 0:
                epoch_val_loss /= val_batches

            train_losses.append(epoch_train_loss)
            val_losses.append(epoch_val_loss)

            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

            # Early Stopping logic
            if epoch_val_loss < best_val_loss:
                best_val_loss = epoch_val_loss
                wait = 0
                # (Optional) Save best weights
            else:
                wait += 1
                if wait >= early_stopping_patience:
                    print("Early stopping triggered.")
                    break

        return train_losses, val_losses

    def encode_texts(self, texts, is_query=True):
        seq = self.tokenizer.texts_to_sequences(texts)
        pad_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=self.max_sequence_length, padding='post')
        if is_query:
            return self.query_encoder_model(pad_seq, training=False)
        else:
            return self.response_encoder_model(pad_seq, training=False)

    def retrieve_top_n(self, query: str, candidates: list, top_n=5):
        q_emb = self.encode_texts([query], is_query=True)  # shape (1, d)
        c_emb = self.encode_texts(candidates, is_query=False)  # shape (num_cand, d)
        sim = tf.matmul(q_emb, c_emb, transpose_b=True).numpy()[0]  # dot product similarity
        top_indices = np.argsort(sim)[::-1][:top_n]
        return [(candidates[i], sim[i]) for i in top_indices]
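For orientation, here is a minimal, hypothetical usage sketch of the RetrievalChatbot API deleted above; it is not part of the commit. `load_dialogues` and the file path are placeholders for whatever produces the `{"turns": [...]}` structure that `prepare_dataset` expects.

# Hypothetical usage of the deleted chatbot.py API -- illustration only.
# load_dialogues() is a placeholder; prepare_dataset expects a list of
# {"turns": [{"speaker": "user" | "assistant", "text": str}, ...]} dicts.
dialogues = load_dialogues("dialogues.json")

bot = RetrievalChatbot(vocab_size=10000, max_sequence_length=80)
q_pad, p_pad, n_pad = bot.prepare_dataset(dialogues, neg_samples_per_pos=3)
train_losses, val_losses = bot.train_with_triplet_loss(q_pad, p_pad, n_pad, epochs=3, batch_size=16)

# Rank candidate responses for a new query by dot-product similarity.
candidates = [t["text"] for d in dialogues for t in d["turns"] if t["speaker"] == "assistant"]
for response, score in bot.retrieve_top_n("Can you help me reset my password?", candidates, top_n=5):
    print(f"{score:.3f}  {response}")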
chatbot2.py
DELETED
@@ -1,839 +0,0 @@
import numpy as np
import tensorflow as tf
import spacy
import random
from typing import List, Tuple, Dict, Optional, Union
from dataclasses import dataclass
from tqdm import tqdm
import logging
from pathlib import Path
import json

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class ChatbotConfig:
    """Configuration for the retrieval chatbot."""
    vocab_size: int = 10000
    max_sequence_length: int = 512
    embedding_dim: int = 256
    encoder_units: int = 256
    num_attention_heads: int = 8
    dropout_rate: float = 0.2
    l2_reg_weight: float = 0.001
    margin: float = 0.3
    learning_rate: float = 0.001
    min_text_length: int = 3  # Reduced from 10 to allow shorter responses
    max_context_turns: int = 5
    warmup_steps: int = 200
    spacy_model: str = 'en_core_web_md'

    def to_dict(self) -> dict:
        """Convert config to dictionary."""
        return {k: str(v) if isinstance(v, Path) else v
                for k, v in self.__dict__.items()}

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ChatbotConfig':
        """Create config from dictionary."""
        return cls(**{k: v for k, v in config_dict.items()
                      if k in cls.__dataclass_fields__})

class TransformerBlock(tf.keras.layers.Layer):
    """Custom Transformer block with pre-layer normalization."""
    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        ff_dim: int,
        dropout: float = 0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout = dropout

        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim // num_heads,
            dropout=dropout
        )
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="gelu"),
            tf.keras.layers.Dense(embed_dim),
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, inputs: tf.Tensor, training: bool, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
        # Pre-layer normalization
        norm_inputs = self.layernorm1(inputs)

        # Self-attention
        attention_output = self.attention(
            query=norm_inputs,
            value=norm_inputs,
            key=norm_inputs,
            attention_mask=mask,
            training=training
        )
        attention_output = self.dropout1(attention_output, training=training)
        attention_output = inputs + attention_output

        # Feed-forward network
        norm_attention = self.layernorm2(attention_output)
        ffn_output = self.ffn(norm_attention)
        ffn_output = self.dropout2(ffn_output, training=training)

        return attention_output + ffn_output

    def get_config(self) -> dict:
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout": self.dropout,
        })
        return config

class EncoderModel(tf.keras.Model):
    """Dual encoder model with shared weights option."""
    def __init__(
        self,
        config: ChatbotConfig,
        name: str = "encoder",
        shared_weights: bool = False,
        **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self.config = config
        self.shared_weights = shared_weights

        # Input embedding layer
        self.embedding = tf.keras.layers.Embedding(
            config.vocab_size,
            config.embedding_dim,
            mask_zero=True,
            name=f"{name}_embedding"
        )

        # Positional encoding
        self.pos_encoding = self._get_positional_encoding()

        # Transformer blocks
        self.transformer_blocks = [
            TransformerBlock(
                config.embedding_dim,
                config.num_attention_heads,
                config.encoder_units * 4,
                config.dropout_rate,
                name=f"{name}_transformer_{i}"
            ) for i in range(3)
        ]

        # Final LSTM layer
        self.final_lstm = tf.keras.layers.LSTM(
            config.encoder_units,
            kernel_regularizer=tf.keras.regularizers.l2(config.l2_reg_weight),
            name=f"{name}_final_lstm"
        )

        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
        self.normalize = tf.keras.layers.Lambda(
            lambda x: tf.nn.l2_normalize(x, axis=1)
        )

    def _get_positional_encoding(self) -> tf.Tensor:
        """Generate positional encoding matrix."""
        pos = np.arange(self.config.max_sequence_length)[:, np.newaxis]
        i = np.arange(self.config.embedding_dim)[np.newaxis, :]
        angle = pos / np.power(10000, (2 * (i // 2)) / self.config.embedding_dim)

        pos_encoding = np.zeros_like(angle)
        pos_encoding[:, 0::2] = np.sin(angle[:, 0::2])
        pos_encoding[:, 1::2] = np.cos(angle[:, 1::2])

        return tf.cast(pos_encoding[np.newaxis, ...], dtype=tf.float32)

    def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Get input mask
        mask = self.embedding.compute_mask(inputs)
        mask = mask[:, tf.newaxis, tf.newaxis, :]  # Add attention dims

        # Embedding + positional encoding
        x = self.embedding(inputs)
        x = x + self.pos_encoding

        # Apply transformer blocks
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, training=training, mask=mask)

        # Final processing
        x = self.final_lstm(x)
        x = self.dropout(x, training=training)
        return self.normalize(x)

class RetrievalChatbot:
    """Professional implementation of a retrieval-based chatbot."""
    def __init__(self, config: ChatbotConfig):
        self.config = config
        self.nlp = spacy.load(config.spacy_model)

        # Initialize tokenizer
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=config.vocab_size,
            oov_token="<OOV>"
        )

        # Special tokens
        self.special_tokens = {
            "user": "<USER>",
            "assistant": "<ASSISTANT>",
            "context": "<CONTEXT>",
            "sep": "<SEP>"
        }

        # Build models
        self._build_models()

        # Training history
        self.history = {
            "train_loss": [],
            "val_loss": [],
            "train_metrics": {},
            "val_metrics": {}
        }

        # Initialize similarity cache
        self.similarity_cache = {}

    def _build_models(self):
        """Initialize the encoder models."""
        # Query encoder
        self.query_encoder = EncoderModel(
            self.config,
            name="query_encoder",
            shared_weights=False
        )

        # Response encoder (can share weights with query encoder)
        self.response_encoder = EncoderModel(
            self.config,
            name="response_encoder",
            shared_weights=False
        )

    def save_models(self, save_dir: Union[str, Path]):
        """Save models and configuration."""
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Save config
        with open(save_dir / "config.json", "w") as f:
            json.dump(self.config.to_dict(), f, indent=2)

        # Save models with proper extension
        self.query_encoder.save(save_dir / "query_encoder.keras")
        self.response_encoder.save(save_dir / "response_encoder.keras")

        # Save tokenizer config
        tokenizer_config = {
            "word_index": self.tokenizer.word_index,
            "word_counts": self.tokenizer.word_counts,
            "document_count": self.tokenizer.document_count,
            "index_docs": self.tokenizer.index_docs,
            "index_word": self.tokenizer.index_word
        }
        with open(save_dir / "tokenizer_config.json", "w") as f:
            json.dump(tokenizer_config, f)

    @classmethod
    def load_models(cls, load_dir: Union[str, Path]) -> 'RetrievalChatbot':
        """Load saved models and configuration."""
        load_dir = Path(load_dir)

        # Load config
        with open(load_dir / "config.json", "r") as f:
            config = ChatbotConfig.from_dict(json.load(f))

        # Initialize chatbot
        chatbot = cls(config)

        # Load models with proper extension
        chatbot.query_encoder = tf.keras.models.load_model(
            load_dir / "query_encoder.keras",
            custom_objects={"TransformerBlock": TransformerBlock}
        )
        chatbot.response_encoder = tf.keras.models.load_model(
            load_dir / "response_encoder.keras",
            custom_objects={"TransformerBlock": TransformerBlock}
        )

        # Load tokenizer config
        with open(load_dir / "tokenizer_config.json", "r") as f:
            tokenizer_config = json.load(f)

        chatbot.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=config.vocab_size,
            oov_token="<OOV>"
        )
        chatbot.tokenizer.word_index = tokenizer_config["word_index"]
        chatbot.tokenizer.word_counts = tokenizer_config["word_counts"]
        chatbot.tokenizer.document_count = tokenizer_config["document_count"]
        chatbot.tokenizer.index_docs = tokenizer_config["index_docs"]
        chatbot.tokenizer.index_word = tokenizer_config["index_word"]

        return chatbot

    def _improved_spacy_similarity(self, text1: str, text2: str) -> float:
        """Calculate semantic similarity between texts with preprocessing."""
        def preprocess(text: str) -> str:
            # Basic cleaning
            text = ' '.join(text.split())
            return text if text.strip() else "empty_document"

        # Get cache key
        cache_key = f"{hash(text1)}_{hash(text2)}"
        if cache_key in self.similarity_cache:
            return self.similarity_cache[cache_key]

        # Process texts
        text1, text2 = preprocess(text1), preprocess(text2)
        doc1, doc2 = self.nlp(text1), self.nlp(text2)

        # Calculate similarity
        if doc1.has_vector and doc2.has_vector:
            sim = doc1.similarity(doc2)
        else:
            # Fallback to token overlap similarity
            tokens1 = {t.lower_ for t in doc1 if not t.is_stop and not t.is_punct}
            tokens2 = {t.lower_ for t in doc2 if not t.is_stop and not t.is_punct}
            intersection = len(tokens1.intersection(tokens2))
            union = len(tokens1.union(tokens2))
            sim = intersection / union if union > 0 else 0.0

        # Cache result
        self.similarity_cache[cache_key] = sim
        return sim

    def _smart_negative_sampling(
        self,
        positive: str,
        response_pool: List[str],
        n_samples: int,
        max_attempts: int = 200,
        similarity_bounds: Tuple[float, float] = (0.3, 0.8),
        batch_size: int = 10
    ) -> List[str]:
        """Smart negative sampling with similarity bounds and batching."""
        candidates = []
        seen = set()
        attempts = 0

        while len(candidates) < n_samples and attempts < max_attempts:
            # Batch process candidates
            batch = random.sample(
                response_pool,
                min(batch_size, max_attempts - attempts)
            )

            for candidate in batch:
                if candidate != positive and candidate not in seen:
                    seen.add(candidate)
                    sim = self._improved_spacy_similarity(candidate, positive)

                    # Check similarity bounds
                    if similarity_bounds[0] < sim < similarity_bounds[1]:
                        candidates.append(candidate)
                        if len(candidates) == n_samples:
                            break

            attempts += len(batch)

        return candidates

    def train(
        self,
        q_pad: tf.Tensor,
        p_pad: tf.Tensor,
        n_pad: tf.Tensor,
        epochs: int = 3,
        batch_size: int = 32,
        validation_split: float = 0.2,
        checkpoint_dir: Optional[Union[str, Path]] = None
    ):
        """Train the model with improved training loop."""
        # Setup training
        total_samples = len(q_pad)
        train_size = int((1 - validation_split) * total_samples)

        # Split data
        indices = np.random.permutation(total_samples)
        train_idx, val_idx = indices[:train_size], indices[train_size:]

        train_data = (q_pad[train_idx], p_pad[train_idx], n_pad[train_idx])
        val_data = (q_pad[val_idx], p_pad[val_idx], n_pad[val_idx])

        # Setup optimizer with learning rate schedule
        steps_per_epoch = train_size // batch_size
        total_steps = steps_per_epoch * epochs

        lr_schedule = self._get_lr_schedule(
            total_steps,
            self.config.learning_rate,
            self.config.warmup_steps
        )

        optimizer = tf.keras.optimizers.Adam(lr_schedule)

        # Setup checkpointing
        if checkpoint_dir:
            checkpoint_dir = Path(checkpoint_dir)
            checkpoint_dir.mkdir(parents=True, exist_ok=True)

            # Setup checkpoint callback with correct file format
            checkpoint_template = str(checkpoint_dir / "model_epoch_{epoch:04d}.weights.h5")
            checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                checkpoint_template,
                save_weights_only=True,
                save_best_only=True,
                monitor='val_loss',
                mode='min',
                verbose=1
            )

        # Training loop
        best_val_loss = float('inf')
        patience = 5
        wait = 0

        for epoch in range(epochs):
            # Training
            train_loss = self._train_epoch(
                train_data,
                optimizer,
                batch_size,
                training=True
            )

            # Validation
            val_loss = self._train_epoch(
                val_data,
                optimizer,
                batch_size,
                training=False
            )

            # Update history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)

            logger.info(
                f"Epoch {epoch + 1}/{epochs} - "
                f"train_loss: {train_loss:.4f} - "
                f"val_loss: {val_loss:.4f}"
            )

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                wait = 0
                if checkpoint_dir:
                    self.save_models(checkpoint_dir / f"best_model")
            else:
                wait += 1
                if wait >= patience:
                    logger.info("Early stopping triggered")
                    break

    def _get_lr_schedule(
        self,
        total_steps: int,
        peak_lr: float,
        warmup_steps: int
    ) -> tf.keras.optimizers.schedules.LearningRateSchedule:
        """Enhanced learning rate schedule with better error handling and logging."""
        class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
            def __init__(
                self,
                total_steps: int,
                peak_lr: float,
                warmup_steps: int
            ):
                super().__init__()
                self.total_steps = tf.cast(total_steps, tf.float32)
                self.peak_lr = tf.cast(peak_lr, tf.float32)
                self.warmup_steps = tf.cast(max(1, warmup_steps), tf.float32)  # Prevent 0

                # Calculate and store constants
                self.initial_lr = self.peak_lr * 0.1  # Start at 10% of peak
                self.min_lr = self.peak_lr * 0.01  # Minimum 1% of peak

                logger.info(f"Learning rate schedule initialized:")
                logger.info(f"  Initial LR: {float(self.initial_lr):.6f}")
                logger.info(f"  Peak LR: {float(self.peak_lr):.6f}")
                logger.info(f"  Min LR: {float(self.min_lr):.6f}")
                logger.info(f"  Warmup steps: {int(self.warmup_steps)}")
                logger.info(f"  Total steps: {int(self.total_steps)}")

            def __call__(self, step):
                step = tf.cast(step, tf.float32)

                # Warmup phase
                warmup_factor = tf.minimum(1.0, step / self.warmup_steps)
                warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor

                # Decay phase
                decay_steps = tf.maximum(1.0, self.total_steps - self.warmup_steps)
                decay_factor = (step - self.warmup_steps) / decay_steps
                decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0)  # Clip to [0,1]

                cosine_decay = 0.5 * (1.0 + tf.cos(tf.constant(np.pi) * decay_factor))
                decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay

                # Choose between warmup and decay
                final_lr = tf.where(step < self.warmup_steps, warmup_lr, decay_lr)

                # Ensure learning rate is valid
                final_lr = tf.maximum(self.min_lr, final_lr)
                final_lr = tf.where(tf.math.is_finite(final_lr), final_lr, self.min_lr)

                return final_lr

            def get_config(self):
                return {
                    "total_steps": self.total_steps,
                    "peak_lr": self.peak_lr,
                    "warmup_steps": self.warmup_steps,
                }

        return CustomSchedule(total_steps, peak_lr, warmup_steps)

    @tf.function
    def _train_step(
        self,
        q_batch: tf.Tensor,
        p_batch: tf.Tensor,
        n_batch: tf.Tensor,
        optimizer: tf.keras.optimizers.Optimizer,
        training: bool = True
    ) -> tf.Tensor:
        """Single training step with triplet loss."""
        with tf.GradientTape() as tape:
            # Get embeddings
            q_emb = self.query_encoder(q_batch, training=training)
            p_emb = self.response_encoder(p_batch, training=training)
            n_emb = self.response_encoder(n_batch, training=training)

            # Calculate triplet loss
            pos_dist = tf.reduce_sum(tf.square(q_emb - p_emb), axis=1)
            neg_dist = tf.reduce_sum(tf.square(q_emb - n_emb), axis=1)

            loss = tf.maximum(0.0, self.config.margin + pos_dist - neg_dist)
            loss = tf.reduce_mean(loss)

        if training:
            # Apply gradients
            gradients = tape.gradient(
                loss,
                self.query_encoder.trainable_variables +
                self.response_encoder.trainable_variables
            )
            optimizer.apply_gradients(zip(
                gradients,
                self.query_encoder.trainable_variables +
                self.response_encoder.trainable_variables
            ))

        return loss

    def _train_epoch(
        self,
        data: Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
        optimizer: tf.keras.optimizers.Optimizer,
        batch_size: int,
        training: bool = True
    ) -> float:
        """Train for one epoch with enhanced logging and progress tracking."""
        q_data, p_data, n_data = data
        total_loss = 0
        num_batches = len(q_data) // batch_size

        # Log current learning rate at start of epoch
        if training:
            if hasattr(optimizer.learning_rate, '__call__'):
                current_lr = optimizer.learning_rate(optimizer.iterations)
            else:
                current_lr = optimizer.learning_rate
            logger.info(f"Current learning rate: {float(current_lr):.6f}")

        # Shuffle data
        indices = np.random.permutation(len(q_data))
        q_data = q_data[indices]
        p_data = p_data[indices]
        n_data = n_data[indices]

        # Create progress bar
        mode = "Training" if training else "Validation"
        pbar = tqdm(
            total=num_batches,
            desc=f"{mode} batches",
            unit="batch",
            dynamic_ncols=True  # Automatically adjust width
        )

        # Process batches
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = start_idx + batch_size

            batch_loss = self._train_step(
                q_data[start_idx:end_idx],
                p_data[start_idx:end_idx],
                n_data[start_idx:end_idx],
                optimizer,
                training
            )
            total_loss += batch_loss

            # Update progress bar with current loss
            avg_loss = total_loss / (i + 1)
            pbar.set_postfix({
                'loss': f'{avg_loss:.4f}',
                'lr': f'{float(current_lr):.6f}' if training else 'N/A'
            })
            pbar.update(1)

        pbar.close()
        return total_loss / num_batches if num_batches > 0 else 0

    def _prepare_sequences(
        self,
        queries: List[str],
        positives: List[str],
        negatives: List[str]
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Enhanced sequence preparation with logging and text preprocessing."""
        logger.info("Preparing sequences...")

        # Text cleaning function from old version
        def clean_text(text: str) -> str:
            # Remove excessive whitespace
            text = ' '.join(text.split())
            # Remove very long repetitive sequences
            if len(text) > 500:  # Add length limit
                text = ' '.join(dict.fromkeys(text.split()))
            return text

        # Process texts with special tokens and cleaning
        queries = [f"{self.special_tokens['user']} {clean_text(q)}" for q in queries]
        positives = [f"{self.special_tokens['assistant']} {clean_text(p)}" for p in positives]
        negatives = [f"{self.special_tokens['assistant']} {clean_text(n)}" for n in negatives]

        # Fit tokenizer and log vocabulary statistics
        all_texts = queries + positives + negatives
        self.tokenizer.fit_on_texts(all_texts)

        # Log vocabulary statistics
        vocab_size = len(self.tokenizer.word_index)
        logger.info(f"Vocabulary statistics:")
        logger.info(f"  Total unique tokens: {vocab_size}")
        logger.info(f"  Vocab limit: {self.config.vocab_size}")

        # Log most common tokens
        word_freq = sorted(
            self.tokenizer.word_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )[:10]
        logger.info("Most common tokens:")
        for word, freq in word_freq:
            logger.info(f"  {word}: {freq}")

        # Padding function from old version
        def pad_sequences(texts: List[str]) -> tf.Tensor:
            sequences = self.tokenizer.texts_to_sequences(texts)
            return tf.keras.preprocessing.sequence.pad_sequences(
                sequences,
                maxlen=self.config.max_sequence_length,
                padding='post',
                truncating='post'
            )

        # Return padded sequences
        return (
            pad_sequences(queries),
            pad_sequences(positives),
            pad_sequences(negatives)
        )

    def prepare_dataset(
        self,
        dialogues: List[dict],
        neg_samples_per_pos: int = 3,
        debug_samples: Optional[int] = None
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Prepare dataset with enhanced logging and statistics."""
        logger.info("Preparing dataset...")

        # Log dataset statistics
        total_dialogues = len(dialogues)
        total_turns = sum(len(d['turns']) for d in dialogues)
        avg_turns = total_turns / total_dialogues

        logger.info(f"Dataset statistics:")
        logger.info(f"  Total dialogues: {total_dialogues}")
        logger.info(f"  Total turns: {total_turns}")
        logger.info(f"  Average turns per dialogue: {avg_turns:.2f}")

        # Extract and filter responses with logging
        response_pool = []
        skipped_short = 0
        skipped_long = 0

        for d in dialogues:
            for turn in d['turns']:
                if turn['speaker'] == 'assistant':
                    text = turn['text'].strip()
                    length = len(text.split())
                    if length < self.config.min_text_length:
                        skipped_short += 1
                        continue
                    if length > self.config.max_sequence_length:
                        skipped_long += 1
                        continue
                    response_pool.append(text)

        logger.info(f"Response pool statistics:")
        logger.info(f"  Total responses: {len(response_pool)}")
        logger.info(f"  Skipped (too short): {skipped_short}")
        logger.info(f"  Skipped (too long): {skipped_long}")

        # Process dialogues and create training examples
        queries, positives, negatives = [], [], []

        for dialogue in tqdm(dialogues, desc="Processing dialogues"):
            turns = dialogue['turns']
            for i in range(len(turns) - 1):
                if turns[i]['speaker'] == 'user' and turns[i+1]['speaker'] == 'assistant':
                    query = turns[i]['text'].strip()
                    positive = turns[i+1]['text'].strip()

                    # Skip short texts
                    if (len(query.split()) < self.config.min_text_length or
                        len(positive.split()) < self.config.min_text_length):  # Fixed
                        continue

                    # Get negative samples
                    neg_samples = self._smart_negative_sampling(
                        positive,
                        response_pool,
                        neg_samples_per_pos
                    )

                    if len(neg_samples) == neg_samples_per_pos:
                        for neg in neg_samples:
                            queries.append(query)
                            positives.append(positive)
                            negatives.append(neg)

        # Log final dataset statistics
        logger.info(f"Final dataset statistics:")
        logger.info(f"  Training examples: {len(queries)}")
        logger.info(f"  Unique queries: {len(set(queries))}")
        logger.info(f"  Unique responses: {len(set(positives))}")

        return self._prepare_sequences(queries, positives, negatives)

    def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
        """Encode a query with optional conversation context."""
        # Prepare query with context
        if context:
            context_str = ' '.join([
                f"{self.special_tokens['user']} {q} "
                f"{self.special_tokens['assistant']} {r}"
                for q, r in context[-self.config.max_context_turns:]
            ])
            query = f"{context_str} {self.special_tokens['user']} {query}"
        else:
            query = f"{self.special_tokens['user']} {query}"

        # Tokenize and pad
        seq = self.tokenizer.texts_to_sequences([query])
        padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
            seq,
            maxlen=self.config.max_sequence_length,
            padding='post',
            truncating='post'
        )

        return self.query_encoder(padded_seq, training=False)

    def encode_responses(self, responses: List[str]) -> tf.Tensor:
        """Encode a batch of responses."""
        # Prepare responses
        responses = [
            f"{self.special_tokens['assistant']} {r}"
            for r in responses
        ]

        # Tokenize and pad
        sequences = self.tokenizer.texts_to_sequences(responses)
        padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
            sequences,
            maxlen=self.config.max_sequence_length,
            padding='post',
            truncating='post'
        )

        return self.response_encoder(padded_sequences, training=False)

    def retrieve_responses(
        self,
        query: str,
        candidates: List[str],
        context: Optional[List[Tuple[str, str]]] = None,
        top_k: int = 5
    ) -> List[Tuple[str, float]]:
        """Retrieve top-k responses for a query."""
        # Encode query and candidates
        q_emb = self.encode_query(query, context)
        c_emb = self.encode_responses(candidates)

        # Calculate similarities
        similarities = tf.matmul(q_emb, c_emb, transpose_b=True).numpy()[0]

        # Get top-k responses
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [(candidates[i], similarities[i]) for i in top_indices]

    def chat(
        self,
        query: str,
        response_pool: List[str],
        conversation_history: Optional[List[Tuple[str, str]]] = None,
        top_k: int = 5
    ) -> Tuple[str, List[Tuple[str, float]]]:
        """Interactive chat with response selection."""
        # Get responses with scores
        responses = self.retrieve_responses(
            query,
            response_pool,
            conversation_history,
            top_k
        )

        # Return best response and all candidates with scores
        return responses[0][0], responses
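The CustomSchedule in the deleted file above ramps linearly from 10% of the peak learning rate to the peak over warmup_steps, then follows a cosine decay down to 1% of the peak. Below is a small, self-contained NumPy sketch of the same shape, for illustration only; it is not part of the commit, and the step counts in the demo loop are arbitrary.

import numpy as np

def lr_at(step, total_steps=1000, peak_lr=1e-3, warmup_steps=200):
    """Warmup-then-cosine schedule mirroring the CustomSchedule logic above (illustration only)."""
    initial_lr, min_lr = peak_lr * 0.1, peak_lr * 0.01
    if step < warmup_steps:
        # Linear warmup from 10% of peak up to the peak learning rate.
        return initial_lr + (peak_lr - initial_lr) * (step / warmup_steps)
    # Cosine decay from the peak down to 1% of the peak over the remaining steps.
    decay = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    decay = min(max(decay, 0.0), 1.0)
    cosine = 0.5 * (1.0 + np.cos(np.pi * decay))
    return min_lr + (peak_lr - min_lr) * cosine

for s in (0, 100, 200, 600, 1000):
    print(s, round(lr_at(s), 6))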
chatbot3.py
DELETED
@@ -1,824 +0,0 @@
|
|
1 |
-
from transformers import TFAutoModel, AutoTokenizer
|
2 |
-
import tensorflow as tf
|
3 |
-
import numpy as np
|
4 |
-
from typing import List, Tuple, Dict, Optional, Union
|
5 |
-
from dataclasses import dataclass
|
6 |
-
import logging
|
7 |
-
import spacy
|
8 |
-
import random
|
9 |
-
import json
|
10 |
-
from tqdm import tqdm
|
11 |
-
from pathlib import Path
|
12 |
-
|
13 |
-
# Configure logging
|
14 |
-
logging.basicConfig(
|
15 |
-
level=logging.INFO,
|
16 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
17 |
-
)
|
18 |
-
logger = logging.getLogger(__name__)
|
19 |
-
|
20 |
-
@dataclass
|
21 |
-
class ChatbotConfig:
|
22 |
-
"""Enhanced configuration with pretrained model settings."""
|
23 |
-
vocab_size: int = 10000
|
24 |
-
max_sequence_length: int = 512
|
25 |
-
embedding_dim: int = 768 # Match DistilBERT's dimension
|
26 |
-
encoder_units: int = 256
|
27 |
-
num_attention_heads: int = 8
|
28 |
-
dropout_rate: float = 0.2
|
29 |
-
l2_reg_weight: float = 0.001
|
30 |
-
margin: float = 0.3
|
31 |
-
learning_rate: float = 0.001
|
32 |
-
min_text_length: int = 3
|
33 |
-
max_context_turns: int = 5
|
34 |
-
warmup_steps: int = 200
|
35 |
-
pretrained_model: str = 'distilbert-base-uncased'
|
36 |
-
freeze_embeddings: bool = True
|
37 |
-
spacy_model: str = 'en_core_web_md'
|
38 |
-
|
39 |
-
def to_dict(self) -> dict:
|
40 |
-
"""Convert config to dictionary."""
|
41 |
-
return {k: str(v) if isinstance(v, Path) else v
|
42 |
-
for k, v in self.__dict__.items()}
|
43 |
-
|
44 |
-
@classmethod
|
45 |
-
def from_dict(cls, config_dict: dict) -> 'ChatbotConfig':
|
46 |
-
"""Create config from dictionary."""
|
47 |
-
return cls(**{k: v for k, v in config_dict.items()
|
48 |
-
if k in cls.__dataclass_fields__})
|
49 |
-
|
50 |
-
class TransformerBlock(tf.keras.layers.Layer):
|
51 |
-
"""Custom Transformer block with pre-layer normalization."""
|
52 |
-
def __init__(
|
53 |
-
self,
|
54 |
-
embed_dim: int,
|
55 |
-
num_heads: int,
|
56 |
-
ff_dim: int,
|
57 |
-
dropout: float = 0.1,
|
58 |
-
**kwargs
|
59 |
-
):
|
60 |
-
super().__init__(**kwargs)
|
61 |
-
self.embed_dim = embed_dim
|
62 |
-
self.num_heads = num_heads
|
63 |
-
self.ff_dim = ff_dim
|
64 |
-
self.dropout = dropout
|
65 |
-
|
66 |
-
self.attention = tf.keras.layers.MultiHeadAttention(
|
67 |
-
num_heads=num_heads,
|
68 |
-
key_dim=embed_dim // num_heads,
|
69 |
-
dropout=dropout
|
70 |
-
)
|
71 |
-
self.ffn = tf.keras.Sequential([
|
72 |
-
tf.keras.layers.Dense(ff_dim, activation="gelu"),
|
73 |
-
tf.keras.layers.Dense(embed_dim),
|
74 |
-
])
|
75 |
-
|
76 |
-
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
|
77 |
-
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
|
78 |
-
self.dropout1 = tf.keras.layers.Dropout(dropout)
|
79 |
-
self.dropout2 = tf.keras.layers.Dropout(dropout)
|
80 |
-
|
81 |
-
def call(self, inputs: tf.Tensor, training: bool, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
|
82 |
-
# Pre-layer normalization
|
83 |
-
norm_inputs = self.layernorm1(inputs)
|
84 |
-
|
85 |
-
# Self-attention
|
86 |
-
attention_output = self.attention(
|
87 |
-
query=norm_inputs,
|
88 |
-
value=norm_inputs,
|
89 |
-
key=norm_inputs,
|
90 |
-
attention_mask=mask,
|
91 |
-
training=training
|
92 |
-
)
|
93 |
-
attention_output = self.dropout1(attention_output, training=training)
|
94 |
-
attention_output = inputs + attention_output
|
95 |
-
|
96 |
-
# Feed-forward network
|
97 |
-
norm_attention = self.layernorm2(attention_output)
|
98 |
-
ffn_output = self.ffn(norm_attention)
|
99 |
-
ffn_output = self.dropout2(ffn_output, training=training)
|
100 |
-
|
101 |
-
return attention_output + ffn_output
|
102 |
-
|
103 |
-
def get_config(self) -> dict:
|
104 |
-
config = super().get_config()
|
105 |
-
config.update({
|
106 |
-
"embed_dim": self.embed_dim,
|
107 |
-
"num_heads": self.num_heads,
|
108 |
-
"ff_dim": self.ff_dim,
|
109 |
-
"dropout": self.dropout,
|
110 |
-
})
|
111 |
-
return config
|
112 |
-
|
113 |
-
class EncoderModel(tf.keras.Model):
|
114 |
-
"""Dual encoder model with pretrained embeddings."""
|
115 |
-
def __init__(
|
116 |
-
self,
|
117 |
-
config: ChatbotConfig,
|
118 |
-
name: str = "encoder",
|
119 |
-
shared_weights: bool = False,
|
120 |
-
**kwargs
|
121 |
-
):
|
122 |
-
super().__init__(name=name, **kwargs)
|
123 |
-
self.config = config
|
124 |
-
self.shared_weights = shared_weights
|
125 |
-
|
126 |
-
# Load pretrained model and tokenizer
|
127 |
-
self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)
|
128 |
-
|
129 |
-
# Freeze pretrained weights if specified
|
130 |
-
if config.freeze_embeddings:
|
131 |
-
self.pretrained.trainable = False
|
132 |
-
|
133 |
-
# Transformer blocks for additional processing
|
134 |
-
self.transformer_blocks = [
|
135 |
-
TransformerBlock(
|
136 |
-
config.embedding_dim,
|
137 |
-
config.num_attention_heads,
|
138 |
-
config.encoder_units * 4,
|
139 |
-
config.dropout_rate,
|
140 |
-
name=f"{name}_transformer_{i}"
|
141 |
-
) for i in range(2) # Reduced number of blocks since we're using pretrained
|
142 |
-
]
|
143 |
-
|
144 |
-
# Final LSTM layer
|
145 |
-
self.final_lstm = tf.keras.layers.LSTM(
|
146 |
-
config.encoder_units,
|
147 |
-
kernel_regularizer=tf.keras.regularizers.l2(config.l2_reg_weight),
|
148 |
-
name=f"{name}_final_lstm"
|
149 |
-
)
|
150 |
-
|
151 |
-
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
152 |
-
self.normalize = tf.keras.layers.Lambda(
|
153 |
-
lambda x: tf.nn.l2_normalize(x, axis=1)
|
154 |
-
)
|
155 |
-
|
156 |
-
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
|
157 |
-
# Get pretrained embeddings
|
158 |
-
pretrained_outputs = self.pretrained(inputs, training=training)
|
159 |
-
x = pretrained_outputs.last_hidden_state
|
160 |
-
|
161 |
-
# Get attention mask from input
|
162 |
-
attention_mask = tf.cast(tf.not_equal(inputs, 0), tf.float32)
|
163 |
-
attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
|
164 |
-
|
165 |
-
# Apply transformer blocks
|
166 |
-
for transformer_block in self.transformer_blocks:
|
167 |
-
x = transformer_block(x, training=training, mask=attention_mask)
|
168 |
-
|
169 |
-
# Final processing
|
170 |
-
x = self.final_lstm(x)
|
171 |
-
x = self.dropout(x, training=training)
|
172 |
-
return self.normalize(x)
|
173 |
-
|
174 |
-
class RetrievalChatbot:
|
175 |
-
"""Modified chatbot using pretrained embeddings with full functionality."""
|
176 |
-
def __init__(self, config: ChatbotConfig):
|
177 |
-
self.config = config
|
178 |
-
self.nlp = spacy.load(config.spacy_model)
|
179 |
-
|
180 |
-
# Use HuggingFace tokenizer instead of Keras
|
181 |
-
self.tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
|
182 |
-
|
183 |
-
# Special tokens
|
184 |
-
self.special_tokens = {
|
185 |
-
"user": "<USER>",
|
186 |
-
"assistant": "<ASSISTANT>",
|
187 |
-
"context": "<CONTEXT>",
|
188 |
-
"sep": "<SEP>"
|
189 |
-
}
|
190 |
-
|
191 |
-
# Add special tokens to tokenizer
|
192 |
-
self.tokenizer.add_special_tokens(
|
193 |
-
{'additional_special_tokens': list(self.special_tokens.values())}
|
194 |
-
)
|
195 |
-
|
196 |
-
# Build models
|
197 |
-
self._build_models()
|
198 |
-
|
199 |
-
# Initialize training tracking
|
200 |
-
self.history = {
|
201 |
-
"train_loss": [],
|
202 |
-
"val_loss": [],
|
203 |
-
"train_metrics": {},
|
204 |
-
"val_metrics": {}
|
205 |
-
}
|
206 |
-
|
207 |
-
self.similarity_cache = {}
|
208 |
-
|
209 |
-
def _build_models(self):
|
210 |
-
"""Initialize the encoder models."""
|
211 |
-
# Query encoder
|
212 |
-
self.query_encoder = EncoderModel(
|
213 |
-
self.config,
|
214 |
-
name="query_encoder",
|
215 |
-
shared_weights=False
|
216 |
-
)
|
217 |
-
|
218 |
-
# Response encoder (can share weights with query encoder)
|
219 |
-
self.response_encoder = EncoderModel(
|
220 |
-
self.config,
|
221 |
-
name="response_encoder",
|
222 |
-
shared_weights=False
|
223 |
-
)
|
224 |
-
|
225 |
-
# Resize token embeddings to match the tokenizer's vocab size
|
226 |
-
new_vocab_size = len(self.tokenizer)
|
227 |
-
self.query_encoder.pretrained.resize_token_embeddings(new_vocab_size)
|
228 |
-
self.response_encoder.pretrained.resize_token_embeddings(new_vocab_size)
|
229 |
-
|
230 |
-
def save_models(self, save_dir: Union[str, Path]):
|
231 |
-
"""Save models and configuration."""
|
232 |
-
save_dir = Path(save_dir)
|
233 |
-
save_dir.mkdir(parents=True, exist_ok=True)
|
234 |
-
|
235 |
-
# Save config
|
236 |
-
with open(save_dir / "config.json", "w") as f:
|
237 |
-
json.dump(self.config.to_dict(), f, indent=2)
|
238 |
-
|
239 |
-
# Save models
|
240 |
-
self.query_encoder.pretrained.save_pretrained(save_dir / "query_encoder")
|
241 |
-
self.response_encoder.pretrained.save_pretrained(save_dir / "response_encoder")
|
242 |
-
|
243 |
-
# Save tokenizer
|
244 |
-
self.tokenizer.save_pretrained(save_dir / "tokenizer")
|
245 |
-
|
246 |
-
@classmethod
|
247 |
-
def load_models(cls, load_dir: Union[str, Path]) -> 'RetrievalChatbot':
|
248 |
-
"""Load saved models and configuration."""
|
249 |
-
load_dir = Path(load_dir)
|
250 |
-
|
251 |
-
# Load config
|
252 |
-
with open(load_dir / "config.json", "r") as f:
|
253 |
-
config = ChatbotConfig.from_dict(json.load(f))
|
254 |
-
|
255 |
-
# Initialize chatbot
|
256 |
-
chatbot = cls(config)
|
257 |
-
|
258 |
-
# Load models
|
259 |
-
chatbot.query_encoder.pretrained = TFAutoModel.from_pretrained(
|
260 |
-
load_dir / "query_encoder",
|
261 |
-
config=config
|
262 |
-
)
|
263 |
-
chatbot.response_encoder.pretrained = TFAutoModel.from_pretrained(
|
264 |
-
load_dir / "response_encoder",
|
265 |
-
config=config
|
266 |
-
)
|
267 |
-
|
268 |
-
# Load tokenizer
|
269 |
-
chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
|
270 |
-
|
271 |
-
return chatbot
|
272 |
-
|
-    def _improved_spacy_similarity(self, text1: str, text2: str) -> float:
-        """Calculate semantic similarity between texts with preprocessing."""
-        def preprocess(text: str) -> str:
-            # Basic cleaning
-            text = ' '.join(text.split())
-            return text if text.strip() else "empty_document"
-
-        # Get cache key
-        cache_key = f"{hash(text1)}_{hash(text2)}"
-        if cache_key in self.similarity_cache:
-            return self.similarity_cache[cache_key]
-
-        # Process texts
-        text1, text2 = preprocess(text1), preprocess(text2)
-        doc1, doc2 = self.nlp(text1), self.nlp(text2)
-
-        # Calculate similarity
-        if doc1.has_vector and doc2.has_vector:
-            sim = doc1.similarity(doc2)
-        else:
-            # Fallback to token overlap similarity
-            tokens1 = {t.lower_ for t in doc1 if not t.is_stop and not t.is_punct}
-            tokens2 = {t.lower_ for t in doc2 if not t.is_stop and not t.is_punct}
-            intersection = len(tokens1.intersection(tokens2))
-            union = len(tokens1.union(tokens2))
-            sim = intersection / union if union > 0 else 0.0
-
-        # Cache result
-        self.similarity_cache[cache_key] = sim
-        return sim
-
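Illustrative aside, not part of this commit: the method above combines spaCy vector similarity with a token-overlap fallback. A minimal standalone sketch of that two-tier idea follows, assuming the en_core_web_md model is installed; the function name text_similarity and the sample sentences are made up for the example.

import spacy

nlp = spacy.load("en_core_web_md")

def text_similarity(a: str, b: str) -> float:
    # Prefer vector similarity; fall back to token-overlap (Jaccard) when vectors are missing.
    doc_a, doc_b = nlp(" ".join(a.split())), nlp(" ".join(b.split()))
    if doc_a.has_vector and doc_b.has_vector:
        return doc_a.similarity(doc_b)
    tokens_a = {t.lower_ for t in doc_a if not t.is_stop and not t.is_punct}
    tokens_b = {t.lower_ for t in doc_b if not t.is_stop and not t.is_punct}
    union = tokens_a | tokens_b
    return len(tokens_a & tokens_b) / len(union) if union else 0.0

print(text_similarity("book a table for two", "reserve a table for two people"))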
304 |
-
def prepare_dataset(
|
305 |
-
self,
|
306 |
-
dialogues: List[dict],
|
307 |
-
neg_samples_per_pos: int = 3,
|
308 |
-
debug_samples: Optional[int] = None
|
309 |
-
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
310 |
-
"""Prepare dataset with enhanced logging and statistics."""
|
311 |
-
logger.info("Preparing dataset...")
|
312 |
-
|
313 |
-
# Apply debug_samples limit if specified
|
314 |
-
if debug_samples is not None:
|
315 |
-
dialogues = dialogues[:debug_samples]
|
316 |
-
logger.info(f"Debug mode: Limited to {debug_samples} dialogues")
|
317 |
-
|
318 |
-
# Log dataset statistics
|
319 |
-
total_dialogues = len(dialogues)
|
320 |
-
total_turns = sum(len(d['turns']) for d in dialogues)
|
321 |
-
avg_turns = total_turns / total_dialogues if total_dialogues > 0 else 0
|
322 |
-
|
323 |
-
logger.info(f"Dataset statistics:")
|
324 |
-
logger.info(f" Total dialogues: {total_dialogues}")
|
325 |
-
logger.info(f" Total turns: {total_turns}")
|
326 |
-
logger.info(f" Average turns per dialogue: {avg_turns:.2f}")
|
327 |
-
|
328 |
-
# Extract and filter responses with logging
|
329 |
-
response_pool = []
|
330 |
-
skipped_short = 0
|
331 |
-
skipped_long = 0
|
332 |
-
|
333 |
-
for d in dialogues:
|
334 |
-
for turn in d['turns']:
|
335 |
-
if turn.get('speaker') == 'assistant' and 'text' in turn:
|
336 |
-
text = turn['text'].strip()
|
337 |
-
length = len(text.split())
|
338 |
-
if length < self.config.min_text_length:
|
339 |
-
skipped_short += 1
|
340 |
-
continue
|
341 |
-
if length > self.config.max_sequence_length:
|
342 |
-
skipped_long += 1
|
343 |
-
continue
|
344 |
-
response_pool.append(text)
|
345 |
-
|
346 |
-
logger.info(f"Response pool statistics:")
|
347 |
-
logger.info(f" Total responses: {len(response_pool)}")
|
348 |
-
logger.info(f" Skipped (too short): {skipped_short}")
|
349 |
-
logger.info(f" Skipped (too long): {skipped_long}")
|
350 |
-
|
351 |
-
# Process dialogues and create training examples
|
352 |
-
queries, positives, negatives = [], [], []
|
353 |
-
|
354 |
-
for dialogue in tqdm(dialogues, desc="Processing dialogues"):
|
355 |
-
turns = dialogue.get('turns', [])
|
356 |
-
for i in range(len(turns) - 1):
|
357 |
-
current_turn = turns[i]
|
358 |
-
next_turn = turns[i+1]
|
359 |
-
|
360 |
-
if (current_turn.get('speaker') == 'user' and
|
361 |
-
next_turn.get('speaker') == 'assistant' and
|
362 |
-
'text' in current_turn and
|
363 |
-
'text' in next_turn):
|
364 |
-
|
365 |
-
query = current_turn['text'].strip()
|
366 |
-
positive = next_turn['text'].strip()
|
367 |
-
|
368 |
-
# Skip short texts
|
369 |
-
if (len(query.split()) < self.config.min_text_length or
|
370 |
-
len(positive.split()) < self.config.min_text_length):
|
371 |
-
continue
|
372 |
-
|
373 |
-
# Get negative samples
|
374 |
-
neg_samples = self._smart_negative_sampling(
|
375 |
-
positive,
|
376 |
-
response_pool,
|
377 |
-
neg_samples_per_pos
|
378 |
-
)
|
379 |
-
|
380 |
-
if len(neg_samples) == neg_samples_per_pos:
|
381 |
-
for neg in neg_samples:
|
382 |
-
queries.append(query)
|
383 |
-
positives.append(positive)
|
384 |
-
negatives.append(neg)
|
385 |
-
else:
|
386 |
-
logger.warning(f"Insufficient negative samples for positive response: '{positive}'")
|
387 |
-
|
388 |
-
# Log final dataset statistics
|
389 |
-
logger.info(f"Final dataset statistics:")
|
390 |
-
logger.info(f" Training examples: {len(queries)}")
|
391 |
-
logger.info(f" Unique queries: {len(set(queries))}")
|
392 |
-
logger.info(f" Unique responses: {len(set(positives))}")
|
393 |
-
|
394 |
-
return self._prepare_sequences(queries, positives, negatives)
|
395 |
-
|
396 |
-
def _smart_negative_sampling(
|
397 |
-
self,
|
398 |
-
positive: str,
|
399 |
-
response_pool: List[str],
|
400 |
-
n_samples: int,
|
401 |
-
max_attempts: int = 200,
|
402 |
-
similarity_bounds: Tuple[float, float] = (0.2, 0.9),
|
403 |
-
batch_size: int = 10
|
404 |
-
) -> List[str]:
|
405 |
-
"""Smart negative sampling with similarity bounds and fallback strategies."""
|
406 |
-
candidates = []
|
407 |
-
seen = set()
|
408 |
-
attempts = 0
|
409 |
-
|
410 |
-
while len(candidates) < n_samples and attempts < max_attempts:
|
411 |
-
remaining = min(batch_size, len(response_pool) - len(seen), max_attempts - attempts)
|
412 |
-
if remaining <= 0:
|
413 |
-
break
|
414 |
-
batch = random.sample(
|
415 |
-
[r for r in response_pool if r not in seen and r != positive],
|
416 |
-
remaining
|
417 |
-
)
|
418 |
-
|
419 |
-
for candidate in batch:
|
420 |
-
seen.add(candidate)
|
421 |
-
sim = self._improved_spacy_similarity(candidate, positive)
|
422 |
-
|
423 |
-
if similarity_bounds[0] < sim < similarity_bounds[1]:
|
424 |
-
candidates.append(candidate)
|
425 |
-
if len(candidates) == n_samples:
|
426 |
-
break
|
427 |
-
|
428 |
-
attempts += len(batch)
|
429 |
-
|
430 |
-
if len(candidates) < n_samples:
|
431 |
-
logger.warning(f"Only found {len(candidates)} negative samples for positive response: '{positive}'")
|
432 |
-
# Fallback to random negatives without similarity constraints
|
433 |
-
fallback_needed = n_samples - len(candidates)
|
434 |
-
available_negatives = [r for r in response_pool if r != positive and r not in seen]
|
435 |
-
if available_negatives:
|
436 |
-
additional_negatives = random.sample(
|
437 |
-
available_negatives,
|
438 |
-
min(fallback_needed, len(available_negatives))
|
439 |
-
)
|
440 |
-
candidates.extend(additional_negatives)
|
441 |
-
|
442 |
-
return candidates
|
443 |
-
|
444 |
-
def _prepare_sequences(
|
445 |
-
self,
|
446 |
-
queries: List[str],
|
447 |
-
positives: List[str],
|
448 |
-
negatives: List[str]
|
449 |
-
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
450 |
-
"""Modified sequence preparation for pretrained tokenizer."""
|
451 |
-
logger.info("Preparing sequences...")
|
452 |
-
|
453 |
-
# Process texts with special tokens
|
454 |
-
queries = [f"{self.special_tokens['user']} {q}" for q in queries]
|
455 |
-
positives = [f"{self.special_tokens['assistant']} {p}" for p in positives]
|
456 |
-
negatives = [f"{self.special_tokens['assistant']} {n}" for n in negatives]
|
457 |
-
|
458 |
-
# Tokenize using HuggingFace tokenizer
|
459 |
-
def encode_batch(texts: List[str]) -> tf.Tensor:
|
460 |
-
# HuggingFace tokenizer returns TensorFlow tensors when return_tensors='tf'
|
461 |
-
encodings = self.tokenizer(
|
462 |
-
texts,
|
463 |
-
padding='max_length',
|
464 |
-
truncation=True,
|
465 |
-
max_length=self.config.max_sequence_length,
|
466 |
-
return_tensors='tf'
|
467 |
-
)
|
468 |
-
return encodings['input_ids']
|
469 |
-
|
470 |
-
# Encode all sequences
|
471 |
-
q_tensor = encode_batch(queries)
|
472 |
-
p_tensor = encode_batch(positives)
|
473 |
-
n_tensor = encode_batch(negatives)
|
474 |
-
|
475 |
-
# Log statistics about encoded sequences
|
476 |
-
logger.info("Sequence statistics:")
|
477 |
-
logger.info(f" Query sequence shape: {q_tensor.shape}")
|
478 |
-
logger.info(f" Positive response sequence shape: {p_tensor.shape}")
|
479 |
-
logger.info(f" Negative response sequence shape: {n_tensor.shape}")
|
480 |
-
|
481 |
-
return q_tensor, p_tensor, n_tensor
|
482 |
-
|
483 |
-
def train(
|
484 |
-
self,
|
485 |
-
q_pad: tf.Tensor,
|
486 |
-
p_pad: tf.Tensor,
|
487 |
-
n_pad: tf.Tensor,
|
488 |
-
epochs: int = 3,
|
489 |
-
batch_size: int = 32,
|
490 |
-
validation_split: float = 0.2,
|
491 |
-
checkpoint_dir: Optional[Union[str, Path]] = None
|
492 |
-
):
|
493 |
-
"""Train the model with improved training loop."""
|
494 |
-
# Setup training
|
495 |
-
total_samples = tf.shape(q_pad)[0]
|
496 |
-
train_size = int((1 - validation_split) * total_samples.numpy())
|
497 |
-
|
498 |
-
# Shuffle and split data
|
499 |
-
indices = tf.random.shuffle(tf.range(start=0, limit=total_samples, dtype=tf.int32))
|
500 |
-
train_idx = indices[:train_size]
|
501 |
-
val_idx = indices[train_size:]
|
502 |
-
|
503 |
-
# Split data using TF indexing
|
504 |
-
train_data = (
|
505 |
-
tf.gather(q_pad, train_idx),
|
506 |
-
tf.gather(p_pad, train_idx),
|
507 |
-
tf.gather(n_pad, train_idx)
|
508 |
-
)
|
509 |
-
val_data = (
|
510 |
-
tf.gather(q_pad, val_idx),
|
511 |
-
tf.gather(p_pad, val_idx),
|
512 |
-
tf.gather(n_pad, val_idx)
|
513 |
-
)
|
514 |
-
|
515 |
-
# Setup optimizer with learning rate schedule
|
516 |
-
steps_per_epoch = train_size // batch_size
|
517 |
-
total_steps = steps_per_epoch * epochs
|
518 |
-
|
519 |
-
lr_schedule = self._get_lr_schedule(
|
520 |
-
total_steps,
|
521 |
-
self.config.learning_rate,
|
522 |
-
self.config.warmup_steps
|
523 |
-
)
|
524 |
-
|
525 |
-
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
|
526 |
-
|
527 |
-
# Setup checkpointing
|
528 |
-
if checkpoint_dir:
|
529 |
-
checkpoint_dir = Path(checkpoint_dir)
|
530 |
-
checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
531 |
-
|
532 |
-
# Setup checkpoint callback with correct file format
|
533 |
-
checkpoint_template = str(checkpoint_dir / "model_epoch_{epoch:04d}.weights.h5")
|
534 |
-
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
|
535 |
-
checkpoint_template,
|
536 |
-
save_weights_only=True,
|
537 |
-
save_best_only=True,
|
538 |
-
monitor='val_loss',
|
539 |
-
mode='min',
|
540 |
-
verbose=1
|
541 |
-
)
|
542 |
-
|
543 |
-
# Training loop
|
544 |
-
best_val_loss = float('inf')
|
545 |
-
patience = 5
|
546 |
-
wait = 0
|
547 |
-
|
548 |
-
for epoch in range(epochs):
|
549 |
-
# Training
|
550 |
-
train_loss = self._train_epoch(
|
551 |
-
train_data,
|
552 |
-
optimizer,
|
553 |
-
batch_size,
|
554 |
-
training=True
|
555 |
-
)
|
556 |
-
|
557 |
-
# Validation
|
558 |
-
val_loss = self._train_epoch(
|
559 |
-
val_data,
|
560 |
-
optimizer,
|
561 |
-
batch_size,
|
562 |
-
training=False
|
563 |
-
)
|
564 |
-
|
565 |
-
# Update history
|
566 |
-
self.history['train_loss'].append(train_loss)
|
567 |
-
self.history['val_loss'].append(val_loss)
|
568 |
-
|
569 |
-
logger.info(
|
570 |
-
f"Epoch {epoch + 1}/{epochs} - "
|
571 |
-
f"train_loss: {train_loss:.4f} - "
|
572 |
-
f"val_loss: {val_loss:.4f}"
|
573 |
-
)
|
574 |
-
|
575 |
-
# Early stopping
|
576 |
-
if val_loss < best_val_loss:
|
577 |
-
best_val_loss = val_loss
|
578 |
-
wait = 0
|
579 |
-
if checkpoint_dir:
|
580 |
-
self.save_models(checkpoint_dir / f"best_model")
|
581 |
-
else:
|
582 |
-
wait += 1
|
583 |
-
if wait >= patience:
|
584 |
-
logger.info("Early stopping triggered")
|
585 |
-
break
|
586 |
-
|
587 |
-
def _train_epoch(
|
588 |
-
self,
|
589 |
-
data: Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
|
590 |
-
optimizer: tf.keras.optimizers.Optimizer,
|
591 |
-
batch_size: int,
|
592 |
-
training: bool = True
|
593 |
-
) -> float:
|
594 |
-
"""Train for one epoch with enhanced logging and progress tracking."""
|
595 |
-
q_data, p_data, n_data = data
|
596 |
-
total_loss = 0.0
|
597 |
-
num_batches = tf.shape(q_data)[0] // batch_size
|
598 |
-
|
599 |
-
# Log current learning rate at start of epoch
|
600 |
-
if training:
|
601 |
-
if hasattr(optimizer.learning_rate, '__call__'):
|
602 |
-
current_lr = optimizer.learning_rate(optimizer.iterations)
|
603 |
-
else:
|
604 |
-
current_lr = optimizer.learning_rate
|
605 |
-
logger.info(f"Current learning rate: {float(current_lr):.6f}")
|
606 |
-
|
607 |
-
# Create progress bar
|
608 |
-
mode = "Training" if training else "Validation"
|
609 |
-
pbar = tqdm(
|
610 |
-
total=num_batches.numpy(),
|
611 |
-
desc=f"{mode} batches",
|
612 |
-
unit="batch",
|
613 |
-
dynamic_ncols=True
|
614 |
-
)
|
615 |
-
|
616 |
-
# Process batches
|
617 |
-
for i in range(num_batches):
|
618 |
-
start_idx = i * batch_size
|
619 |
-
end_idx = start_idx + batch_size
|
620 |
-
|
621 |
-
batch_loss = self._train_step(
|
622 |
-
q_data[start_idx:end_idx],
|
623 |
-
p_data[start_idx:end_idx],
|
624 |
-
n_data[start_idx:end_idx],
|
625 |
-
optimizer,
|
626 |
-
training
|
627 |
-
)
|
628 |
-
total_loss += batch_loss.numpy()
|
629 |
-
|
630 |
-
# Update progress bar with current loss
|
631 |
-
avg_loss = total_loss / (i + 1)
|
632 |
-
pbar.set_postfix({
|
633 |
-
'loss': f'{avg_loss:.4f}',
|
634 |
-
'lr': f'{float(current_lr):.6f}' if training else 'N/A'
|
635 |
-
})
|
636 |
-
pbar.update(1)
|
637 |
-
|
638 |
-
pbar.close()
|
639 |
-
return total_loss / num_batches.numpy() if num_batches > 0 else 0.0
|
640 |
-
|
-    @tf.function
-    def _train_step(
-        self,
-        q_batch: tf.Tensor,
-        p_batch: tf.Tensor,
-        n_batch: tf.Tensor,
-        optimizer: tf.keras.optimizers.Optimizer,
-        training: bool = True
-    ) -> tf.Tensor:
-        """Single training step with triplet loss."""
-        with tf.GradientTape() as tape:
-            # Get embeddings
-            q_emb = self.query_encoder(q_batch, training=training)
-            p_emb = self.response_encoder(p_batch, training=training)
-            n_emb = self.response_encoder(n_batch, training=training)
-
-            # Calculate triplet loss
-            pos_dist = tf.reduce_sum(tf.square(q_emb - p_emb), axis=1)
-            neg_dist = tf.reduce_sum(tf.square(q_emb - n_emb), axis=1)
-
-            loss = tf.maximum(0.0, self.config.margin + pos_dist - neg_dist)
-            loss = tf.reduce_mean(loss)
-
-        if training:
-            # Apply gradients
-            gradients = tape.gradient(
-                loss,
-                self.query_encoder.trainable_variables +
-                self.response_encoder.trainable_variables
-            )
-            optimizer.apply_gradients(zip(
-                gradients,
-                self.query_encoder.trainable_variables +
-                self.response_encoder.trainable_variables
-            ))
-
-        return loss
-
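For reference (not part of the diff): the step above is a squared-distance triplet margin loss, mean(max(0, margin + ||q - p||^2 - ||q - n||^2)). A small NumPy sketch of the same objective on toy embeddings follows; the margin value and vectors are illustrative, not the config defaults.

import numpy as np

def triplet_loss(q, p, n, margin=0.3):
    # mean(max(0, margin + ||q - p||^2 - ||q - n||^2)) over the batch
    pos_dist = np.sum((q - p) ** 2, axis=1)
    neg_dist = np.sum((q - n) ** 2, axis=1)
    return np.mean(np.maximum(0.0, margin + pos_dist - neg_dist))

q = np.array([[1.0, 0.0], [0.0, 1.0]])
p = np.array([[0.9, 0.1], [0.1, 0.9]])   # positives sit close to their queries
n = np.array([[0.0, 1.0], [1.0, 0.0]])   # negatives sit far away
print(triplet_loss(q, p, n))             # 0.0 here: the margin is already satisfied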
679 |
-
def _get_lr_schedule(
|
680 |
-
self,
|
681 |
-
total_steps: int,
|
682 |
-
peak_lr: float,
|
683 |
-
warmup_steps: int
|
684 |
-
) -> tf.keras.optimizers.schedules.LearningRateSchedule:
|
685 |
-
"""Enhanced learning rate schedule with better error handling and logging."""
|
686 |
-
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
|
687 |
-
def __init__(
|
688 |
-
self,
|
689 |
-
total_steps: int,
|
690 |
-
peak_lr: float,
|
691 |
-
warmup_steps: int
|
692 |
-
):
|
693 |
-
super().__init__()
|
694 |
-
self.total_steps = tf.cast(total_steps, tf.float32)
|
695 |
-
self.peak_lr = tf.cast(peak_lr, tf.float32)
|
696 |
-
self.warmup_steps = tf.cast(max(1, warmup_steps), tf.float32) # Prevent 0
|
697 |
-
|
698 |
-
# Calculate and store constants
|
699 |
-
self.initial_lr = self.peak_lr * 0.1 # Start at 10% of peak
|
700 |
-
self.min_lr = self.peak_lr * 0.01 # Minimum 1% of peak
|
701 |
-
|
702 |
-
logger.info(f"Learning rate schedule initialized:")
|
703 |
-
logger.info(f" Initial LR: {float(self.initial_lr):.6f}")
|
704 |
-
logger.info(f" Peak LR: {float(self.peak_lr):.6f}")
|
705 |
-
logger.info(f" Min LR: {float(self.min_lr):.6f}")
|
706 |
-
logger.info(f" Warmup steps: {int(self.warmup_steps)}")
|
707 |
-
logger.info(f" Total steps: {int(self.total_steps)}")
|
708 |
-
|
709 |
-
def __call__(self, step):
|
710 |
-
step = tf.cast(step, tf.float32)
|
711 |
-
|
712 |
-
# Warmup phase
|
713 |
-
warmup_factor = tf.minimum(1.0, step / self.warmup_steps)
|
714 |
-
warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor
|
715 |
-
|
716 |
-
# Decay phase
|
717 |
-
decay_steps = tf.maximum(1.0, self.total_steps - self.warmup_steps)
|
718 |
-
decay_factor = (step - self.warmup_steps) / decay_steps
|
719 |
-
decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0) # Clip to [0,1]
|
720 |
-
|
721 |
-
cosine_decay = 0.5 * (1.0 + tf.cos(np.pi * decay_factor))
|
722 |
-
decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
|
723 |
-
|
724 |
-
# Choose between warmup and decay
|
725 |
-
final_lr = tf.where(step < self.warmup_steps, warmup_lr, decay_lr)
|
726 |
-
|
727 |
-
# Ensure learning rate is valid
|
728 |
-
final_lr = tf.maximum(self.min_lr, final_lr)
|
729 |
-
final_lr = tf.where(tf.math.is_finite(final_lr), final_lr, self.min_lr)
|
730 |
-
|
731 |
-
return final_lr
|
732 |
-
|
733 |
-
def get_config(self):
|
734 |
-
return {
|
735 |
-
"total_steps": self.total_steps,
|
736 |
-
"peak_lr": self.peak_lr,
|
737 |
-
"warmup_steps": self.warmup_steps,
|
738 |
-
}
|
739 |
-
|
740 |
-
return CustomSchedule(total_steps, peak_lr, warmup_steps)
|
741 |
-
|
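For reference (not part of the diff): the schedule above warms up linearly from 10% of the peak learning rate and then follows a cosine decay down to 1% of the peak. A plain-NumPy sketch of that shape is below; the step counts and peak value are illustrative, not the config defaults.

import numpy as np

def lr_at(step, total_steps=1000, warmup_steps=100, peak_lr=2e-5):
    initial_lr, min_lr = peak_lr * 0.1, peak_lr * 0.01
    if step < warmup_steps:                       # linear warmup from 10% of peak
        return initial_lr + (peak_lr - initial_lr) * step / warmup_steps
    decay = min(max((step - warmup_steps) / (total_steps - warmup_steps), 0.0), 1.0)
    cosine = 0.5 * (1.0 + np.cos(np.pi * decay))  # cosine decay toward 1% of peak
    return min_lr + (peak_lr - min_lr) * cosine

for s in (0, 50, 100, 500, 1000):
    print(s, f"{lr_at(s):.2e}")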
742 |
-
def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
|
743 |
-
"""Encode a query with optional conversation context."""
|
744 |
-
# Prepare query with context
|
745 |
-
if context:
|
746 |
-
context_str = ' '.join([
|
747 |
-
f"{self.special_tokens['user']} {q} "
|
748 |
-
f"{self.special_tokens['assistant']} {r}"
|
749 |
-
for q, r in context[-self.config.max_context_turns:]
|
750 |
-
])
|
751 |
-
query = f"{context_str} {self.special_tokens['user']} {query}"
|
752 |
-
else:
|
753 |
-
query = f"{self.special_tokens['user']} {query}"
|
754 |
-
|
755 |
-
# Tokenize and pad using TensorFlow tensors
|
756 |
-
encodings = self.tokenizer(
|
757 |
-
[query],
|
758 |
-
padding='max_length',
|
759 |
-
truncation=True,
|
760 |
-
max_length=self.config.max_sequence_length,
|
761 |
-
return_tensors='tf'
|
762 |
-
)
|
763 |
-
input_ids = encodings['input_ids']
|
764 |
-
|
765 |
-
return self.query_encoder(input_ids, training=False)
|
766 |
-
|
767 |
-
def encode_responses(self, responses: List[str]) -> tf.Tensor:
|
768 |
-
"""Encode a batch of responses."""
|
769 |
-
# Prepare responses
|
770 |
-
responses = [
|
771 |
-
f"{self.special_tokens['assistant']} {r}"
|
772 |
-
for r in responses
|
773 |
-
]
|
774 |
-
|
775 |
-
# Tokenize and pad using TensorFlow tensors
|
776 |
-
encodings = self.tokenizer(
|
777 |
-
responses,
|
778 |
-
padding='max_length',
|
779 |
-
truncation=True,
|
780 |
-
max_length=self.config.max_sequence_length,
|
781 |
-
return_tensors='tf'
|
782 |
-
)
|
783 |
-
input_ids = encodings['input_ids']
|
784 |
-
|
785 |
-
return self.response_encoder(input_ids, training=False)
|
786 |
-
|
787 |
-
def retrieve_responses(
|
788 |
-
self,
|
789 |
-
query: str,
|
790 |
-
candidates: List[str],
|
791 |
-
context: Optional[List[Tuple[str, str]]] = None,
|
792 |
-
top_k: int = 5
|
793 |
-
) -> List[Tuple[str, float]]:
|
794 |
-
"""Retrieve top-k responses for a query."""
|
795 |
-
# Encode query and candidates
|
796 |
-
q_emb = self.encode_query(query, context)
|
797 |
-
c_emb = self.encode_responses(candidates)
|
798 |
-
|
799 |
-
# Calculate similarities
|
800 |
-
similarities = tf.matmul(q_emb, c_emb, transpose_b=True).numpy()[0]
|
801 |
-
|
802 |
-
# Get top-k responses
|
803 |
-
top_indices = np.argsort(similarities)[::-1][:top_k]
|
804 |
-
|
805 |
-
return [(candidates[i], similarities[i]) for i in top_indices]
|
806 |
-
|
807 |
-
def chat(
|
808 |
-
self,
|
809 |
-
query: str,
|
810 |
-
response_pool: List[str],
|
811 |
-
conversation_history: Optional[List[Tuple[str, str]]] = None,
|
812 |
-
top_k: int = 5
|
813 |
-
) -> Tuple[str, List[Tuple[str, float]]]:
|
814 |
-
"""Interactive chat with response selection."""
|
815 |
-
# Get responses with scores
|
816 |
-
responses = self.retrieve_responses(
|
817 |
-
query,
|
818 |
-
response_pool,
|
819 |
-
conversation_history,
|
820 |
-
top_k
|
821 |
-
)
|
822 |
-
|
823 |
-
# Return best response and all candidates with scores
|
824 |
-
return responses[0][0], responses
|
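Illustrative aside, not part of this commit: with L2-normalized embeddings, the retrieval path in this deleted file reduces to a dot product followed by an argsort. A minimal NumPy sketch of that ranking step follows, with the encoder calls stubbed out by random vectors.

import numpy as np

rng = np.random.default_rng(0)

def normalize(x):
    return x / np.linalg.norm(x, axis=-1, keepdims=True)

query_emb = normalize(rng.normal(size=(1, 256)))    # stand-in for encode_query(...)
cand_embs = normalize(rng.normal(size=(10, 256)))   # stand-in for encode_responses(...)

scores = (query_emb @ cand_embs.T).ravel()          # cosine similarity per candidate
top_k = np.argsort(scores)[::-1][:5]                # same ranking rule as retrieve_responses
print([(int(i), float(scores[i])) for i in top_k])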
chatbot4.py → chatbot_model.py
RENAMED
@@ -2,30 +2,25 @@ from transformers import TFAutoModel, AutoTokenizer
|
|
2 |
import tensorflow as tf
|
3 |
import numpy as np
|
4 |
from typing import List, Tuple, Dict, Optional, Union, Any
|
|
|
5 |
from dataclasses import dataclass
|
6 |
-
import logging
|
7 |
import json
|
8 |
from tqdm import tqdm
|
9 |
from pathlib import Path
|
|
|
10 |
import faiss
|
11 |
from response_quality_checker import ResponseQualityChecker
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
# Configure logging
|
17 |
-
logging.basicConfig(
|
18 |
-
level=logging.DEBUG,
|
19 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
20 |
-
)
|
21 |
-
logger = logging.getLogger(__name__)
|
22 |
|
23 |
@dataclass
|
24 |
class ChatbotConfig:
|
25 |
"""Configuration for the RetrievalChatbot."""
|
26 |
vocab_size: int = 30526 # DistilBERT vocab size
|
27 |
-
|
28 |
-
embedding_dim: int =
|
29 |
encoder_units: int = 256
|
30 |
num_attention_heads: int = 8
|
31 |
dropout_rate: float = 0.2
|
@@ -70,13 +65,20 @@ class EncoderModel(tf.keras.Model):
|
|
70 |
# Freeze pretrained weights if specified
|
71 |
self.pretrained.distilbert.embeddings.trainable = False
|
72 |
for i, layer_module in enumerate(self.pretrained.distilbert.transformer.layer):
|
73 |
-
if i <
|
74 |
layer_module.trainable = False
|
75 |
else:
|
76 |
layer_module.trainable = True
|
77 |
|
78 |
# Pooling layer (Global Average Pooling)
|
79 |
self.pooler = tf.keras.layers.GlobalAveragePooling1D()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
# Dropout and normalization
|
82 |
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
@@ -90,14 +92,11 @@ class EncoderModel(tf.keras.Model):
|
|
90 |
pretrained_outputs = self.pretrained(inputs, training=training)
|
91 |
x = pretrained_outputs.last_hidden_state # Shape: [batch_size, seq_len, embedding_dim]
|
92 |
|
93 |
-
# Apply pooling
|
94 |
-
x = self.pooler(x) # Shape: [batch_size,
|
95 |
-
|
96 |
-
# Apply dropout
|
97 |
-
x = self.
|
98 |
-
|
99 |
-
# L2 normalization
|
100 |
-
x = self.normalize(x) # Shape: [batch_size, embedding_dim]
|
101 |
|
102 |
return x
|
103 |
|
@@ -110,42 +109,34 @@ class EncoderModel(tf.keras.Model):
|
|
110 |
"name": self.name
|
111 |
})
|
112 |
return config
|
113 |
-
|
114 |
-
# class CustomLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
|
115 |
-
# def __init__(self, initial_lr, peak_lr, min_lr, warmup_steps, total_steps):
|
116 |
-
# super().__init__()
|
117 |
-
# self.initial_lr = initial_lr
|
118 |
-
# self.peak_lr = peak_lr
|
119 |
-
# self.min_lr = min_lr
|
120 |
-
# self.warmup_steps = min(warmup_steps, total_steps // 2) # Ensure warmup_steps <= total_steps
|
121 |
-
# self.total_steps = total_steps
|
122 |
-
|
123 |
-
# def __call__(self, step):
|
124 |
-
# if step < self.warmup_steps:
|
125 |
-
# # Linear warmup
|
126 |
-
# lr = self.initial_lr + (self.peak_lr - self.initial_lr) * (step / self.warmup_steps)
|
127 |
-
# else:
|
128 |
-
# # Linear decay
|
129 |
-
# decay_steps = self.total_steps - self.warmup_steps
|
130 |
-
# if decay_steps > 0:
|
131 |
-
# lr = self.peak_lr - (self.peak_lr - self.min_lr) * ((step - self.warmup_steps) / decay_steps)
|
132 |
-
# else:
|
133 |
-
# lr = self.peak_lr
|
134 |
-
# return lr
|
135 |
-
|
136 |
-
# def get_config(self):
|
137 |
-
# return {
|
138 |
-
# "initial_lr": self.initial_lr,
|
139 |
-
# "peak_lr": self.peak_lr,
|
140 |
-
# "min_lr": self.min_lr,
|
141 |
-
# "warmup_steps": self.warmup_steps,
|
142 |
-
# "total_steps": self.total_steps,
|
143 |
-
# }
|
144 |
|
145 |
-
class RetrievalChatbot:
|
146 |
"""Retrieval-based chatbot using pretrained embeddings and FAISS for similarity search."""
|
147 |
-
def __init__(self, config: ChatbotConfig, dialogues: List[dict] = []):
|
148 |
self.config = config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
# Special tokens
|
151 |
self.special_tokens = {
|
@@ -161,8 +152,12 @@ class RetrievalChatbot:
|
|
161 |
{'additional_special_tokens': list(self.special_tokens.values())}
|
162 |
)
|
163 |
|
164 |
-
# Build encoders
|
165 |
-
self.
|
|
|
|
|
|
|
|
|
166 |
|
167 |
# Initialize FAISS index
|
168 |
self._initialize_faiss()
|
@@ -193,31 +188,41 @@ class RetrievalChatbot:
|
|
193 |
self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
|
194 |
logger.info(f"Token embeddings resized to: {new_vocab_size}")
|
195 |
|
196 |
-
#
|
197 |
logger.info("Inspecting embeddings attributes:")
|
198 |
for attr in dir(self.encoder.pretrained.distilbert.embeddings):
|
199 |
if not attr.startswith('_'):
|
200 |
logger.info(f" {attr}")
|
201 |
|
202 |
-
#
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
logger.info(f"Encoder Embedding Dimension: {embedding_dim}")
|
206 |
logger.info(f"Encoder Embedding Vocabulary Size: {vocab_size}")
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
logger.
|
211 |
-
|
212 |
-
def check_trainable_variables(self):
|
213 |
-
"""Logs the trainable variables in both encoders."""
|
214 |
-
logger.info("Checking trainable variables in shared_encoder:")
|
215 |
-
for var in self.encoder.pretrained.trainable_variables:
|
216 |
-
logger.info(f" {var.name}, shape: {var.shape}")
|
217 |
-
|
218 |
-
# logger.info("Checking trainable variables in response_encoder:")
|
219 |
-
# for var in self.response_encoder.pretrained.trainable_variables:
|
220 |
-
# logger.info(f" {var.name}, shape: {var.shape}")
|
221 |
|
222 |
def _initialize_faiss(self):
|
223 |
"""Initialize FAISS index based on available resources."""
|
@@ -239,10 +244,10 @@ class RetrievalChatbot:
|
|
239 |
self.index = faiss.IndexFlatIP(self.config.embedding_dim)
|
240 |
logger.info("FAISS index initialized.")
|
241 |
|
242 |
-
def verify_faiss_index(
|
243 |
"""Verify that FAISS index matches the response pool."""
|
244 |
-
indexed_size =
|
245 |
-
pool_size = len(
|
246 |
logger.info(f"FAISS index size: {indexed_size}")
|
247 |
logger.info(f"Response pool size: {pool_size}")
|
248 |
if indexed_size != pool_size:
|
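Illustrative aside, not part of this commit: a minimal sketch of the FAISS usage implied here, an IndexFlatIP over L2-normalized float32 embeddings so that inner product behaves as cosine similarity. The dimension and random data are made up for the example.

import faiss
import numpy as np

dim = 512
embeddings = np.random.rand(100, dim).astype("float32")
faiss.normalize_L2(embeddings)            # inner product == cosine after L2 normalization

index = faiss.IndexFlatIP(dim)
index.add(embeddings)                     # index.ntotal is now 100

query = np.random.rand(1, dim).astype("float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 5)      # top-5 row ids and their similarities
print(ids[0], scores[0])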
@@ -268,12 +273,12 @@ class RetrievalChatbot:
|
|
268 |
logger.info(f"Found {len(unique_responses)} unique responses.")
|
269 |
|
270 |
# Encode responses
|
|
|
271 |
response_embeddings = self.encode_responses(unique_responses)
|
272 |
response_embeddings = response_embeddings.numpy()
|
273 |
|
274 |
# Ensure float32
|
275 |
if response_embeddings.dtype != np.float32:
|
276 |
-
logger.info(f"Converting embeddings from {response_embeddings.dtype} to float32.")
|
277 |
response_embeddings = response_embeddings.astype('float32')
|
278 |
|
279 |
# Ensure the array is contiguous in memory
|
@@ -312,14 +317,8 @@ class RetrievalChatbot:
|
|
312 |
Returns:
|
313 |
tf.Tensor: Tensor of shape (N, emb_dim) with all response embeddings.
|
314 |
"""
|
315 |
-
|
316 |
-
|
317 |
-
# We'll accumulate embeddings in a list and concatenate at the end
|
318 |
all_embeddings = []
|
319 |
-
|
320 |
-
# Set up a progress bar
|
321 |
-
from tqdm import tqdm
|
322 |
-
pbar = tqdm(total=len(responses), desc="Encoding responses")
|
323 |
|
324 |
# Process the responses in chunks of 'batch_size'
|
325 |
for start_idx in range(0, len(responses), batch_size):
|
@@ -331,7 +330,7 @@ class RetrievalChatbot:
|
|
331 |
batch_texts,
|
332 |
padding='max_length',
|
333 |
truncation=True,
|
334 |
-
max_length=self.config.
|
335 |
return_tensors='tf',
|
336 |
)
|
337 |
|
@@ -346,11 +345,6 @@ class RetrievalChatbot:
|
|
346 |
# Collect
|
347 |
all_embeddings.append(embeddings_batch)
|
348 |
|
349 |
-
# Update progress bar
|
350 |
-
pbar.update(len(batch_texts))
|
351 |
-
|
352 |
-
pbar.close()
|
353 |
-
|
354 |
# Concatenate all batch embeddings along axis=0
|
355 |
if len(all_embeddings) == 1:
|
356 |
# Only one batch
|
@@ -359,10 +353,6 @@ class RetrievalChatbot:
|
|
359 |
# Multiple batches, concatenate
|
360 |
final_embeddings = tf.concat(all_embeddings, axis=0)
|
361 |
|
362 |
-
logger.info(
|
363 |
-
f"Finished encoding {len(responses)} responses. "
|
364 |
-
f"Final shape: {final_embeddings.shape}"
|
365 |
-
)
|
366 |
return final_embeddings
|
367 |
|
368 |
def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
|
@@ -383,7 +373,7 @@ class RetrievalChatbot:
|
|
383 |
[query],
|
384 |
padding='max_length',
|
385 |
truncation=True,
|
386 |
-
max_length=self.config.
|
387 |
return_tensors='tf'
|
388 |
)
|
389 |
input_ids = encodings['input_ids']
|
@@ -391,7 +381,6 @@ class RetrievalChatbot:
|
|
391 |
# Verify token IDs
|
392 |
max_id = tf.reduce_max(input_ids).numpy()
|
393 |
new_vocab_size = len(self.tokenizer)
|
394 |
-
logger.info(f"Maximum input_id: {max_id}, Vocab Size: {new_vocab_size}")
|
395 |
|
396 |
if max_id >= new_vocab_size:
|
397 |
logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
|
@@ -399,6 +388,46 @@ class RetrievalChatbot:
|
|
399 |
|
400 |
# Get embeddings from the shared encoder
|
401 |
return self.encoder(input_ids, training=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
402 |
|
403 |
def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
|
404 |
"""Retrieve top-k responses using FAISS."""
|
@@ -456,10 +485,6 @@ class RetrievalChatbot:
|
|
456 |
load_dir / "shared_encoder",
|
457 |
config=config
|
458 |
)
|
459 |
-
# chatbot.response_encoder.pretrained = TFAutoModel.from_pretrained(
|
460 |
-
# load_dir / "response_encoder",
|
461 |
-
# config=config
|
462 |
-
# )
|
463 |
|
464 |
# Load tokenizer
|
465 |
chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
|
@@ -498,77 +523,137 @@ class RetrievalChatbot:
|
|
498 |
def prepare_dataset(
|
499 |
self,
|
500 |
dialogues: List[dict],
|
|
|
501 |
debug_samples: int = None
|
502 |
) -> Tuple[tf.Tensor, tf.Tensor]:
|
503 |
"""
|
504 |
-
Prepares dataset for
|
505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
506 |
"""
|
507 |
-
|
|
|
508 |
|
509 |
queries, positives = [], []
|
510 |
|
|
|
511 |
for dialogue in dialogues:
|
512 |
turns = dialogue.get('turns', [])
|
513 |
for i in range(len(turns) - 1):
|
514 |
current_turn = turns[i]
|
515 |
next_turn = turns[i+1]
|
516 |
|
517 |
-
if (current_turn.get('speaker') == 'user'
|
518 |
-
next_turn.get('speaker') == 'assistant'
|
519 |
-
'text' in current_turn
|
520 |
-
'text' in next_turn):
|
521 |
|
522 |
-
|
523 |
-
|
524 |
|
525 |
-
queries.append(
|
526 |
-
positives.append(
|
527 |
|
528 |
-
#
|
529 |
if debug_samples is not None:
|
530 |
queries = queries[:debug_samples]
|
531 |
positives = positives[:debug_samples]
|
532 |
logger.info(f"Debug mode: limited to {debug_samples} pairs.")
|
533 |
|
534 |
-
logger.info(f"Prepared {len(queries)} (query, positive) pairs.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
535 |
|
536 |
-
#
|
|
|
|
|
|
|
|
|
|
|
537 |
encoded_queries = self.tokenizer(
|
538 |
-
|
539 |
padding='max_length',
|
540 |
truncation=True,
|
541 |
-
max_length=self.config.
|
542 |
return_tensors='tf'
|
543 |
)
|
544 |
-
# Tokenize positives
|
545 |
encoded_positives = self.tokenizer(
|
546 |
-
|
547 |
padding='max_length',
|
548 |
truncation=True,
|
549 |
-
max_length=self.config.
|
550 |
return_tensors='tf'
|
551 |
)
|
552 |
|
553 |
q_tensor = encoded_queries['input_ids']
|
554 |
p_tensor = encoded_positives['input_ids']
|
555 |
|
556 |
-
logger.info("Tokenized and padded sequences for in-batch training.")
|
557 |
return q_tensor, p_tensor
|
558 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
559 |
def train(
|
560 |
self,
|
561 |
q_pad: tf.Tensor,
|
562 |
p_pad: tf.Tensor,
|
563 |
-
epochs: int,
|
564 |
-
batch_size: int,
|
565 |
-
validation_split: float,
|
566 |
-
checkpoint_dir: str,
|
567 |
use_lr_schedule: bool = True,
|
568 |
peak_lr: float = 2e-5,
|
569 |
warmup_steps_ratio: float = 0.1,
|
570 |
early_stopping_patience: int = 3,
|
571 |
-
min_delta: float = 1e-4
|
|
|
572 |
):
|
573 |
dataset_size = tf.shape(q_pad)[0].numpy()
|
574 |
val_size = int(dataset_size * validation_split)
|
@@ -604,21 +689,20 @@ class RetrievalChatbot:
|
|
604 |
val_q = q_pad[train_size:]
|
605 |
val_p = p_pad[train_size:]
|
606 |
|
607 |
-
train_dataset = tf.data.Dataset.from_tensor_slices((train_q, train_p))
|
608 |
-
|
|
|
|
|
609 |
|
610 |
-
val_dataset = tf.data.Dataset.from_tensor_slices((val_q, val_p))
|
611 |
-
|
|
|
612 |
|
613 |
# 3) Checkpoint + manager
|
614 |
checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
|
615 |
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
|
616 |
|
617 |
# 4) TensorBoard setup
|
618 |
-
import datetime
|
619 |
-
import os
|
620 |
-
from pathlib import Path
|
621 |
-
|
622 |
log_dir = Path(checkpoint_dir) / "tensorboard_logs"
|
623 |
log_dir.mkdir(parents=True, exist_ok=True)
|
624 |
|
@@ -638,48 +722,91 @@ class RetrievalChatbot:
|
|
638 |
logger.info("Beginning training loop...")
|
639 |
global_step = 0
|
640 |
|
|
|
|
|
|
|
|
|
|
|
641 |
from tqdm import tqdm
|
642 |
for epoch in range(1, epochs + 1):
|
643 |
logger.info(f"\n=== Epoch {epoch}/{epochs} ===")
|
644 |
epoch_loss_avg = tf.keras.metrics.Mean()
|
645 |
|
646 |
-
|
647 |
with tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}") as pbar:
|
648 |
for (q_batch, p_batch) in train_dataset:
|
|
|
649 |
global_step += 1
|
650 |
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
656 |
if use_lr_schedule:
|
|
|
657 |
lr = self.optimizer.learning_rate
|
658 |
if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
|
659 |
-
# Get the current step
|
660 |
current_step = tf.cast(self.optimizer.iterations, tf.float32)
|
661 |
-
# Compute the current learning rate
|
662 |
current_lr = lr(current_step)
|
663 |
else:
|
664 |
-
# If learning_rate is not a schedule, use it directly
|
665 |
current_lr = lr
|
666 |
-
# Convert to float for logging
|
667 |
current_lr_value = float(current_lr.numpy())
|
668 |
else:
|
669 |
-
# If using fixed learning rate
|
670 |
current_lr_value = float(self.optimizer.learning_rate.numpy())
|
671 |
|
672 |
-
# Update tqdm
|
673 |
pbar.update(1)
|
674 |
pbar.set_postfix({
|
675 |
-
"loss": f"{
|
676 |
"lr": f"{current_lr_value:.2e}"
|
677 |
})
|
678 |
|
679 |
-
# TensorBoard
|
680 |
-
|
681 |
-
|
682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
683 |
|
684 |
# Validation
|
685 |
val_loss_avg = tf.keras.metrics.Mean()
|
@@ -726,90 +853,6 @@ class RetrievalChatbot:
|
|
726 |
|
727 |
logger.info("In-batch training completed!")
|
728 |
|
729 |
-
@tf.function
|
730 |
-
def _train_step(self, q_batch, p_batch):
|
731 |
-
"""
|
732 |
-
Single training step using in-batch negatives.
|
733 |
-
q_batch: (batch_size, seq_len) int32 input_ids for queries
|
734 |
-
p_batch: (batch_size, seq_len) int32 input_ids for positives
|
735 |
-
"""
|
736 |
-
with tf.GradientTape() as tape:
|
737 |
-
# Encode queries and positives
|
738 |
-
q_enc = self.encoder(q_batch, training=True) # [B, emb_dim]
|
739 |
-
p_enc = self.encoder(p_batch, training=True) # [B, emb_dim]
|
740 |
-
|
741 |
-
# Compute similarity matrix: (B, B) = q_enc * p_enc^T
|
742 |
-
# If embeddings are L2-normalized, this is cosine similarity
|
743 |
-
sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True) # [B, B]
|
744 |
-
|
745 |
-
# Labels are just the diagonal indices
|
746 |
-
batch_size = tf.shape(q_enc)[0]
|
747 |
-
labels = tf.range(batch_size, dtype=tf.int32) # [0..B-1]
|
748 |
-
|
749 |
-
# Softmax cross-entropy
|
750 |
-
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
751 |
-
labels=labels,
|
752 |
-
logits=sim_matrix
|
753 |
-
)
|
754 |
-
loss = tf.reduce_mean(loss)
|
755 |
-
|
756 |
-
# Compute gradients for the pretrained DistilBERT variables only
|
757 |
-
train_vars = self.encoder.pretrained.trainable_variables
|
758 |
-
gradients = tape.gradient(loss, train_vars)
|
759 |
-
|
760 |
-
# Remove any None grads (in case some layers are frozen)
|
761 |
-
grads_and_vars = [(g, v) for g, v in zip(gradients, train_vars) if g is not None]
|
762 |
-
if grads_and_vars:
|
763 |
-
self.optimizer.apply_gradients(grads_and_vars)
|
764 |
-
|
765 |
-
return loss
|
766 |
-
|
767 |
-
def _prepare_sequences(
|
768 |
-
self,
|
769 |
-
queries: List[str],
|
770 |
-
positives: List[str],
|
771 |
-
negatives: List[str]
|
772 |
-
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
773 |
-
"""Prepare and tokenize sequences for training."""
|
774 |
-
logger.info("Preparing sequences for training...")
|
775 |
-
|
776 |
-
# Handle empty lists
|
777 |
-
if not queries:
|
778 |
-
logger.error("No queries to encode. Skipping sequence preparation.")
|
779 |
-
return tf.constant([]), tf.constant([]), tf.constant([])
|
780 |
-
|
781 |
-
# Process texts with special tokens
|
782 |
-
queries = [f"{self.special_tokens['user']} {q}" for q in queries]
|
783 |
-
positives = [f"{self.special_tokens['assistant']} {p}" for p in positives]
|
784 |
-
negatives = [f"{self.special_tokens['assistant']} {n}" for n in negatives]
|
785 |
-
|
786 |
-
# Tokenize using HuggingFace tokenizer
|
787 |
-
def encode_batch(texts: List[str]) -> tf.Tensor:
|
788 |
-
if not texts:
|
789 |
-
logger.error("Empty text list provided to tokenizer.")
|
790 |
-
return tf.constant([])
|
791 |
-
encodings = self.tokenizer(
|
792 |
-
texts,
|
793 |
-
padding='max_length',
|
794 |
-
truncation=True,
|
795 |
-
max_length=self.config.max_sequence_length,
|
796 |
-
return_tensors='tf'
|
797 |
-
)
|
798 |
-
return encodings['input_ids']
|
799 |
-
|
800 |
-
# Encode all sequences
|
801 |
-
q_tensor = encode_batch(queries)
|
802 |
-
p_tensor = encode_batch(positives)
|
803 |
-
n_tensor = encode_batch(negatives)
|
804 |
-
|
805 |
-
# Log statistics about encoded sequences
|
806 |
-
logger.info("Sequence statistics:")
|
807 |
-
logger.info(f" Query sequence shape: {q_tensor.shape}")
|
808 |
-
logger.info(f" Positive response sequence shape: {p_tensor.shape}")
|
809 |
-
logger.info(f" Negative response sequence shape: {n_tensor.shape}")
|
810 |
-
|
811 |
-
return q_tensor, p_tensor, n_tensor
|
812 |
-
|
813 |
def _get_lr_schedule(
|
814 |
self,
|
815 |
total_steps: int,
|
@@ -855,7 +898,7 @@ class RetrievalChatbot:
|
|
855 |
decay_factor = (step - self.warmup_steps) / decay_steps
|
856 |
decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0) # Clip to [0,1]
|
857 |
|
858 |
-
cosine_decay = 0.5 * (1.0 + tf.cos(
|
859 |
decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
|
860 |
|
861 |
# Choose between warmup and decay
|
@@ -881,411 +924,334 @@ class RetrievalChatbot:
|
|
881 |
normalized_emb1 = emb1 / np.linalg.norm(emb1, axis=1, keepdims=True)
|
882 |
normalized_emb2 = emb2 / np.linalg.norm(emb2, axis=1, keepdims=True)
|
883 |
return np.dot(normalized_emb1, normalized_emb2.T)
|
884 |
-
|
885 |
-
def run_automatic_validation(
|
886 |
-
self,
|
887 |
-
quality_checker: 'ResponseQualityChecker',
|
888 |
-
num_examples: int = 5
|
889 |
-
) -> Dict[str, Any]:
|
890 |
-
"""
|
891 |
-
Run automatic validation with quality metrics using FAISS-based retrieval.
|
892 |
-
"""
|
893 |
-
logger.info("\n=== Running Automatic Validation ===")
|
894 |
-
|
895 |
-
test_queries = [
|
896 |
-
"Hello, how are you today?",
|
897 |
-
"What's the weather like?",
|
898 |
-
"Can you help me with a problem?",
|
899 |
-
"Tell me a joke",
|
900 |
-
"What time is it?",
|
901 |
-
"I need help with my homework",
|
902 |
-
"Where's a good place to eat?",
|
903 |
-
"What movies are playing?",
|
904 |
-
"How do I reset my password?",
|
905 |
-
"Can you recommend a book?"
|
906 |
-
]
|
907 |
-
|
908 |
-
test_queries = test_queries[:num_examples]
|
909 |
-
metrics_history = []
|
910 |
-
|
911 |
-
for i, query in enumerate(test_queries, 1):
|
912 |
-
logger.info(f"\nTest Case {i}:")
|
913 |
-
logger.info(f"Query: {query}")
|
914 |
-
|
915 |
-
# Get responses and scores using FAISS
|
916 |
-
responses = self.retrieve_responses_faiss(query, top_k=5)
|
917 |
-
|
918 |
-
# Check quality
|
919 |
-
quality_metrics = quality_checker.check_response_quality(query, responses)
|
920 |
-
metrics_history.append(quality_metrics)
|
921 |
-
|
922 |
-
# Log results
|
923 |
-
logger.info(f"Quality Metrics: {quality_metrics}")
|
924 |
-
logger.info("Top responses:")
|
925 |
-
for j, (response, score) in enumerate(responses[:3], 1):
|
926 |
-
logger.info(f"{j}. Score: {score:.4f}")
|
927 |
-
logger.info(f" Response: {response}")
|
928 |
-
if j == 1 and not quality_metrics.get('is_confident', False):
|
929 |
-
logger.info(" [Low Confidence - Would abstain from answering]")
|
930 |
-
|
931 |
-
# Calculate aggregate metrics
|
932 |
-
aggregate_metrics = {
|
933 |
-
'num_queries_tested': len(test_queries),
|
934 |
-
'avg_top_response_score': np.mean([m.get('top_score', 0) for m in metrics_history]),
|
935 |
-
'avg_diversity': np.mean([m.get('response_diversity', 0) for m in metrics_history]),
|
936 |
-
'avg_relevance': np.mean([m.get('query_response_relevance', 0) for m in metrics_history]),
|
937 |
-
'avg_length_score': np.mean([m.get('response_length_score', 0) for m in metrics_history]),
|
938 |
-
'avg_score_gap': np.mean([m.get('top_3_score_gap', 0) for m in metrics_history]),
|
939 |
-
'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics_history]),
|
940 |
-
}
|
941 |
-
|
942 |
-
logger.info("\n=== Validation Summary ===")
|
943 |
-
for metric, value in aggregate_metrics.items():
|
944 |
-
logger.info(f"{metric}: {value:.4f}")
|
945 |
-
|
946 |
-
return aggregate_metrics
|
947 |
|
948 |
def chat(
|
949 |
self,
|
950 |
query: str,
|
951 |
conversation_history: Optional[List[Tuple[str, str]]] = None,
|
952 |
quality_checker: Optional['ResponseQualityChecker'] = None,
|
953 |
-
top_k: int = 5
|
954 |
) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
|
955 |
"""
|
956 |
-
|
957 |
-
|
958 |
-
Args:
|
959 |
-
query (str): The user's input query.
|
960 |
-
conversation_history (Optional[List[Tuple[str, str]]]): List of past (user, assistant) exchanges.
|
961 |
-
quality_checker (Optional['ResponseQualityChecker']): Quality checker instance.
|
962 |
-
top_k (int): Number of top responses to retrieve.
|
963 |
-
|
964 |
-
Returns:
|
965 |
-
Tuple[str, List[Tuple[str, float]], Dict[str, Any]]: (Response, Candidates, Quality Metrics)
|
966 |
"""
|
967 |
-
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
if quality_checker is None:
|
972 |
-
return responses[0][0] if responses else "I'm sorry, I don't have an answer for that.", responses, {}
|
973 |
-
|
974 |
-
# Check quality
|
975 |
-
quality_metrics = quality_checker.check_response_quality(query, responses)
|
976 |
-
|
977 |
-
if quality_metrics.get('is_confident', False):
|
978 |
-
return responses[0][0], responses, quality_metrics
|
979 |
-
else:
|
980 |
-
uncertainty_response = (
|
981 |
-
"I apologize, but I don't feel confident providing an answer to that "
|
982 |
-
"question at the moment. Could you please rephrase or ask something else?"
|
983 |
-
)
|
984 |
-
return uncertainty_response, responses, quality_metrics
|
985 |
-
|
986 |
-
# TODO: consider removal
|
987 |
-
# def prepare_dataset(self, dialogues: List[dict], neg_samples_per_pos: int = 1, debug_samples: int = None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
988 |
-
# """Prepares the dataset for training."""
|
989 |
-
# logger.info("Preparing dataset...")
|
990 |
-
|
991 |
-
# # Extract (query, positive, negative) triples
|
992 |
-
# queries, positives, negatives = [], [], []
|
993 |
-
|
994 |
-
# for dialogue in dialogues:
|
995 |
-
# turns = dialogue.get('turns', [])
|
996 |
-
# for i in range(len(turns) - 1):
|
997 |
-
# current_turn = turns[i]
|
998 |
-
# next_turn = turns[i+1]
|
999 |
-
|
1000 |
-
# if (current_turn.get('speaker') == 'user' and
|
1001 |
-
# next_turn.get('speaker') == 'assistant' and
|
1002 |
-
# 'text' in current_turn and
|
1003 |
-
# 'text' in next_turn):
|
1004 |
-
|
1005 |
-
# query = current_turn['text'].strip()
|
1006 |
-
# positive = next_turn['text'].strip()
|
1007 |
-
|
1008 |
-
# # Generate hard negative samples
|
1009 |
-
# hard_negatives = self.hard_negative_sampling(positive, n_samples=neg_samples_per_pos)
|
1010 |
-
# for negative in hard_negatives:
|
1011 |
-
# negatives.append(negative)
|
1012 |
-
# queries.append(query)
|
1013 |
-
# positives.append(positive)
|
1014 |
-
|
1015 |
-
# logger.info(f"Prepared {len(queries)} training examples.")
|
1016 |
-
|
1017 |
-
# # Tokenize and pad sequences
|
1018 |
-
# encoded_queries = self.tokenizer(
|
1019 |
-
# queries,
|
1020 |
-
# padding='max_length',
|
1021 |
-
# truncation=True,
|
1022 |
-
# max_length=self.config.max_sequence_length,
|
1023 |
-
# return_tensors='tf'
|
1024 |
-
# )
|
1025 |
-
# encoded_positives = self.tokenizer(
|
1026 |
-
# positives,
|
1027 |
-
# padding='max_length',
|
1028 |
-
# truncation=True,
|
1029 |
-
# max_length=self.config.max_sequence_length,
|
1030 |
-
# return_tensors='tf'
|
1031 |
-
# )
|
1032 |
-
# encoded_negatives = self.tokenizer(
|
1033 |
-
# negatives,
|
1034 |
-
# padding='max_length',
|
1035 |
-
# truncation=True,
|
1036 |
-
# max_length=self.config.max_sequence_length,
|
1037 |
-
# return_tensors='tf'
|
1038 |
-
# )
|
1039 |
-
|
1040 |
-
# q_tensor = encoded_queries['input_ids']
|
1041 |
-
# p_tensor = encoded_positives['input_ids']
|
1042 |
-
# n_tensor = encoded_negatives['input_ids']
|
1043 |
-
|
1044 |
-
# logger.info(f"Tokenized and padded sequences.")
|
1045 |
-
|
1046 |
-
# return q_tensor, p_tensor, n_tensor
|
1047 |
-
|
1048 |
-
|
1049 |
-
# # TODO: consider removal
|
1050 |
-
# def hard_negative_sampling(self, positive_response, n_samples=1):
|
1051 |
-
# """Select hard negatives based on cosine similarity."""
|
1052 |
-
# try:
|
1053 |
-
# # Ensure we don't request more negatives than available
|
1054 |
-
# max_neg_samples = len(self.response_pool) - 1 # Exclude the positive response
|
1055 |
-
# n_samples = min(n_samples, max_neg_samples)
|
1056 |
-
|
1057 |
-
# if n_samples <= 0:
|
1058 |
-
# logger.error("Not enough responses to sample negatives.")
|
1059 |
-
# return []
|
1060 |
-
|
1061 |
-
# # Encode the positive response using the chatbot's encode_responses method
|
1062 |
-
# pos_emb = self.encode_responses([positive_response]).numpy()
|
1063 |
-
# faiss.normalize_L2(pos_emb)
|
1064 |
-
# #logger.info(f"Normalized positive embedding for response: {positive_response}")
|
1065 |
-
|
1066 |
-
# # Search for the top n_samples + 1 most similar responses (including the positive itself)
|
1067 |
-
# D, I = self.index.search(pos_emb, n_samples + 1)
|
1068 |
-
# #logger.info(f"FAISS search results: {I}")
|
1069 |
-
|
1070 |
-
# # Exclude the positive response itself (assuming it's indexed)
|
1071 |
-
# negatives = []
|
1072 |
-
# for i in range(n_samples):
|
1073 |
-
# idx = I[0][i + 1] # Skip the first one as it's the positive
|
1074 |
-
# if idx < len(self.response_pool):
|
1075 |
-
# negative_response = self.response_pool[idx]
|
1076 |
-
# negatives.append(negative_response)
|
1077 |
-
# logger.info(f"Selected negative: {negative_response}")
|
1078 |
-
# else:
|
1079 |
-
# logger.warning(f"Index {idx} out of range for response_pool with size {len(self.response_pool)}.")
|
1080 |
|
1081 |
-
|
1082 |
-
|
1083 |
-
|
1084 |
-
|
1085 |
-
|
1086 |
-
|
1087 |
-
|
1088 |
-
|
1089 |
-
|
1090 |
-
|
1091 |
-
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
|
1096 |
-
|
1097 |
-
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
-
|
1103 |
-
|
1104 |
-
|
1105 |
-
|
1106 |
-
|
1107 |
-
# checkpoint_dir (str): Directory to save model checkpoints.
|
1108 |
-
# callbacks (list, optional): List of Keras callbacks.
|
1109 |
-
# """
|
1110 |
-
# dataset_size = tf.shape(q_pad)[0].numpy()
|
1111 |
-
# val_size = int(dataset_size * validation_split)
|
1112 |
-
# train_size = dataset_size - val_size
|
1113 |
-
|
1114 |
-
# logger.info(f"Total samples: {dataset_size}")
|
1115 |
-
# logger.info(f"Training samples: {train_size}")
|
1116 |
-
# logger.info(f"Validation samples: {val_size}")
|
1117 |
-
|
1118 |
-
# # Calculate steps_per_epoch
|
1119 |
-
# steps_per_epoch = train_size // batch_size
|
1120 |
-
# if train_size % batch_size != 0:
|
1121 |
-
# steps_per_epoch += 1
|
1122 |
-
# total_steps = steps_per_epoch * epochs
|
1123 |
-
|
1124 |
-
# logger.info(f"Total training steps: {total_steps}")
|
1125 |
-
|
1126 |
-
# # Initialize learning rate schedule with adjusted warmup_steps
|
1127 |
-
# lr_schedule = self._get_lr_schedule(
|
1128 |
-
# total_steps=total_steps,
|
1129 |
-
# peak_lr=self.config.learning_rate,
|
1130 |
-
# warmup_steps=self.config.warmup_steps
|
1131 |
-
# )
|
1132 |
-
|
1133 |
-
# # callbacks = []
|
1134 |
-
# # if checkpoint_dir:
|
1135 |
-
# # checkpoint_dir = Path(checkpoint_dir)
|
1136 |
-
# # checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
1137 |
-
|
1138 |
-
# # # Setup checkpoint callback with correct file format
|
1139 |
-
# # checkpoint_template = str(checkpoint_dir / "model_epoch_{epoch:04d}.weights.h5")
|
1140 |
-
# # checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
|
1141 |
-
# # checkpoint_template,
|
1142 |
-
# # save_weights_only=True,
|
1143 |
-
# # save_best_only=True,
|
1144 |
-
# # monitor='val_loss',
|
1145 |
-
# # mode='min',
|
1146 |
-
# # verbose=1
|
1147 |
-
# # )
|
1148 |
-
# # callbacks.append(checkpoint_callback)
|
1149 |
-
|
1150 |
-
# # # Early stopping callback
|
1151 |
-
# # early_stopping = tf.keras.callbacks.EarlyStopping(
|
1152 |
-
# # monitor='val_loss',
|
1153 |
-
# # patience=5,
|
1154 |
-
# # restore_best_weights=True,
|
1155 |
-
# # verbose=1
|
1156 |
-
# # )
|
1157 |
-
# # callbacks.append(early_stopping)
|
1158 |
|
1159 |
-
|
1160 |
-
# # tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')
|
1161 |
-
# # callbacks.append(tensorboard_callback)
|
1162 |
-
|
1163 |
-
# # Update optimizer with the new learning rate schedule
|
1164 |
-
# self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
|
1165 |
-
|
1166 |
-
# # Split the data
|
1167 |
-
# train_q = q_pad[:train_size]
|
1168 |
-
# train_p = p_pad[:train_size]
|
1169 |
-
# train_n = n_pad[:train_size]
|
1170 |
-
|
1171 |
-
# val_q = q_pad[train_size:]
|
1172 |
-
# val_p = p_pad[train_size:]
|
1173 |
-
# val_n = n_pad[train_size:]
|
1174 |
|
1175 |
-
|
1176 |
-
|
1177 |
-
|
1178 |
-
|
1179 |
-
|
1180 |
-
|
1181 |
-
|
1182 |
-
|
1183 |
-
|
1184 |
-
|
1185 |
-
|
1186 |
-
# # Create checkpoint manager
|
1187 |
-
# checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
|
1188 |
-
# manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
|
1189 |
-
|
1190 |
-
# for epoch in range(1, epochs + 1):
|
1191 |
-
# logger.info(f"Epoch {epoch}/{epochs}")
|
1192 |
-
# epoch_loss_avg = tf.keras.metrics.Mean()
|
1193 |
|
1194 |
-
|
1195 |
-
|
1196 |
-
|
1197 |
-
|
|
|
|
|
1198 |
|
1199 |
-
|
1200 |
-
|
1201 |
-
|
1202 |
-
|
1203 |
-
|
1204 |
-
|
1205 |
-
|
1206 |
-
|
1207 |
-
|
1208 |
-
|
1209 |
-
|
1210 |
-
|
1211 |
-
|
1212 |
-
|
1213 |
-
|
1214 |
-
|
1215 |
-
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
-
|
1221 |
-
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
# train_loss = epoch_loss_avg.result().numpy()
|
1226 |
-
# val_loss = val_loss_avg.result().numpy()
|
1227 |
-
# logger.info(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
|
1228 |
|
1229 |
-
|
1230 |
-
|
1231 |
|
1232 |
-
|
1233 |
-
|
1234 |
-
|
1235 |
-
|
1236 |
-
|
1237 |
-
|
1238 |
-
|
1239 |
-
|
1240 |
-
|
1241 |
-
|
1242 |
-
|
1243 |
-
|
1244 |
-
|
1245 |
-
|
1246 |
-
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
-
|
1252 |
-
|
1253 |
-
|
1254 |
-
|
1255 |
-
|
1256 |
-
|
1257 |
-
|
1258 |
-
|
1259 |
-
|
1260 |
-
|
1261 |
-
|
1262 |
-
|
1263 |
-
|
1264 |
-
|
1265 |
-
|
1266 |
-
|
1267 |
-
|
1268 |
-
|
1269 |
-
|
1270 |
-
|
1271 |
-
|
1272 |
-
|
1273 |
-
|
1274 |
-
|
1275 |
-
|
1276 |
-
|
1277 |
-
|
1278 |
-
|
1279 |
-
|
1280 |
-
|
1281 |
-
|
1282 |
-
|
1283 |
-
|
1284 |
-
|
1285 |
-
|
1286 |
-
|
1287 |
-
|
1288 |
-
|
1289 |
-
|
1290 |
-
|
1291 |
-
|
 import tensorflow as tf
 import numpy as np
 from typing import List, Tuple, Dict, Optional, Union, Any
+import math
 from dataclasses import dataclass
 import json
 from tqdm import tqdm
 from pathlib import Path
+import datetime
 import faiss
 from response_quality_checker import ResponseQualityChecker
+from cross_encoder_reranker import CrossEncoderReranker
+from conversation_summarizer import DeviceAwareModel, Summarizer
+from logger_config import config_logger
+logger = config_logger(__name__)
 
 @dataclass
 class ChatbotConfig:
     """Configuration for the RetrievalChatbot."""
     vocab_size: int = 30526 # DistilBERT vocab size
+    max_context_token_limit: int = 512
+    embedding_dim: int = 512 # Match DistilBERT's dimension
     encoder_units: int = 256
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
65 |
# Freeze pretrained weights if specified
|
66 |
self.pretrained.distilbert.embeddings.trainable = False
|
67 |
for i, layer_module in enumerate(self.pretrained.distilbert.transformer.layer):
|
68 |
+
if i < 1: # freeze first layer
|
69 |
layer_module.trainable = False
|
70 |
else:
|
71 |
layer_module.trainable = True
|
72 |
|
73 |
# Pooling layer (Global Average Pooling)
|
74 |
self.pooler = tf.keras.layers.GlobalAveragePooling1D()
|
75 |
+
|
76 |
+
# Projection layer
|
77 |
+
self.projection = tf.keras.layers.Dense(
|
78 |
+
config.embedding_dim,
|
79 |
+
activation='tanh',
|
80 |
+
name="projection"
|
81 |
+
)
|
82 |
|
83 |
# Dropout and normalization
|
84 |
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
|
|
92 |
pretrained_outputs = self.pretrained(inputs, training=training)
|
93 |
x = pretrained_outputs.last_hidden_state # Shape: [batch_size, seq_len, embedding_dim]
|
94 |
|
95 |
+
# Apply pooling, projection, dropout, and normalization
|
96 |
+
x = self.pooler(x) # Shape: [batch_size, 768]
|
97 |
+
x = self.projection(x) # Shape: [batch_size, 512]
|
98 |
+
x = self.dropout(x, training=training) # Apply dropout
|
99 |
+
x = self.normalize(x) # Shape: [batch_size, 512]
|
|
|
|
|
|
|
100 |
|
101 |
return x
|
102 |
|
|
|
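
For orientation, the encoder call above boils down to the following shape flow (an illustrative summary only; the normalize layer is assumed to be an L2 normalization, which is consistent with the inner-product FAISS index used below):

    # Shape walk-through of the encoder forward pass (illustrative):
    #   input_ids                 [batch, 512]        tokenized text, padded to max_context_token_limit
    #   DistilBERT last_hidden    [batch, 512, 768]   embeddings + first transformer layer frozen
    #   GlobalAveragePooling1D    [batch, 768]
    #   Dense(512, tanh)          [batch, 512]        projection
    #   Dropout + normalize       [batch, 512]        unit-length embedding for similarity search
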
109 |
"name": self.name
|
110 |
})
|
111 |
return config
|
112 |
|
113 |
+
class RetrievalChatbot(DeviceAwareModel):
|
114 |
"""Retrieval-based chatbot using pretrained embeddings and FAISS for similarity search."""
|
115 |
+
def __init__(self, config: ChatbotConfig, dialogues: List[dict] = [], device: str = None, strategy=None, reranker: Optional[CrossEncoderReranker] = None, summarizer: Optional[Summarizer] = None):
|
116 |
self.config = config
|
117 |
+
self.strategy = strategy
|
118 |
+
self.setup_device(device)
|
119 |
+
|
120 |
+
if reranker is None:
|
121 |
+
logger.info("Creating default CrossEncoderReranker...")
|
122 |
+
reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
123 |
+
self.reranker = reranker
|
124 |
+
|
125 |
+
if summarizer is None:
|
126 |
+
logger.info("Creating default Summarizer...")
|
127 |
+
summarizer = Summarizer(device=self.device)
|
128 |
+
self.summarizer = summarizer
|
129 |
+
|
130 |
+
# Configure XLA optimization if on GPU/TPU
|
131 |
+
if self.device in ["GPU", "TPU"]:
|
132 |
+
tf.config.optimizer.set_jit(True)
|
133 |
+
logger.info(f"XLA compilation enabled for {self.device}")
|
134 |
+
|
135 |
+
# Configure mixed precision for GPU/TPU
|
136 |
+
if self.device != "CPU":
|
137 |
+
policy = tf.keras.mixed_precision.Policy('mixed_float16')
|
138 |
+
tf.keras.mixed_precision.set_global_policy(policy)
|
139 |
+
logger.info("Mixed precision training enabled (float16)")
|
140 |
|
141 |
# Special tokens
|
142 |
self.special_tokens = {
|
|
|
152 |
{'additional_special_tokens': list(self.special_tokens.values())}
|
153 |
)
|
154 |
|
155 |
+
# Build encoders within device strategy scope
|
156 |
+
if self.strategy:
|
157 |
+
with self.strategy.scope():
|
158 |
+
self._build_models()
|
159 |
+
else:
|
160 |
+
self._build_models()
|
161 |
|
162 |
# Initialize FAISS index
|
163 |
self._initialize_faiss()
|
|
|
188 |
self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
|
189 |
logger.info(f"Token embeddings resized to: {new_vocab_size}")
|
190 |
|
191 |
+
# Debug embeddings attributes
|
192 |
logger.info("Inspecting embeddings attributes:")
|
193 |
for attr in dir(self.encoder.pretrained.distilbert.embeddings):
|
194 |
if not attr.startswith('_'):
|
195 |
logger.info(f" {attr}")
|
196 |
|
197 |
+
# Try different ways to get embedding dimension
|
198 |
+
try:
|
199 |
+
# First try: from config
|
200 |
+
embedding_dim = self.encoder.pretrained.config.dim
|
201 |
+
logger.info("Got embedding dim from config")
|
202 |
+
except AttributeError:
|
203 |
+
try:
|
204 |
+
# Second try: from word embeddings
|
205 |
+
embedding_dim = self.encoder.pretrained.distilbert.embeddings.word_embeddings.embedding_dim
|
206 |
+
logger.info("Got embedding dim from word embeddings")
|
207 |
+
except AttributeError:
|
208 |
+
try:
|
209 |
+
# Third try: from embeddings module
|
210 |
+
embedding_dim = self.encoder.pretrained.distilbert.embeddings.embedding_dim
|
211 |
+
logger.info("Got embedding dim from embeddings module")
|
212 |
+
except AttributeError:
|
213 |
+
# Fallback to config value
|
214 |
+
embedding_dim = self.config.embedding_dim
|
215 |
+
logger.info("Using config embedding dim")
|
216 |
+
|
217 |
+
vocab_size = len(self.tokenizer)
|
218 |
+
|
219 |
logger.info(f"Encoder Embedding Dimension: {embedding_dim}")
|
220 |
logger.info(f"Encoder Embedding Vocabulary Size: {vocab_size}")
|
221 |
+
if vocab_size >= embedding_dim:
|
222 |
+
logger.info("Encoder model built and embeddings resized successfully.")
|
223 |
+
else:
|
224 |
+
logger.error("Vocabulary size is less than embedding dimension.")
|
225 |
+
raise ValueError("Vocabulary size is less than embedding dimension.")
|
226 |
|
227 |
def _initialize_faiss(self):
|
228 |
"""Initialize FAISS index based on available resources."""
|
|
|
244 |
self.index = faiss.IndexFlatIP(self.config.embedding_dim)
|
245 |
logger.info("FAISS index initialized.")
|
246 |
|
247 |
+
def verify_faiss_index(self):
|
248 |
"""Verify that FAISS index matches the response pool."""
|
249 |
+
indexed_size = self.index.ntotal
|
250 |
+
pool_size = len(self.response_pool)
|
251 |
logger.info(f"FAISS index size: {indexed_size}")
|
252 |
logger.info(f"Response pool size: {pool_size}")
|
253 |
if indexed_size != pool_size:
|
|
|
273 |
logger.info(f"Found {len(unique_responses)} unique responses.")
|
274 |
|
275 |
# Encode responses
|
276 |
+
logger.info("Encoding unique responses")
|
277 |
response_embeddings = self.encode_responses(unique_responses)
|
278 |
response_embeddings = response_embeddings.numpy()
|
279 |
|
280 |
# Ensure float32
|
281 |
if response_embeddings.dtype != np.float32:
|
|
|
282 |
response_embeddings = response_embeddings.astype('float32')
|
283 |
|
284 |
# Ensure the array is contiguous in memory
|
|
|
317 |
Returns:
|
318 |
tf.Tensor: Tensor of shape (N, emb_dim) with all response embeddings.
|
319 |
"""
|
320 |
+
# Accumulate embeddings in a list and concatenate at the end
|
|
|
|
|
321 |
all_embeddings = []
|
|
|
|
|
|
|
|
|
322 |
|
323 |
# Process the responses in chunks of 'batch_size'
|
324 |
for start_idx in range(0, len(responses), batch_size):
|
|
|
330 |
batch_texts,
|
331 |
padding='max_length',
|
332 |
truncation=True,
|
333 |
+
max_length=self.config.max_context_token_limit,
|
334 |
return_tensors='tf',
|
335 |
)
|
336 |
|
|
|
345 |
# Collect
|
346 |
all_embeddings.append(embeddings_batch)
|
347 |
|
|
|
|
|
|
|
|
|
|
|
348 |
# Concatenate all batch embeddings along axis=0
|
349 |
if len(all_embeddings) == 1:
|
350 |
# Only one batch
|
|
|
353 |
# Multiple batches, concatenate
|
354 |
final_embeddings = tf.concat(all_embeddings, axis=0)
|
355 |
|
|
|
|
|
|
|
|
|
356 |
return final_embeddings
|
357 |
|
358 |
def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
|
|
|
373 |
[query],
|
374 |
padding='max_length',
|
375 |
truncation=True,
|
376 |
+
max_length=self.config.max_context_token_limit,
|
377 |
return_tensors='tf'
|
378 |
)
|
379 |
input_ids = encodings['input_ids']
|
|
|
381 |
# Verify token IDs
|
382 |
max_id = tf.reduce_max(input_ids).numpy()
|
383 |
new_vocab_size = len(self.tokenizer)
|
|
|
384 |
|
385 |
if max_id >= new_vocab_size:
|
386 |
logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
|
|
|
388 |
|
389 |
# Get embeddings from the shared encoder
|
390 |
return self.encoder(input_ids, training=False)
|
391 |
+
|
392 |
+
def retrieve_responses_cross_encoder(
|
393 |
+
self,
|
394 |
+
query: str,
|
395 |
+
top_k: int,
|
396 |
+
reranker: Optional[CrossEncoderReranker] = None,
|
397 |
+
summarizer: Optional[Summarizer] = None,
|
398 |
+
summarize_threshold: int = 512 # Summarize queries longer than ~512 whitespace-split words
|
399 |
+
) -> List[Tuple[str, float]]:
|
400 |
+
"""
|
401 |
+
Retrieve top-k from FAISS, then re-rank them with a cross-encoder.
|
402 |
+
Optionally summarize the user query if it's too long.
|
403 |
+
"""
|
404 |
+
if reranker is None:
|
405 |
+
reranker = self.reranker
|
406 |
+
if summarizer is None:
|
407 |
+
summarizer = self.summarizer
|
408 |
+
|
409 |
+
# Optional summarization
|
410 |
+
if summarizer and len(query.split()) > summarize_threshold:
|
411 |
+
logger.info(f"Query is long. Summarizing before cross-encoder. Original length: {len(query.split())}")
|
412 |
+
query = summarizer.summarize_text(query)
|
413 |
+
logger.info(f"Summarized query: {query}")
|
414 |
+
|
415 |
+
# 2) Dense retrieval
|
416 |
+
dense_topk = self.retrieve_responses_faiss(query, top_k=top_k) # [(resp, dense_score), ...]
|
417 |
+
|
418 |
+
if not dense_topk:
|
419 |
+
return []
|
420 |
+
|
421 |
+
# 3) Cross-encoder rerank
|
422 |
+
candidate_texts = [pair[0] for pair in dense_topk]
|
423 |
+
cross_scores = reranker.rerank(query, candidate_texts, max_length=256)
|
424 |
+
|
425 |
+
# Combine
|
426 |
+
combined = [(text, score) for (text, _), score in zip(dense_topk, cross_scores)]
|
427 |
+
# Sort descending by cross-encoder score
|
428 |
+
combined.sort(key=lambda x: x[1], reverse=True)
|
429 |
+
|
430 |
+
return combined
|
431 |
|
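
As a rough usage sketch (not part of the commit), the retrieve-then-rerank path above could be exercised like this, assuming a chatbot whose FAISS index has already been built over the response pool:

    config = ChatbotConfig()
    bot = RetrievalChatbot(config)              # default CrossEncoderReranker + Summarizer are created
    results = bot.retrieve_responses_cross_encoder(
        query="Can you book a table for two at 7pm?",
        top_k=10,
    )
    for response, score in results[:3]:
        print(f"{score:.3f}  {response}")       # cross-encoder scores, highest first
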
432 |
def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
|
433 |
"""Retrieve top-k responses using FAISS."""
|
|
|
485 |
load_dir / "shared_encoder",
|
486 |
config=config
|
487 |
)
|
488 |
|
489 |
# Load tokenizer
|
490 |
chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")
|
|
|
523 |
def prepare_dataset(
|
524 |
self,
|
525 |
dialogues: List[dict],
|
526 |
+
neg_samples: int = 1,
|
527 |
debug_samples: int = None
|
528 |
) -> Tuple[tf.Tensor, tf.Tensor]:
|
529 |
"""
|
530 |
+
Prepares dataset for multiple-negatives ranking,
|
531 |
+
but also appends 'hard negative' pairs for each query.
|
532 |
+
|
533 |
+
We'll generate:
|
534 |
+
- (query, positive) as usual
|
535 |
+
- (query, negative) for each query, using FAISS top-1 approx. negative.
|
536 |
+
Then, in-batch training sees them as 'two different positives'
|
537 |
+
for the same query, forcing the model to discriminate them.
|
538 |
"""
|
539 |
+
|
540 |
+
logger.info("Preparing in-batch dataset with hard negatives...")
|
541 |
|
542 |
queries, positives = [], []
|
543 |
|
544 |
+
# Assemble (q, p)
|
545 |
for dialogue in dialogues:
|
546 |
turns = dialogue.get('turns', [])
|
547 |
for i in range(len(turns) - 1):
|
548 |
current_turn = turns[i]
|
549 |
next_turn = turns[i+1]
|
550 |
|
551 |
+
if (current_turn.get('speaker') == 'user'
|
552 |
+
and next_turn.get('speaker') == 'assistant'
|
553 |
+
and 'text' in current_turn
|
554 |
+
and 'text' in next_turn):
|
555 |
|
556 |
+
query_text = current_turn['text'].strip()
|
557 |
+
pos_text = next_turn['text'].strip()
|
558 |
|
559 |
+
queries.append(query_text)
|
560 |
+
positives.append(pos_text)
|
561 |
|
562 |
+
# Debug slicing
|
563 |
if debug_samples is not None:
|
564 |
queries = queries[:debug_samples]
|
565 |
positives = positives[:debug_samples]
|
566 |
logger.info(f"Debug mode: limited to {debug_samples} pairs.")
|
567 |
|
568 |
+
logger.info(f"Prepared {len(queries)} (query, positive) pairs initially.")
|
569 |
+
|
570 |
+
# Find a hard negative from FAISS for each (q, p)
|
571 |
+
# Create a second 'positive' row => (q, negative). In-batch, it's seen as a different 'positive' row, but is a hard negative.
|
572 |
+
augmented_queries = []
|
573 |
+
augmented_positives = []
|
574 |
+
|
575 |
+
for q_text, p_text in zip(queries, positives):
|
576 |
+
neg_texts = self._find_hard_negative(q_text, p_text, top_k=5, neg_samples=neg_samples)
|
577 |
+
for neg_text in neg_texts:
|
578 |
+
augmented_queries.append(q_text)
|
579 |
+
augmented_positives.append(neg_text)
|
580 |
+
|
581 |
+
logger.info(f"Found hard negatives for {len(augmented_queries)} queries.")
|
582 |
|
583 |
+
# Combine them into a single big list -> Original pairs: (q, p) & Hard neg pairs: (q, n)
|
584 |
+
final_queries = queries + augmented_queries
|
585 |
+
final_positives = positives + augmented_positives
|
586 |
+
logger.info(f"Total dataset size after adding hard neg: {len(final_queries)}")
|
587 |
+
|
588 |
+
# Tokenize
|
589 |
encoded_queries = self.tokenizer(
|
590 |
+
final_queries,
|
591 |
padding='max_length',
|
592 |
truncation=True,
|
593 |
+
max_length=self.config.max_context_token_limit,
|
594 |
return_tensors='tf'
|
595 |
)
|
|
|
596 |
encoded_positives = self.tokenizer(
|
597 |
+
final_positives,
|
598 |
padding='max_length',
|
599 |
truncation=True,
|
600 |
+
max_length=self.config.max_context_token_limit,
|
601 |
return_tensors='tf'
|
602 |
)
|
603 |
|
604 |
q_tensor = encoded_queries['input_ids']
|
605 |
p_tensor = encoded_positives['input_ids']
|
606 |
|
607 |
+
logger.info("Tokenized and padded sequences for in-batch training + hard negatives.")
|
608 |
return q_tensor, p_tensor
|
609 |
|
610 |
+
def _find_hard_negative(
|
611 |
+
self,
|
612 |
+
query_text: str,
|
613 |
+
positive_text: str,
|
614 |
+
top_k: int = 5,
|
615 |
+
neg_samples: int = 1
|
616 |
+
) -> List[str]:
|
617 |
+
"""
|
618 |
+
Return up to `neg_samples` unique negatives from top_k FAISS results,
|
619 |
+
excluding the known positive_text.
|
620 |
+
"""
|
621 |
+
# Encode the query to get the embedding
|
622 |
+
query_emb = self.encode_query(query_text)
|
623 |
+
q_emb_np = query_emb.numpy().astype('float32')
|
624 |
+
|
625 |
+
# Normalize for cosine similarity
|
626 |
+
faiss.normalize_L2(q_emb_np)
|
627 |
+
|
628 |
+
# Search in FAISS
|
629 |
+
distances, indices = self.index.search(q_emb_np, top_k)
|
630 |
+
|
631 |
+
# Exclude the actual positive from these results
|
632 |
+
hard_negatives = []
|
633 |
+
for idx in indices[0]:
|
634 |
+
if idx < len(self.response_pool):
|
635 |
+
candidate = self.response_pool[idx].strip()
|
636 |
+
if candidate != positive_text.strip():
|
637 |
+
hard_negatives.append(candidate)
|
638 |
+
if len(hard_negatives) == neg_samples:
|
639 |
+
break
|
640 |
+
|
641 |
+
return hard_negatives
|
642 |
+
|
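
To make the shape of the prepared data concrete: a single user/assistant exchange yields one (query, positive) row plus, when a hard negative can be found in the FAISS index, one (query, hard_negative) row, so the returned tensors roughly double in length. A toy call might look like this (sketch only; the texts are made up and the index must already be populated):

    dialogues = [{
        "turns": [
            {"speaker": "user", "text": "I need a table for four tonight."},
            {"speaker": "assistant", "text": "Sure, what time would you like the reservation?"},
        ]
    }]
    q_tensor, p_tensor = bot.prepare_dataset(dialogues, neg_samples=1)
    # q_tensor / p_tensor: [num_pairs, 512] token IDs; num_pairs = original pairs + hard-negative rows
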
643 |
def train(
|
644 |
self,
|
645 |
q_pad: tf.Tensor,
|
646 |
p_pad: tf.Tensor,
|
647 |
+
epochs: int = 20,
|
648 |
+
batch_size: int = 16,
|
649 |
+
validation_split: float = 0.2,
|
650 |
+
checkpoint_dir: str = "checkpoints/",
|
651 |
use_lr_schedule: bool = True,
|
652 |
peak_lr: float = 2e-5,
|
653 |
warmup_steps_ratio: float = 0.1,
|
654 |
early_stopping_patience: int = 3,
|
655 |
+
min_delta: float = 1e-4,
|
656 |
+
accum_steps: int = 2 # Gradient accumulation steps
|
657 |
):
|
658 |
dataset_size = tf.shape(q_pad)[0].numpy()
|
659 |
val_size = int(dataset_size * validation_split)
|
|
|
689 |
val_q = q_pad[train_size:]
|
690 |
val_p = p_pad[train_size:]
|
691 |
|
692 |
+
train_dataset = (tf.data.Dataset.from_tensor_slices((train_q, train_p))
|
693 |
+
.shuffle(4096)
|
694 |
+
.batch(batch_size)
|
695 |
+
.prefetch(tf.data.AUTOTUNE))
|
696 |
|
697 |
+
val_dataset = (tf.data.Dataset.from_tensor_slices((val_q, val_p))
|
698 |
+
.batch(batch_size)
|
699 |
+
.prefetch(tf.data.AUTOTUNE))
|
700 |
|
701 |
# 3) Checkpoint + manager
|
702 |
checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
|
703 |
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
|
704 |
|
705 |
# 4) TensorBoard setup
|
706 |
log_dir = Path(checkpoint_dir) / "tensorboard_logs"
|
707 |
log_dir.mkdir(parents=True, exist_ok=True)
|
708 |
|
|
|
722 |
logger.info("Beginning training loop...")
|
723 |
global_step = 0
|
724 |
|
725 |
+
# Prepare zero-initialized accumulators for your trainable variables
|
726 |
+
# We'll accumulate gradients across mini-batches, then apply them every accum_steps.
|
727 |
+
train_vars = self.encoder.pretrained.trainable_variables
|
728 |
+
accum_grads = [tf.zeros_like(var, dtype=tf.float32) for var in train_vars]
|
729 |
+
|
730 |
from tqdm import tqdm
|
731 |
for epoch in range(1, epochs + 1):
|
732 |
logger.info(f"\n=== Epoch {epoch}/{epochs} ===")
|
733 |
epoch_loss_avg = tf.keras.metrics.Mean()
|
734 |
|
735 |
+
step_in_epoch = 0
|
736 |
with tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}") as pbar:
|
737 |
for (q_batch, p_batch) in train_dataset:
|
738 |
+
step_in_epoch += 1
|
739 |
global_step += 1
|
740 |
|
741 |
+
with tf.GradientTape() as tape:
|
742 |
+
q_enc = self.encoder(q_batch, training=True)
|
743 |
+
p_enc = self.encoder(p_batch, training=True)
|
744 |
+
|
745 |
+
sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
|
746 |
+
bsz = tf.shape(q_enc)[0]
|
747 |
+
labels = tf.range(bsz, dtype=tf.int32)
|
748 |
+
loss_value = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
749 |
+
labels=labels, logits=sim_matrix
|
750 |
+
)
|
751 |
+
loss_value = tf.reduce_mean(loss_value)
|
752 |
+
|
753 |
+
gradients = tape.gradient(loss_value, train_vars)
|
754 |
+
|
755 |
+
# -- Accumulate gradients --
|
756 |
+
for i, grad in enumerate(gradients):
|
757 |
+
if grad is not None:
|
758 |
+
accum_grads[i] += tf.cast(grad, tf.float32)
|
759 |
+
|
760 |
+
epoch_loss_avg(loss_value)
|
761 |
+
|
762 |
+
# -- Apply gradients every 'accum_steps' mini-batches --
|
763 |
+
if (step_in_epoch % accum_steps) == 0:
|
764 |
+
# Scale by 1/accum_steps so that each accumulation cycle
|
765 |
+
# is effectively the same as one “normal” update
|
766 |
+
for i in range(len(accum_grads)):
|
767 |
+
accum_grads[i] /= accum_steps
|
768 |
+
|
769 |
+
self.optimizer.apply_gradients(
|
770 |
+
[(accum_grads[i], train_vars[i]) for i in range(len(accum_grads))]
|
771 |
+
)
|
772 |
+
# Reset the accumulator
|
773 |
+
accum_grads = [tf.zeros_like(var, dtype=tf.float32) for var in train_vars]
|
774 |
+
|
775 |
+
# Logging / tqdm updates
|
776 |
if use_lr_schedule:
|
777 |
+
# measure current LR
|
778 |
lr = self.optimizer.learning_rate
|
779 |
if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
|
|
|
780 |
current_step = tf.cast(self.optimizer.iterations, tf.float32)
|
|
|
781 |
current_lr = lr(current_step)
|
782 |
else:
|
|
|
783 |
current_lr = lr
|
|
|
784 |
current_lr_value = float(current_lr.numpy())
|
785 |
else:
|
|
|
786 |
current_lr_value = float(self.optimizer.learning_rate.numpy())
|
787 |
|
|
|
788 |
pbar.update(1)
|
789 |
pbar.set_postfix({
|
790 |
+
"loss": f"{loss_value.numpy():.4f}",
|
791 |
"lr": f"{current_lr_value:.2e}"
|
792 |
})
|
793 |
|
794 |
+
# TensorBoard logging omitted for brevity...
|
795 |
+
|
796 |
+
# -- Handle leftover partial accumulation at epoch end --
|
797 |
+
leftover = (step_in_epoch % accum_steps)
|
798 |
+
if leftover != 0:
|
799 |
+
logger.info(f"Applying leftover accum_grads for partial batch group (size={leftover}).")
|
800 |
+
# If you want each leftover batch to contribute proportionally:
|
801 |
+
# multiply by leftover/accum_steps (this ensures leftover
|
802 |
+
# steps have the same "average" effect as a full accumulation cycle)
|
803 |
+
for i in range(len(accum_grads)):
|
804 |
+
accum_grads[i] *= float(leftover) / float(accum_steps)
|
805 |
+
|
806 |
+
self.optimizer.apply_gradients(
|
807 |
+
[(accum_grads[i], train_vars[i]) for i in range(len(accum_grads))]
|
808 |
+
)
|
809 |
+
accum_grads = [tf.zeros_like(var, dtype=tf.float32) for var in train_vars]
|
810 |
|
811 |
# Validation
|
812 |
val_loss_avg = tf.keras.metrics.Mean()
|
|
|
853 |
|
854 |
logger.info("In-batch training completed!")
|
855 |
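
The loss inside the loop above treats the i-th positive as the only correct match for the i-th query, with every other row in the batch acting as a negative (including the injected hard-negative rows). Stripped of mixed precision and gradient accumulation, that step is essentially:

    def in_batch_contrastive_loss(q_enc: tf.Tensor, p_enc: tf.Tensor) -> tf.Tensor:
        # q_enc, p_enc: [batch, dim] embeddings from the shared encoder.
        sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)   # [batch, batch] similarity logits
        labels = tf.range(tf.shape(q_enc)[0], dtype=tf.int32)    # diagonal entries are the true pairs
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=sim_matrix)
        return tf.reduce_mean(loss)
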
856 |
def _get_lr_schedule(
|
857 |
self,
|
858 |
total_steps: int,
|
|
|
898 |
decay_factor = (step - self.warmup_steps) / decay_steps
|
899 |
decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0) # Clip to [0,1]
|
900 |
|
901 |
+
cosine_decay = 0.5 * (1.0 + tf.cos(tf.constant(math.pi) * decay_factor))
|
902 |
decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
|
903 |
|
904 |
# Choose between warmup and decay
|
|
|
924 |
normalized_emb1 = emb1 / np.linalg.norm(emb1, axis=1, keepdims=True)
|
925 |
normalized_emb2 = emb2 / np.linalg.norm(emb2, axis=1, keepdims=True)
|
926 |
return np.dot(normalized_emb1, normalized_emb2.T)
|
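
For reference, the warmup-plus-cosine behaviour that _get_lr_schedule wraps in a LearningRateSchedule can be written as a plain function. This is a sketch under the same peak_lr / warmup_steps parameters; the linear warmup phase and the min_lr default of 0.0 are assumptions, only the cosine decay branch appears verbatim above:

    def warmup_cosine_lr(step: int, total_steps: int, warmup_steps: int,
                         peak_lr: float = 2e-5, min_lr: float = 0.0) -> float:
        if step < warmup_steps:
            return peak_lr * step / max(1, warmup_steps)                # assumed linear warmup to peak_lr
        decay_steps = max(1, total_steps - warmup_steps)
        decay_factor = min(max((step - warmup_steps) / decay_steps, 0.0), 1.0)
        cosine_decay = 0.5 * (1.0 + math.cos(math.pi * decay_factor))   # 1 -> 0 over the decay phase
        return min_lr + (peak_lr - min_lr) * cosine_decay
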
927 |
|
928 |
def chat(
|
929 |
self,
|
930 |
query: str,
|
931 |
conversation_history: Optional[List[Tuple[str, str]]] = None,
|
932 |
quality_checker: Optional['ResponseQualityChecker'] = None,
|
933 |
+
top_k: int = 5,
|
934 |
) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
|
935 |
"""
|
936 |
+
Chat entry point: retrieves candidate responses and re-ranks them with the
|
937 |
+
cross-encoder (self.reranker) before returning the top answer.
|
938 |
"""
|
939 |
+
@self.run_on_device
|
940 |
+
def get_response(self_arg, query_arg): # Add parameters that match decorator's expectations
|
941 |
+
# 1) Build conversation context string
|
942 |
+
conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)
|
943 |
|
944 |
+
# 2) Retrieve + cross-encoder re-rank
|
945 |
+
results = self_arg.retrieve_responses_cross_encoder(
|
946 |
+
query=conversation_str,
|
947 |
+
top_k=top_k,
|
948 |
+
reranker=self_arg.reranker,
|
949 |
+
summarizer=self_arg.summarizer,
|
950 |
+
summarize_threshold=512
|
951 |
+
)
|
952 |
+
|
953 |
+
# 3) Handle empty or confidence
|
954 |
+
if not results:
|
955 |
+
return (
|
956 |
+
"I'm sorry, but I couldn't find a relevant response.",
|
957 |
+
[],
|
958 |
+
{}
|
959 |
+
)
|
960 |
+
|
961 |
+
if quality_checker:
|
962 |
+
metrics = quality_checker.check_response_quality(query_arg, results)
|
963 |
+
if not metrics.get('is_confident', False):
|
964 |
+
return (
|
965 |
+
"I need more information to provide a good answer. Could you please clarify?",
|
966 |
+
results,
|
967 |
+
metrics
|
968 |
+
)
|
969 |
+
return results[0][0], results, metrics
|
970 |
|
971 |
+
return results[0][0], results, {}
|
972 |
|
973 |
+
return get_response(self, query)
|
974 |
+
|
975 |
+
def _build_conversation_context(
|
976 |
+
self,
|
977 |
+
query: str,
|
978 |
+
conversation_history: Optional[List[Tuple[str, str]]]
|
979 |
+
) -> str:
|
980 |
+
"""Build conversation context with better memory management."""
|
981 |
+
if not conversation_history:
|
982 |
+
return f"{self.special_tokens['user']} {query}"
|
983 |
|
984 |
+
conversation_parts = []
|
985 |
+
for user_txt, assistant_txt in conversation_history:
|
986 |
+
conversation_parts.extend([
|
987 |
+
f"{self.special_tokens['user']} {user_txt}",
|
988 |
+
f"{self.special_tokens['assistant']} {assistant_txt}"
|
989 |
+
])
|
990 |
|
991 |
+
conversation_parts.append(f"{self.special_tokens['user']} {query}")
|
992 |
+
return "\n".join(conversation_parts)
|
993 |
+
|
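
A minimal interactive use of the chat path above (a sketch; `bot` is a trained RetrievalChatbot with a populated index, and `quality_checker` is a ResponseQualityChecker instance whose constructor is not shown in this diff):

    history = [("I need a ride downtown.", "Sure, where should the driver pick you up?")]
    answer, candidates, metrics = bot.chat(
        query="How long will it take to get there?",
        conversation_history=history,
        quality_checker=quality_checker,
        top_k=5,
    )
    print(answer)   # top-ranked response, or a clarification prompt if confidence is low
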
994 |
+
# def prepare_dataset(
|
995 |
+
# self,
|
996 |
+
# dialogues: List[dict],
|
997 |
+
# debug_samples: int = None
|
998 |
+
# ) -> Tuple[tf.Tensor, tf.Tensor]:
|
999 |
+
# """
|
1000 |
+
# Prepares dataset for in-batch negatives:
|
1001 |
+
# Only returns (query, positive) pairs.
|
1002 |
+
# """
|
1003 |
+
# logger.info("Preparing in-batch dataset...")
|
1004 |
+
|
1005 |
+
# queries, positives = [], []
|
1006 |
+
|
1007 |
+
# for dialogue in dialogues:
|
1008 |
+
# turns = dialogue.get('turns', [])
|
1009 |
+
# for i in range(len(turns) - 1):
|
1010 |
+
# current_turn = turns[i]
|
1011 |
+
# next_turn = turns[i+1]
|
1012 |
+
|
1013 |
+
# if (current_turn.get('speaker') == 'user' and
|
1014 |
+
# next_turn.get('speaker') == 'assistant' and
|
1015 |
+
# 'text' in current_turn and
|
1016 |
+
# 'text' in next_turn):
|
1017 |
|
1018 |
+
# query = current_turn['text'].strip()
|
1019 |
+
# positive = next_turn['text'].strip()
|
1020 |
|
1021 |
+
# queries.append(query)
|
1022 |
+
# positives.append(positive)
|
1023 |
+
|
1024 |
+
# # Optional debug slicing
|
1025 |
+
# if debug_samples is not None:
|
1026 |
+
# queries = queries[:debug_samples]
|
1027 |
+
# positives = positives[:debug_samples]
|
1028 |
+
# logger.info(f"Debug mode: limited to {debug_samples} pairs.")
|
1029 |
+
|
1030 |
+
# logger.info(f"Prepared {len(queries)} (query, positive) pairs.")
|
1031 |
+
|
1032 |
+
# # Tokenize queries
|
1033 |
+
# encoded_queries = self.tokenizer(
|
1034 |
+
# queries,
|
1035 |
+
# padding='max_length',
|
1036 |
+
# truncation=True,
|
1037 |
+
# max_length=self.config.max_sequence_length,
|
1038 |
+
# return_tensors='tf'
|
1039 |
+
# )
|
1040 |
+
# # Tokenize positives
|
1041 |
+
# encoded_positives = self.tokenizer(
|
1042 |
+
# positives,
|
1043 |
+
# padding='max_length',
|
1044 |
+
# truncation=True,
|
1045 |
+
# max_length=self.config.max_sequence_length,
|
1046 |
+
# return_tensors='tf'
|
1047 |
+
# )
|
1048 |
+
|
1049 |
+
# q_tensor = encoded_queries['input_ids']
|
1050 |
+
# p_tensor = encoded_positives['input_ids']
|
1051 |
+
|
1052 |
+
# logger.info("Tokenized and padded sequences for in-batch training.")
|
1053 |
+
# return q_tensor, p_tensor
|
1054 |
+
|
1055 |
+
# def train(
|
1056 |
+
# self,
|
1057 |
+
# q_pad: tf.Tensor,
|
1058 |
+
# p_pad: tf.Tensor,
|
1059 |
+
# epochs: int = 20,
|
1060 |
+
# batch_size: int = 16,
|
1061 |
+
# validation_split: float = 0.2,
|
1062 |
+
# checkpoint_dir: str = "checkpoints/",
|
1063 |
+
# use_lr_schedule: bool = True,
|
1064 |
+
# peak_lr: float = 2e-5,
|
1065 |
+
# warmup_steps_ratio: float = 0.1,
|
1066 |
+
# early_stopping_patience: int = 3,
|
1067 |
+
# min_delta: float = 1e-4
|
1068 |
+
# ):
|
1069 |
+
# dataset_size = tf.shape(q_pad)[0].numpy()
|
1070 |
+
# val_size = int(dataset_size * validation_split)
|
1071 |
+
# train_size = dataset_size - val_size
|
1072 |
+
|
1073 |
+
# logger.info(f"Total samples: {dataset_size}")
|
1074 |
+
# logger.info(f"Training samples: {train_size}")
|
1075 |
+
# logger.info(f"Validation samples: {val_size}")
|
1076 |
+
|
1077 |
+
# steps_per_epoch = train_size // batch_size
|
1078 |
+
# if train_size % batch_size != 0:
|
1079 |
+
# steps_per_epoch += 1
|
1080 |
+
# total_steps = steps_per_epoch * epochs
|
1081 |
+
# logger.info(f"Total training steps (approx): {total_steps}")
|
1082 |
+
|
1083 |
+
# # 1) Set up LR schedule or fixed LR
|
1084 |
+
# if use_lr_schedule:
|
1085 |
+
# warmup_steps = int(total_steps * warmup_steps_ratio)
|
1086 |
+
# lr_schedule = self._get_lr_schedule(
|
1087 |
+
# total_steps=total_steps,
|
1088 |
+
# peak_lr=peak_lr,
|
1089 |
+
# warmup_steps=warmup_steps
|
1090 |
+
# )
|
1091 |
+
# self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
|
1092 |
+
# logger.info("Using custom learning rate schedule.")
|
1093 |
+
# else:
|
1094 |
+
# self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
|
1095 |
+
# logger.info("Using fixed learning rate.")
|
1096 |
+
|
1097 |
+
# # 2) Prepare data splits
|
1098 |
+
# train_q = q_pad[:train_size]
|
1099 |
+
# train_p = p_pad[:train_size]
|
1100 |
+
# val_q = q_pad[train_size:]
|
1101 |
+
# val_p = p_pad[train_size:]
|
1102 |
+
|
1103 |
+
# train_dataset = tf.data.Dataset.from_tensor_slices((train_q, train_p))
|
1104 |
+
# train_dataset = train_dataset.shuffle(buffer_size=4096).batch(batch_size)
|
1105 |
+
|
1106 |
+
# val_dataset = tf.data.Dataset.from_tensor_slices((val_q, val_p))
|
1107 |
+
# val_dataset = val_dataset.batch(batch_size)
|
1108 |
+
|
1109 |
+
# # 3) Checkpoint + manager
|
1110 |
+
# checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
|
1111 |
+
# manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
|
1112 |
+
|
1113 |
+
# # 4) TensorBoard setup
|
1114 |
+
# log_dir = Path(checkpoint_dir) / "tensorboard_logs"
|
1115 |
+
# log_dir.mkdir(parents=True, exist_ok=True)
|
1116 |
+
|
1117 |
+
# current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
|
1118 |
+
# train_log_dir = str(log_dir / f"train_{current_time}")
|
1119 |
+
# val_log_dir = str(log_dir / f"val_{current_time}")
|
1120 |
+
|
1121 |
+
# train_summary_writer = tf.summary.create_file_writer(train_log_dir)
|
1122 |
+
# val_summary_writer = tf.summary.create_file_writer(val_log_dir)
|
1123 |
+
|
1124 |
+
# logger.info(f"TensorBoard logs will be saved in {log_dir}")
|
1125 |
+
|
1126 |
+
# # 5) Early stopping
|
1127 |
+
# best_val_loss = float("inf")
|
1128 |
+
# epochs_no_improve = 0
|
1129 |
+
|
1130 |
+
# logger.info("Beginning training loop...")
|
1131 |
+
# global_step = 0
|
1132 |
+
|
1133 |
+
# from tqdm import tqdm
|
1134 |
+
# for epoch in range(1, epochs + 1):
|
1135 |
+
# logger.info(f"\n=== Epoch {epoch}/{epochs} ===")
|
1136 |
+
# epoch_loss_avg = tf.keras.metrics.Mean()
|
1137 |
+
|
1138 |
+
# # Training loop
|
1139 |
+
# with tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}") as pbar:
|
1140 |
+
# for (q_batch, p_batch) in train_dataset:
|
1141 |
+
# global_step += 1
|
1142 |
+
|
1143 |
+
# # Train step
|
1144 |
+
# batch_loss = self._train_step(q_batch, p_batch)
|
1145 |
+
# epoch_loss_avg(batch_loss)
|
1146 |
+
|
1147 |
+
# # Get current LR
|
1148 |
+
# if use_lr_schedule:
|
1149 |
+
# lr = self.optimizer.learning_rate
|
1150 |
+
# if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
|
1151 |
+
# # Get the current step
|
1152 |
+
# current_step = tf.cast(self.optimizer.iterations, tf.float32)
|
1153 |
+
# # Compute the current learning rate
|
1154 |
+
# current_lr = lr(current_step)
|
1155 |
+
# else:
|
1156 |
+
# # If learning_rate is not a schedule, use it directly
|
1157 |
+
# current_lr = lr
|
1158 |
+
# # Convert to float for logging
|
1159 |
+
# current_lr_value = float(current_lr.numpy())
|
1160 |
+
# else:
|
1161 |
+
# # If using fixed learning rate
|
1162 |
+
# current_lr_value = float(self.optimizer.learning_rate.numpy())
|
1163 |
+
|
1164 |
+
# # Update tqdm
|
1165 |
+
# pbar.update(1)
|
1166 |
+
# pbar.set_postfix({
|
1167 |
+
# "loss": f"{batch_loss.numpy():.4f}",
|
1168 |
+
# "lr": f"{current_lr_value:.2e}"
|
1169 |
+
# })
|
1170 |
+
|
1171 |
+
# # TensorBoard: log train metrics per step
|
1172 |
+
# with train_summary_writer.as_default():
|
1173 |
+
# tf.summary.scalar("loss", batch_loss, step=global_step)
|
1174 |
+
# tf.summary.scalar("learning_rate", current_lr_value, step=global_step)
|
1175 |
+
|
1176 |
+
# # Validation
|
1177 |
+
# val_loss_avg = tf.keras.metrics.Mean()
|
1178 |
+
# for q_val, p_val in val_dataset:
|
1179 |
+
# q_enc = self.encoder(q_val, training=False)
|
1180 |
+
# p_enc = self.encoder(p_val, training=False)
|
1181 |
+
# sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
|
1182 |
+
# bs_val = tf.shape(q_enc)[0]
|
1183 |
+
# labels_val = tf.range(bs_val, dtype=tf.int32)
|
1184 |
+
# loss_val = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
1185 |
+
# labels=labels_val,
|
1186 |
+
# logits=sim_matrix
|
1187 |
+
# )
|
1188 |
+
# val_loss_avg(tf.reduce_mean(loss_val))
|
1189 |
+
|
1190 |
+
# train_loss = epoch_loss_avg.result().numpy()
|
1191 |
+
# val_loss = val_loss_avg.result().numpy()
|
1192 |
+
|
1193 |
+
# logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
|
1194 |
+
|
1195 |
+
# # TensorBoard: validation loss
|
1196 |
+
# with val_summary_writer.as_default():
|
1197 |
+
# tf.summary.scalar("val_loss", val_loss, step=epoch)
|
1198 |
+
|
1199 |
+
# # Save checkpoint
|
1200 |
+
# manager.save()
|
1201 |
+
|
1202 |
+
# # Update history
|
1203 |
+
# self.history['train_loss'].append(train_loss)
|
1204 |
+
# self.history['val_loss'].append(val_loss)
|
1205 |
+
# self.history.setdefault('learning_rate', []).append(float(current_lr_value))
|
1206 |
+
|
1207 |
+
# # Early stopping
|
1208 |
+
# if val_loss < best_val_loss - min_delta:
|
1209 |
+
# best_val_loss = val_loss
|
1210 |
+
# epochs_no_improve = 0
|
1211 |
+
# logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
|
1212 |
+
# else:
|
1213 |
+
# epochs_no_improve += 1
|
1214 |
+
# logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
|
1215 |
+
# if epochs_no_improve >= early_stopping_patience:
|
1216 |
+
# logger.info("Early stopping triggered.")
|
1217 |
+
# break
|
1218 |
+
|
1219 |
+
# logger.info("In-batch training completed!")
|
1220 |
+
|
1221 |
+
# @tf.function
|
1222 |
+
# def _train_step(self, q_batch, p_batch):
|
1223 |
+
# """
|
1224 |
+
# Single training step using in-batch negatives.
|
1225 |
+
# q_batch: (batch_size, seq_len) int32 input_ids for queries
|
1226 |
+
# p_batch: (batch_size, seq_len) int32 input_ids for positives
|
1227 |
+
# """
|
1228 |
+
# with tf.GradientTape() as tape:
|
1229 |
+
# # Encode queries and positives
|
1230 |
+
# q_enc = self.encoder(q_batch, training=True) # [B, emb_dim]
|
1231 |
+
# p_enc = self.encoder(p_batch, training=True) # [B, emb_dim]
|
1232 |
+
|
1233 |
+
# # Compute similarity matrix: (B, B) = q_enc * p_enc^T
|
1234 |
+
# # If embeddings are L2-normalized, this is cosine similarity
|
1235 |
+
# sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True) # [B, B]
|
1236 |
+
|
1237 |
+
# # Labels are just the diagonal indices
|
1238 |
+
# batch_size = tf.shape(q_enc)[0]
|
1239 |
+
# labels = tf.range(batch_size, dtype=tf.int32) # [0..B-1]
|
1240 |
+
|
1241 |
+
# # Softmax cross-entropy
|
1242 |
+
# loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
1243 |
+
# labels=labels,
|
1244 |
+
# logits=sim_matrix
|
1245 |
+
# )
|
1246 |
+
# loss = tf.reduce_mean(loss)
|
1247 |
+
|
1248 |
+
# # Compute gradients for the pretrained DistilBERT variables only
|
1249 |
+
# train_vars = self.encoder.pretrained.trainable_variables
|
1250 |
+
# gradients = tape.gradient(loss, train_vars)
|
1251 |
+
|
1252 |
+
# # Remove any None grads (in case some layers are frozen)
|
1253 |
+
# grads_and_vars = [(g, v) for g, v in zip(gradients, train_vars) if g is not None]
|
1254 |
+
# if grads_and_vars:
|
1255 |
+
# self.optimizer.apply_gradients(grads_and_vars)
|
1256 |
+
|
1257 |
+
# return loss
|
chatbot_validator.py
ADDED
@@ -0,0 +1,207 @@
1 |
+
from typing import Dict, List, Tuple, Any, Optional
|
2 |
+
import numpy as np
|
3 |
+
from logger_config import config_logger
|
4 |
+
|
5 |
+
logger = config_logger(__name__)
|
6 |
+
|
7 |
+
class ChatbotValidator:
|
8 |
+
"""Handles automated validation and performance analysis for the chatbot."""
|
9 |
+
|
10 |
+
def __init__(self, chatbot, quality_checker):
|
11 |
+
"""
|
12 |
+
Initialize the validator.
|
13 |
+
|
14 |
+
Args:
|
15 |
+
chatbot: RetrievalChatbot instance
|
16 |
+
quality_checker: ResponseQualityChecker instance
|
17 |
+
"""
|
18 |
+
self.chatbot = chatbot
|
19 |
+
self.quality_checker = quality_checker
|
20 |
+
|
21 |
+
# Domain-specific test queries aligned with Taskmaster-1 and Schema-Guided
|
22 |
+
self.domain_queries = {
|
23 |
+
'restaurant': [
|
24 |
+
"I'd like to make a reservation for dinner tonight.",
|
25 |
+
"Can you book a table for 4 people at an Italian place?",
|
26 |
+
"Do you have any availability for tomorrow at 7pm?",
|
27 |
+
"I need to change my dinner reservation time.",
|
28 |
+
"What's the wait time for a table right now?"
|
29 |
+
],
|
30 |
+
'movie_tickets': [
|
31 |
+
"I want to buy tickets for the new Marvel movie.",
|
32 |
+
"Are there any showings of Avatar after 6pm?",
|
33 |
+
"Can I get 3 tickets for the 8pm show?",
|
34 |
+
"What movies are playing this weekend?",
|
35 |
+
"Do you have any matinee showtimes available?"
|
36 |
+
],
|
37 |
+
'rideshare': [
|
38 |
+
"I need a ride from the airport to downtown.",
|
39 |
+
"How much would it cost to get to the mall?",
|
40 |
+
"Can you book a car for tomorrow morning?",
|
41 |
+
"Is there a driver available now?",
|
42 |
+
"What's the estimated arrival time?"
|
43 |
+
],
|
44 |
+
'services': [
|
45 |
+
"I need to schedule an oil change for my car.",
|
46 |
+
"When can I bring my car in for maintenance?",
|
47 |
+
"Do you have any openings for auto repair today?",
|
48 |
+
"How long will the service take?",
|
49 |
+
"Can I get an estimate for brake repair?"
|
50 |
+
],
|
51 |
+
'events': [
|
52 |
+
"I need tickets to the concert this weekend.",
|
53 |
+
"What events are happening near me?",
|
54 |
+
"Can I book seats for the basketball game?",
|
55 |
+
"Are there any comedy shows tonight?",
|
56 |
+
"How much are tickets to the theater?"
|
57 |
+
]
|
58 |
+
}
|
59 |
+
|
60 |
+
def run_validation(
|
61 |
+
self,
|
62 |
+
num_examples: int = 10,
|
63 |
+
top_k: int = 10,
|
64 |
+
domains: Optional[List[str]] = None
|
65 |
+
) -> Dict[str, Any]:
|
66 |
+
"""
|
67 |
+
Run comprehensive validation across specified domains.
|
68 |
+
|
69 |
+
Args:
|
70 |
+
num_examples: Number of test queries per domain
|
71 |
+
top_k: Number of responses to retrieve for each query
|
72 |
+
domains: Optional list of specific domains to test
|
73 |
+
|
74 |
+
Returns:
|
75 |
+
Dict containing detailed validation metrics and domain-specific performance
|
76 |
+
"""
|
77 |
+
logger.info("\n=== Running Enhanced Automatic Validation ===")
|
78 |
+
|
79 |
+
# Select domains to test
|
80 |
+
test_domains = domains if domains else list(self.domain_queries.keys())
|
81 |
+
metrics_history = []
|
82 |
+
domain_metrics = {}
|
83 |
+
|
84 |
+
# Run validation for each domain
|
85 |
+
for domain in test_domains:
|
86 |
+
domain_metrics[domain] = []
|
87 |
+
queries = self.domain_queries[domain][:num_examples]
|
88 |
+
|
89 |
+
logger.info(f"\n=== Testing {domain.title()} Domain ===")
|
90 |
+
|
91 |
+
for i, query in enumerate(queries, 1):
|
92 |
+
logger.info(f"\nTest Case {i}:")
|
93 |
+
logger.info(f"Query: {query}")
|
94 |
+
|
95 |
+
# Get responses with increased top_k
|
96 |
+
responses = self.chatbot.retrieve_responses_cross_encoder(query, top_k=top_k)
|
97 |
+
|
98 |
+
# Enhanced quality checking with context
|
99 |
+
quality_metrics = self.quality_checker.check_response_quality(query, responses)
|
100 |
+
|
101 |
+
# Add domain info
|
102 |
+
quality_metrics['domain'] = domain
|
103 |
+
metrics_history.append(quality_metrics)
|
104 |
+
domain_metrics[domain].append(quality_metrics)
|
105 |
+
|
106 |
+
# Detailed logging
|
107 |
+
self._log_validation_results(query, responses, quality_metrics, i)
|
108 |
+
|
109 |
+
# Calculate and log overall metrics
|
110 |
+
aggregate_metrics = self._calculate_aggregate_metrics(metrics_history)
|
111 |
+
domain_analysis = self._analyze_domain_performance(domain_metrics)
|
112 |
+
confidence_analysis = self._analyze_confidence_distribution(metrics_history)
|
113 |
+
|
114 |
+
aggregate_metrics.update({
|
115 |
+
'domain_performance': domain_analysis,
|
116 |
+
'confidence_analysis': confidence_analysis
|
117 |
+
})
|
118 |
+
|
119 |
+
self._log_validation_summary(aggregate_metrics)
|
120 |
+
return aggregate_metrics
|
121 |
+
|
122 |
+
def _calculate_aggregate_metrics(self, metrics_history: List[Dict]) -> Dict[str, float]:
|
123 |
+
"""Calculate comprehensive aggregate metrics."""
|
124 |
+
metrics = {
|
125 |
+
'num_queries_tested': len(metrics_history),
|
126 |
+
'avg_top_response_score': np.mean([m.get('top_score', 0) for m in metrics_history]),
|
127 |
+
'avg_diversity': np.mean([m.get('response_diversity', 0) for m in metrics_history]),
|
128 |
+
'avg_relevance': np.mean([m.get('query_response_relevance', 0) for m in metrics_history]),
|
129 |
+
'avg_length_score': np.mean([m.get('response_length_score', 0) for m in metrics_history]),
|
130 |
+
'avg_score_gap': np.mean([m.get('top_3_score_gap', 0) for m in metrics_history]),
|
131 |
+
'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics_history]),
|
132 |
+
|
133 |
+
# Additional statistical metrics
|
134 |
+
'median_top_score': np.median([m.get('top_score', 0) for m in metrics_history]),
|
135 |
+
'score_std': np.std([m.get('top_score', 0) for m in metrics_history]),
|
136 |
+
'min_score': np.min([m.get('top_score', 0) for m in metrics_history]),
|
137 |
+
'max_score': np.max([m.get('top_score', 0) for m in metrics_history])
|
138 |
+
}
|
139 |
+
return metrics
|
140 |
+
|
141 |
+
def _analyze_domain_performance(self, domain_metrics: Dict[str, List[Dict]]) -> Dict[str, Dict]:
|
142 |
+
"""Analyze performance by domain."""
|
143 |
+
domain_analysis = {}
|
144 |
+
|
145 |
+
for domain, metrics in domain_metrics.items():
|
146 |
+
domain_analysis[domain] = {
|
147 |
+
'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics]),
|
148 |
+
'avg_relevance': np.mean([m.get('query_response_relevance', 0) for m in metrics]),
|
149 |
+
'avg_diversity': np.mean([m.get('response_diversity', 0) for m in metrics]),
|
150 |
+
'avg_top_score': np.mean([m.get('top_score', 0) for m in metrics]),
|
151 |
+
'num_samples': len(metrics)
|
152 |
+
}
|
153 |
+
|
154 |
+
return domain_analysis
|
155 |
+
|
156 |
+
def _analyze_confidence_distribution(self, metrics_history: List[Dict]) -> Dict[str, float]:
|
157 |
+
"""Analyze the distribution of confidence scores."""
|
158 |
+
scores = [m.get('top_score', 0) for m in metrics_history]
|
159 |
+
|
160 |
+
return {
|
161 |
+
'percentile_25': np.percentile(scores, 25),
|
162 |
+
'percentile_50': np.percentile(scores, 50),
|
163 |
+
'percentile_75': np.percentile(scores, 75),
|
164 |
+
'percentile_90': np.percentile(scores, 90)
|
165 |
+
}
|
166 |
+
|
167 |
+
def _log_validation_results(
|
168 |
+
self,
|
169 |
+
query: str,
|
170 |
+
responses: List[Tuple[str, float]],
|
171 |
+
metrics: Dict[str, Any],
|
172 |
+
case_num: int
|
173 |
+
):
|
174 |
+
"""Log detailed validation results."""
|
175 |
+
logger.info(f"\nTest Case {case_num}:")
|
176 |
+
logger.info(f"Query: {query}")
|
177 |
+
logger.info(f"Domain: {metrics.get('domain', 'Unknown')}")
|
178 |
+
logger.info(f"Confidence: {'Yes' if metrics.get('is_confident', False) else 'No'}")
|
179 |
+
logger.info("\nQuality Metrics:")
|
180 |
+
for metric, value in metrics.items():
|
181 |
+
if isinstance(value, (int, float)):
|
182 |
+
logger.info(f" {metric}: {value:.4f}")
|
183 |
+
|
184 |
+
logger.info("\nTop Responses:")
|
185 |
+
for i, (response, score) in enumerate(responses[:3], 1):
|
186 |
+
logger.info(f"{i}. Score: {score:.4f}. Response: {response}")
|
187 |
+
if i == 1 and not metrics.get('is_confident', False):
|
188 |
+
logger.info(" [Low Confidence]")
|
189 |
+
|
190 |
+
def _log_validation_summary(self, metrics: Dict[str, Any]):
|
191 |
+
"""Log comprehensive validation summary."""
|
192 |
+
logger.info("\n=== Validation Summary ===")
|
193 |
+
|
194 |
+
logger.info("\nOverall Metrics:")
|
195 |
+
for metric, value in metrics.items():
|
196 |
+
if isinstance(value, (int, float)):
|
197 |
+
logger.info(f"{metric}: {value:.4f}")
|
198 |
+
|
199 |
+
logger.info("\nDomain Performance:")
|
200 |
+
for domain, domain_metrics in metrics['domain_performance'].items():
|
201 |
+
logger.info(f"\n{domain.title()}:")
|
202 |
+
for metric, value in domain_metrics.items():
|
203 |
+
logger.info(f" {metric}: {value:.4f}")
|
204 |
+
|
205 |
+
logger.info("\nConfidence Distribution:")
|
206 |
+
for percentile, value in metrics['confidence_analysis'].items():
|
207 |
+
logger.info(f"{percentile}: {value:.4f}")
|
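
Typical use of the new validator (a sketch; `bot` and `quality_checker` are assumed to exist already):

    validator = ChatbotValidator(chatbot=bot, quality_checker=quality_checker)
    summary = validator.run_validation(num_examples=5, top_k=10, domains=["restaurant", "rideshare"])
    print(summary["confidence_rate"])                   # fraction of queries answered confidently
    print(summary["domain_performance"]["restaurant"])  # per-domain averages
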
conversation_summarizer.py
ADDED
@@ -0,0 +1,147 @@
1 |
+
import tensorflow as tf
|
2 |
+
from typing import List, Dict
|
3 |
+
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
|
4 |
+
import logging
|
5 |
+
from dataclasses import dataclass
|
6 |
+
|
7 |
+
logger = logging.getLogger(__name__)
|
8 |
+
|
9 |
+
@dataclass
|
10 |
+
class ChatConfig:
|
11 |
+
max_sequence_length: int = 512
|
12 |
+
default_top_k: int = 5
|
13 |
+
chunk_size: int = 512
|
14 |
+
chunk_overlap: int = 256
|
15 |
+
min_confidence_score: float = 0.7
|
16 |
+
|
17 |
+
class DeviceAwareModel:
|
18 |
+
"""Mixin to handle device placement and mixed precision training."""
|
19 |
+
|
20 |
+
def setup_device(self, device: str = None):
|
21 |
+
if device is None:
|
22 |
+
device = 'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'
|
23 |
+
|
24 |
+
self.device = device.upper()
|
25 |
+
self.strategy = None
|
26 |
+
|
27 |
+
if self.device == 'GPU':
|
28 |
+
# Enable mixed precision for better performance
|
29 |
+
policy = tf.keras.mixed_precision.Policy('mixed_float16')
|
30 |
+
tf.keras.mixed_precision.set_global_policy(policy)
|
31 |
+
|
32 |
+
# Setup distribution strategy for multi-GPU if available
|
33 |
+
gpus = tf.config.list_physical_devices('GPU')
|
34 |
+
if len(gpus) > 1:
|
35 |
+
self.strategy = tf.distribute.MirroredStrategy()
|
36 |
+
|
37 |
+
return self.device
|
38 |
+
|
39 |
+
def run_on_device(self, func):
|
40 |
+
"""Decorator to ensure ops run on the correct device."""
|
41 |
+
def wrapper(*args, **kwargs):
|
42 |
+
with tf.device(f'/{self.device}:0'):
|
43 |
+
return func(*args, **kwargs)
|
44 |
+
return wrapper
|
45 |
+
|
46 |
+
class Summarizer(DeviceAwareModel):
|
47 |
+
"""
|
48 |
+
Enhanced T5-based summarizer with better chunking and device management.
|
49 |
+
Handles long conversations by intelligent chunking and progressive summarization.
|
50 |
+
"""
|
51 |
+
|
52 |
+
def __init__(self, model_name="t5-small", max_summary_length=128, device=None, max_summary_rounds=2):
|
53 |
+
self.setup_device(device)
|
54 |
+
|
55 |
+
# Initialize model within strategy scope if using distribution
|
56 |
+
if self.strategy:
|
57 |
+
with self.strategy.scope():
|
58 |
+
self._setup_model(model_name)
|
59 |
+
else:
|
60 |
+
self._setup_model(model_name)
|
61 |
+
|
62 |
+
self.max_summary_length = max_summary_length
|
63 |
+
self.max_summary_rounds = max_summary_rounds
|
64 |
+
|
65 |
+
def _setup_model(self, model_name):
|
66 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
67 |
+
self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
|
68 |
+
|
69 |
+
# Optimize model for inference
|
70 |
+
self.model.predict = tf.function(
|
71 |
+
self.model.predict,
|
72 |
+
input_signature=[
|
73 |
+
{
|
74 |
+
'input_ids': tf.TensorSpec(shape=[None, None], dtype=tf.int32),
|
75 |
+
'attention_mask': tf.TensorSpec(shape=[None, None], dtype=tf.int32)
|
76 |
+
}
|
77 |
+
]
|
78 |
+
)
|
79 |
+
|
80 |
+
@tf.function
|
81 |
+
def _generate_summary(self, inputs):
|
82 |
+
return self.model.generate(
|
83 |
+
inputs,
|
84 |
+
max_length=self.max_summary_length,
|
85 |
+
num_beams=4,
|
86 |
+
length_penalty=2.0,
|
87 |
+
early_stopping=True,
|
88 |
+
no_repeat_ngram_size=3
|
89 |
+
)
|
90 |
+
|
91 |
+
def chunk_text(self, text: str, chunk_size: int = 512, overlap: int = 256) -> List[str]:
|
92 |
+
"""Split text into overlapping chunks for better context preservation."""
|
93 |
+
tokens = self.tokenizer.encode(text)
|
94 |
+
chunks = []
|
95 |
+
|
96 |
+
for i in range(0, len(tokens), chunk_size - overlap):
|
97 |
+
chunk = tokens[i:i + chunk_size]
|
98 |
+
chunks.append(self.tokenizer.decode(chunk, skip_special_tokens=True))
|
99 |
+
|
100 |
+
return chunks
|
101 |
+
|
102 |
+
def summarize_text(
|
103 |
+
self,
|
104 |
+
text: str,
|
105 |
+
progressive: bool = True,
|
106 |
+
round_idx: int = 0
|
107 |
+
) -> str:
|
108 |
+
"""
|
109 |
+
Summarize text with optional progressive summarization
|
110 |
+
and limit the maximum number of re-summarization rounds.
|
111 |
+
"""
|
112 |
+
@self.run_on_device
|
113 |
+
def _summarize_chunk(chunk: str) -> str:
|
114 |
+
input_text = "summarize: " + chunk
|
115 |
+
inputs = self.tokenizer(
|
116 |
+
input_text,
|
117 |
+
return_tensors="tf",
|
118 |
+
padding=True,
|
119 |
+
truncation=True
|
120 |
+
)
|
121 |
+
summary_ids = self._generate_summary(inputs)
|
122 |
+
return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
123 |
+
|
124 |
+
# If we've hit our max allowed summarization rounds, just do a single pass
|
125 |
+
if round_idx >= self.max_summary_rounds:
|
126 |
+
return _summarize_chunk(text)
|
127 |
+
|
128 |
+
# If text is longer than threshold and progressive summarization is on
|
129 |
+
if len(text.split()) > 512 and progressive:
|
130 |
+
chunks = self.chunk_text(text)
|
131 |
+
chunk_summaries = [_summarize_chunk(chunk) for chunk in chunks]
|
132 |
+
|
133 |
+
# Combine chunk-level summaries
|
134 |
+
combined_summary = " ".join(chunk_summaries)
|
135 |
+
|
136 |
+
# If still too long, do another summarization pass but increment round_idx
|
137 |
+
if len(combined_summary.split()) > 512:
|
138 |
+
return self.summarize_text(
|
139 |
+
combined_summary,
|
140 |
+
progressive=True,
|
141 |
+
round_idx=round_idx + 1
|
142 |
+
)
|
143 |
+
|
144 |
+
return combined_summary
|
145 |
+
else:
|
146 |
+
# If text is not too long, just summarize once and return
|
147 |
+
return _summarize_chunk(text)
|
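
With the defaults above (chunk_size=512, overlap=256) chunk_text steps through the token stream with a stride of 256, so consecutive chunks share half their tokens. The resulting windows for, say, a 1000-token input (illustrative arithmetic only):

    chunk_size, overlap = 512, 256
    n_tokens = 1000
    starts = list(range(0, n_tokens, chunk_size - overlap))            # [0, 256, 512, 768]
    windows = [(s, min(s + chunk_size, n_tokens)) for s in starts]
    print(windows)   # [(0, 512), (256, 768), (512, 1000), (768, 1000)]
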
cross_encoder_reranker.py
ADDED
@@ -0,0 +1,51 @@
+from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+import tensorflow as tf
+from typing import List, Tuple
+
+from logger_config import config_logger
+logger = config_logger(__name__)
+
+class CrossEncoderReranker:
+    """
+    Cross-Encoder Re-Ranker: Takes (query, candidate) pairs,
+    outputs a single relevance score (one logit).
+    """
+    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+        # Model outputs shape [batch_size, 1] -> Interpret the logit as relevance score.
+
+    def rerank(
+        self,
+        query: str,
+        candidates: List[str],
+        max_length: int = 256
+    ) -> List[float]:
+        """
+        Returns a list of re_scores, one for each candidate, indicating
+        how relevant the candidate is to the query.
+        """
+        # Build (query, candidate) pairs
+        pair_texts = [(query, candidate) for candidate in candidates]
+
+        # Tokenize the entire batch
+        encodings = self.tokenizer(
+            pair_texts,
+            padding=True,
+            truncation=True,
+            max_length=max_length,
+            return_tensors="tf"
+        )
+
+        # Forward pass -> logits shape [batch_size, 1]
+        outputs = self.model(
+            input_ids=encodings["input_ids"],
+            attention_mask=encodings["attention_mask"],
+            token_type_ids=encodings.get("token_type_ids")
+        )
+
+        logits = outputs.logits
+        # Flatten to shape [batch_size]
+        scores = tf.reshape(logits, [-1]).numpy()
+
+        return scores.tolist()
dialogue_augmenter.py
CHANGED
@@ -7,7 +7,6 @@ from pipeline_config import PipelineConfig
 from quality_metrics import QualityMetrics
 from paraphraser import Paraphraser
 import nlpaug.augmenter.word as naw
-from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
 from sklearn.metrics.pairwise import cosine_similarity
environment_setup.py
ADDED
@@ -0,0 +1,207 @@
1 |
+
from typing import Dict, Optional, Tuple
|
2 |
+
from pathlib import Path
|
3 |
+
import tensorflow as tf
|
4 |
+
import os
|
5 |
+
import subprocess
|
6 |
+
from datetime import datetime
|
7 |
+
from logger_config import config_logger
|
8 |
+
|
9 |
+
logger = config_logger(__name__)
|
10 |
+
|
11 |
+
class EnvironmentSetup:
|
12 |
+
def __init__(self):
|
13 |
+
self.device_type, self.strategy = self.setup_devices()
|
14 |
+
self.cache_dir = None
|
15 |
+
|
16 |
+
def initialize(self, cache_dir: Optional[Path] = None):
|
17 |
+
self.cache_dir = self.setup_model_cache(cache_dir)
|
18 |
+
self.training_dirs = self.setup_training_directories()
|
19 |
+
|
20 |
+
@staticmethod
|
21 |
+
def setup_model_cache(cache_dir: Optional[Path] = None) -> Path:
|
22 |
+
"""Setup and manage model cache directory."""
|
23 |
+
if cache_dir is None:
|
24 |
+
cache_dir = Path.home() / '.chatbot_cache'
|
25 |
+
|
26 |
+
cache_dir.mkdir(parents=True, exist_ok=True)
|
27 |
+
|
28 |
+
# Set environment variables for various libraries
|
29 |
+
os.environ['TRANSFORMERS_CACHE'] = str(cache_dir / 'transformers')
|
30 |
+
os.environ['TORCH_HOME'] = str(cache_dir / 'torch')
|
31 |
+
os.environ['HF_HOME'] = str(cache_dir / 'huggingface')
|
32 |
+
|
33 |
+
logger.info(f"Using cache directory: {cache_dir}")
|
34 |
+
return cache_dir
|
35 |
+
|
36 |
+
@staticmethod
|
37 |
+
def setup_training_directories(base_dir: str = "chatbot_training") -> Dict[str, Path]:
|
38 |
+
"""Setup directory structure for training artifacts."""
|
39 |
+
base_dir = Path(base_dir)
|
40 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
41 |
+
train_dir = base_dir / f"training_run_{timestamp}"
|
42 |
+
|
43 |
+
directories = {
|
44 |
+
'base': train_dir,
|
45 |
+
'checkpoints': train_dir / 'checkpoints',
|
46 |
+
'plots': train_dir / 'plots',
|
47 |
+
'logs': train_dir / 'logs'
|
48 |
+
}
|
49 |
+
|
50 |
+
# Create directories
|
51 |
+
for dir_path in directories.values():
|
52 |
+
dir_path.mkdir(parents=True, exist_ok=True)
|
53 |
+
|
54 |
+
return directories
|
55 |
+
|
56 |
+
@staticmethod
|
57 |
+
def is_colab() -> bool:
|
58 |
+
"""Check if code is running in Google Colab."""
|
59 |
+
try:
|
60 |
+
# Handle both import and attribute checks
|
61 |
+
import google.colab # type: ignore
|
62 |
+
import IPython # type: ignore
|
63 |
+
return True
|
64 |
+
except (ImportError, AttributeError):
|
65 |
+
return False
|
66 |
+
|
67 |
+
def setup_colab_tpu(self) -> Optional[tf.distribute.Strategy]:
|
68 |
+
"""Setup TPU in Colab environment if available."""
|
69 |
+
if not self.is_colab():
|
70 |
+
return None
|
71 |
+
|
72 |
+
try:
|
73 |
+
import requests
|
74 |
+
import os
|
75 |
+
|
76 |
+
# Check TPU availability
|
77 |
+
if 'COLAB_TPU_ADDR' not in os.environ:
|
78 |
+
return None
|
79 |
+
|
80 |
+
# TPU address should be set
|
81 |
+
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
|
82 |
+
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
|
83 |
+
tf.config.experimental_connect_to_cluster(resolver)
|
84 |
+
tf.tpu.experimental.initialize_tpu_system(resolver)
|
85 |
+
strategy = tf.distribute.TPUStrategy(resolver)
|
86 |
+
|
87 |
+
return strategy
|
88 |
+
except Exception as e:
|
89 |
+
logger.warning(f"Failed to initialize Colab TPU: {e}")
|
90 |
+
return None
|
91 |
+
|
92 |
+
def setup_devices(self) -> Tuple[str, tf.distribute.Strategy]:
|
93 |
+
"""Configure available compute devices with Colab-specific optimizations."""
|
94 |
+
logger.info("Checking available compute devices...")
|
95 |
+
|
96 |
+
# Colab-specific setup
|
97 |
+
if self.is_colab():
|
98 |
+
logger.info("Running in Google Colab environment")
|
99 |
+
|
100 |
+
# Try TPU first in Colab
|
101 |
+
tpu_strategy = self.setup_colab_tpu()
|
102 |
+
if tpu_strategy is not None:
|
103 |
+
logger.info("Colab TPU detected and initialized")
|
104 |
+
return "TPU", tpu_strategy
|
105 |
+
|
106 |
+
# Colab GPU setup
|
107 |
+
gpus = tf.config.list_physical_devices('GPU')
|
108 |
+
if gpus:
|
109 |
+
try:
|
110 |
+
# Colab-specific GPU memory management
|
111 |
+
for gpu in gpus:
|
112 |
+
tf.config.experimental.set_memory_growth(gpu, True)
|
113 |
+
|
114 |
+
# Get GPU info using subprocess
|
115 |
+
try:
|
116 |
+
gpu_name = subprocess.check_output(
|
117 |
+
['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader'],
|
118 |
+
stderr=subprocess.DEVNULL
|
119 |
+
).decode('utf-8').strip()
|
120 |
+
logger.info(f"Colab GPU detected: {gpu_name}")
|
121 |
+
|
122 |
+
except (subprocess.SubprocessError, FileNotFoundError):
|
123 |
+
logger.warning("Could not detect specific GPU model")
|
124 |
+
|
125 |
+
# Enable XLA
|
126 |
+
tf.config.optimizer.set_jit(True)
|
127 |
+
logger.info("XLA compilation enabled for Colab GPU")
|
128 |
+
|
129 |
+
# Set mixed precision policy
|
130 |
+
policy = tf.keras.mixed_precision.Policy('mixed_float16')
|
131 |
+
tf.keras.mixed_precision.set_global_policy(policy)
|
132 |
+
logger.info("Mixed precision training enabled (float16)")
|
133 |
+
|
134 |
+
strategy = tf.distribute.OneDeviceStrategy("/GPU:0")
|
135 |
+
return "GPU", strategy
|
136 |
+
|
137 |
+
except Exception as e:
|
138 |
+
logger.error(f"Error configuring Colab GPU: {str(e)}")
|
139 |
+
|
140 |
+
# Non-Colab setup (same as before)
|
141 |
+
else:
|
142 |
+
# Check for TPU
|
143 |
+
try:
|
144 |
+
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
|
145 |
+
tf.config.experimental_connect_to_cluster(resolver)
|
146 |
+
tf.tpu.experimental.initialize_tpu_system(resolver)
|
147 |
+
strategy = tf.distribute.TPUStrategy(resolver)
|
148 |
+
logger.info("TPU detected and initialized")
|
149 |
+
return "TPU", strategy
|
150 |
+
except ValueError:
|
151 |
+
logger.info("No TPU detected. Checking for GPUs...")
|
152 |
+
|
153 |
+
# Check for GPUs
|
154 |
+
gpus = tf.config.list_physical_devices('GPU')
|
155 |
+
if gpus:
|
156 |
+
try:
|
157 |
+
for gpu in gpus:
|
158 |
+
tf.config.experimental.set_memory_growth(gpu, True)
|
159 |
+
|
160 |
+
if len(gpus) > 1:
|
161 |
+
strategy = tf.distribute.MirroredStrategy()
|
162 |
+
logger.info(f"Multi-GPU strategy set up with {len(gpus)} GPUs")
|
163 |
+
else:
|
164 |
+
strategy = tf.distribute.OneDeviceStrategy("/GPU:0")
|
165 |
+
logger.info("Single GPU strategy set up")
|
166 |
+
|
167 |
+
return "GPU", strategy
|
168 |
+
|
169 |
+
except Exception as e:
|
170 |
+
logger.error(f"Error configuring GPU: {str(e)}")
|
171 |
+
|
172 |
+
# CPU fallback
|
173 |
+
strategy = tf.distribute.OneDeviceStrategy("/CPU:0")
|
174 |
+
logger.info("Using CPU strategy")
|
175 |
+
return "CPU", strategy
|
176 |
+
|
177 |
+
def optimize_batch_size(self, base_batch_size: int = 16) -> int:
|
178 |
+
"""Apply Colab-specific optimizations for training."""
|
179 |
+
if not self.is_colab():
|
180 |
+
return base_batch_size
|
181 |
+
|
182 |
+
# Colab-specific batch size optimization
|
183 |
+
if self.device_type == "GPU":
|
184 |
+
try:
|
185 |
+
gpu_name = subprocess.check_output(
|
186 |
+
['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader'],
|
187 |
+
stderr=subprocess.DEVNULL
|
188 |
+
).decode('utf-8').strip()
|
189 |
+
|
190 |
+
if "T4" in gpu_name:
|
191 |
+
# T4 optimizations
|
192 |
+
logger.info("Optimizing for Colab T4 GPU")
|
193 |
+
base_batch_size = min(base_batch_size * 2, 32) # T4 can handle larger batches
|
194 |
+
elif "V100" in gpu_name:
|
195 |
+
# V100 optimizations
|
196 |
+
logger.info("Optimizing for Colab V100 GPU")
|
197 |
+
base_batch_size = min(base_batch_size * 3, 48) # V100 can handle even larger batches
|
198 |
+
except (subprocess.SubprocessError, FileNotFoundError):
|
199 |
+
logger.warning("Could not detect specific GPU model, using default settings")
|
200 |
+
|
201 |
+
elif self.device_type == "TPU":
|
202 |
+
# TPU optimizations
|
203 |
+
base_batch_size = min(base_batch_size * 4, 64) # TPUs can handle very large batches
|
204 |
+
logger.info("Optimizing for Colab TPU")
|
205 |
+
|
206 |
+
logger.info(f"Optimized batch size for Colab: {base_batch_size}")
|
207 |
+
return base_batch_size
|
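A minimal usage sketch for the class above, assuming the model is built inside the selected distribution strategy's scope (as the new run_model_train.py entry point is expected to do); the build_model() call is a placeholder, not an API from this repository.

# Usage sketch (build_model is a hypothetical placeholder)
from environment_setup import EnvironmentSetup

env = EnvironmentSetup()                           # picks TPU, GPU, or CPU and builds a tf.distribute strategy
env.initialize()                                   # sets up the model cache and timestamped training directories
batch_size = env.optimize_batch_size(base_batch_size=16)

with env.strategy.scope():
    model = build_model()                          # create variables under the selected strategy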
logger_config.py
ADDED
@@ -0,0 +1,10 @@
import logging

def config_logger(name):
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    return logger
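Usage is the same one-liner the other new modules in this commit use. Note that basicConfig() configures the root logger globally, so the first module to call config_logger decides the shared format and level.

from logger_config import config_logger

logger = config_logger(__name__)
logger.debug("logger configured")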
requirements.txt
CHANGED
@@ -1,6 +1,12 @@
+faiss-cpu>=1.7.0        # Required for Facebook AI Similarity Search
+ipython>=8.0.0          # For interactive Python
+loguru>=0.7.0           # Enhanced logging (optional but recommended)
+matplotlib>=3.5.0       # For validation plotting
 nlpaug>=1.1.0           # Data augmentation for NLP
 nltk>=3.6.0             # Natural language toolkit
 numpy>=1.19.0           # General numerical computation
+pandas>=1.5.0           # For data handling
+pyyaml>=6.0.0           # For config management
 scikit-learn>=1.0.0     # Machine learning tools
 sacremoses>=0.0.53      # Required for some HuggingFace models
 sentencepiece>=0.1.99   # Required for HuggingFace transformers
@@ -11,4 +17,10 @@ tokenizers>=0.13.0      # Required for HuggingFace transformers
 torch>=2.0.0            # PyTorch, for deep learning
 tqdm>=4.64.0            # Progress bar
 transformers>=4.30.0    # Hugging Face Transformers library
-
+typing-extensions>=4.0.0 # For better type hints
+
+# Dev dependencies
+black>=22.0.0           # For code formatting
+isort>=5.10.0           # For import sorting
+mypy>=1.0.0             # For type checking
+pytest>=7.0.0           # For testing
response_quality_checker.py
CHANGED
@@ -1,164 +1,170 @@
import numpy as np
from typing import List, Tuple, Dict, Any, TYPE_CHECKING
from sklearn.metrics.pairwise import cosine_similarity

from logger_config import config_logger
logger = config_logger(__name__)

if TYPE_CHECKING:
    from chatbot_model import RetrievalChatbot

class ResponseQualityChecker:
    """Enhanced quality checking with dynamic thresholds."""

    def __init__(
        self,
        chatbot: 'RetrievalChatbot',
        confidence_threshold: float = 0.6,
        diversity_threshold: float = 0.15,
        min_response_length: int = 5,
        similarity_cap: float = 0.85  # Renamed from max_similarity_ratio and used in diversity calc
    ):
        self.confidence_threshold = confidence_threshold
        self.diversity_threshold = diversity_threshold
        self.min_response_length = min_response_length
        self.similarity_cap = similarity_cap
        self.chatbot = chatbot

        # Dynamic thresholds based on response patterns
        self.thresholds = {
            'relevance': 0.35,
            'length_score': 0.85,
            'score_gap': 0.07
        }

    def check_response_quality(
        self,
        query: str,
        responses: List[Tuple[str, float]]
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of responses based on various metrics.

        Args:
            query: The user's query
            responses: List of (response_text, score) tuples

        Returns:
            Dict containing quality metrics and confidence assessment
        """
        if not responses:
            return {
                'response_diversity': 0.0,
                'query_response_relevance': 0.0,
                'is_confident': False,
                'top_score': 0.0,
                'response_length_score': 0.0,
                'top_3_score_gap': 0.0
            }

        # Calculate core metrics
        metrics = {
            'response_diversity': self.calculate_diversity(responses),
            'query_response_relevance': self.calculate_relevance(query, responses),
            'response_length_score': np.mean([
                self._calculate_length_score(response) for response, _ in responses
            ]),
            'top_score': responses[0][1],
            'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
        }

        # Determine confidence using thresholds
        metrics['is_confident'] = self._determine_confidence(metrics)

        logger.info(f"Quality metrics: {metrics}")
        return metrics

    def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
        """Calculate relevance as weighted similarity between query and responses."""
        if not responses:
            return 0.0

        # Get embeddings
        query_embedding = self.encode_query(query)
        response_embeddings = [self.encode_text(response) for response, _ in responses]

        # Compute similarities with decreasing weights for later responses
        similarities = cosine_similarity([query_embedding], response_embeddings)[0]
        weights = np.array([1.0 / (i + 1) for i in range(len(similarities))])

        return np.average(similarities, weights=weights)

    def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
        """Calculate diversity with length normalization and similarity capping."""
        if not responses:
            return 0.0

        embeddings = [self.encode_text(response) for response, _ in responses]
        if len(embeddings) < 2:
            return 1.0

        # Calculate similarities and apply cap
        similarity_matrix = cosine_similarity(embeddings)
        similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)

        # Apply length normalization
        lengths = [len(resp[0].split()) for resp in responses]
        length_ratios = np.array([min(a, b) / max(a, b) for a in lengths for b in lengths])
        length_ratios = length_ratios.reshape(len(responses), len(responses))

        # Combine factors with weights
        adjusted_similarity = (similarity_matrix * 0.7 + length_ratios * 0.3)

        # Calculate final score
        sum_similarities = np.sum(adjusted_similarity) - len(responses)
        num_pairs = len(responses) * (len(responses) - 1)
        avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0

        return 1 - avg_similarity

    def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
        """Determine confidence using primary and secondary conditions."""
        # Primary conditions (must all be met)
        primary_conditions = [
            metrics['top_score'] >= self.confidence_threshold,
            metrics['response_diversity'] >= self.diversity_threshold,
            metrics['response_length_score'] >= self.thresholds['length_score']
        ]

        # Secondary conditions (majority must be met)
        secondary_conditions = [
            metrics['query_response_relevance'] >= self.thresholds['relevance'],
            metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
            metrics['top_score'] >= (self.confidence_threshold * 1.1)  # Extra confidence boost
        ]

        return all(primary_conditions) and sum(secondary_conditions) >= 2

    def _calculate_length_score(self, response: str) -> float:
        """Calculate length score with penalty for very short or long responses."""
        words = len(response.split())

        if words < self.min_response_length:
            return words / self.min_response_length
        elif words > 50:  # Penalty for very long responses
            return min(1.0, 50 / words)
        return 1.0

    def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
        """Calculate average gap between top N scores."""
        if len(scores) < top_n + 1:
            return 0.0
        gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
        return np.mean(gaps)

    def encode_text(self, text: str) -> np.ndarray:
        """Encode response text to embedding."""
        embedding_tensor = self.chatbot.encode_responses([text])
        embedding = embedding_tensor.numpy()[0].astype('float32')
        return self._normalize_embedding(embedding)

    def encode_query(self, query: str) -> np.ndarray:
        """Encode query text to embedding."""
        embedding_tensor = self.chatbot.encode_query(query)
        embedding = embedding_tensor.numpy()[0].astype('float32')
        return self._normalize_embedding(embedding)

    def _normalize_embedding(self, embedding: np.ndarray) -> np.ndarray:
        """Normalize embedding vector."""
        norm = np.linalg.norm(embedding)
        return embedding / norm if norm > 0 else embedding
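A minimal sketch of how the checker is meant to be driven, assuming a trained RetrievalChatbot whose retrieval call returns (response, score) tuples; the retrieve_responses() name and arguments below are assumptions, since the actual retrieval interface lives in chatbot_model.py and is not shown in this section.

# Usage sketch (retrieval signature assumed, not confirmed here)
from chatbot_model import RetrievalChatbot, ChatbotConfig
from response_quality_checker import ResponseQualityChecker

chatbot = RetrievalChatbot(ChatbotConfig())               # assumed default config
checker = ResponseQualityChecker(chatbot=chatbot)

query = "Can you recommend a book?"
responses = chatbot.retrieve_responses(query, top_k=5)    # assumed: List[Tuple[str, float]]
metrics = checker.check_response_quality(query, responses)
if not metrics['is_confident']:
    print("Low confidence - abstaining from answering")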
run_model.py
DELETED
@@ -1,162 +0,0 @@
import json
import glob
import os
from chatbot import RetrievalChatbot
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def load_training_data(data_directory: str) -> list:
    """Load and combine dialogue data from multiple JSON files."""
    all_dialogues = []

    # Get all json files matching the pattern
    pattern = os.path.join(data_directory, "batch_*.json")
    json_files = sorted(glob.glob(pattern))

    print(f"Found {len(json_files)} batch files")

    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                batch_dialogues = json.load(f)
                all_dialogues.extend(batch_dialogues)
                print(f"Loaded {len(batch_dialogues)} dialogues from {os.path.basename(file_path)}")
        except Exception as e:
            print(f"Error loading {file_path}: {str(e)}")

    print(f"Total dialogues loaded: {len(all_dialogues)}")
    return all_dialogues

def plot_training_history(train_losses, val_losses):
    # Plot training and validation loss
    plt.figure()
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Triplet Loss')
    plt.legend()
    plt.show()

dialogues = load_training_data('processed_outputs')

# Initialize the chatbot
chatbot = RetrievalChatbot(
    vocab_size=10000,
    max_sequence_length=80,
    embedding_dim=256,
    lstm_units=256,
    num_attention_heads=8,
    margin=0.3
)

# Prepare the dataset for triplet training
q_pad, p_pad, n_pad = chatbot.prepare_dataset(dialogues, neg_samples_per_pos=3)

# Train with triplet loss
train_losses, val_losses = chatbot.train_with_triplet_loss(
    q_pad, p_pad, n_pad,
    epochs=1,
    batch_size=32,
    validation_split=0.2
)

plot_training_history(train_losses, val_losses)

# After training, test prediction
response_candidates = [turn['text'] for d in dialogues for turn in d['turns'] if turn['speaker'] == 'assistant']

# Test retrieval
test_query = "I'd like a recommendation for a Korean restaurant in NYC."
top_responses = chatbot.retrieve_top_n(test_query, response_candidates, top_n=5)
print("Top responses:")
for resp, score in top_responses:
    print(f"Score: {score:.4f} - {resp}")

# Single-turn validation:
test_queries = [
    "I want to book a Korean restaurant in NYC.",
    "Can I get two tickets for 'What Men Want'?",
    "What's the best time to watch the movie today?"
]
for query in test_queries:
    top_responses = chatbot.retrieve_top_n(query, response_candidates, top_n=3)
    print(f"\nQuery: {query}")
    for resp, score in top_responses:
        print(f"Score: {score:.4f} - {resp}")

# Multi-turn conversation:
multi_turn_history = []

def update_context(multi_turn_history, query, response, max_context_turns=3):
    multi_turn_history.append((query, response))
    if len(multi_turn_history) > max_context_turns:
        multi_turn_history.pop(0)

def get_context_enhanced_query(multi_turn_history, query):
    if not multi_turn_history:
        return query
    context = " ".join([f"User: {q} Assistant: {r}" for q, r in multi_turn_history])
    return f"{context} User: {query}"

conversation_queries = [
    "I'd like to watch a movie tonight.",
    "Is there a showing of 'What Men Want'?",
    "What time is the last show?",
    "Can I get two tickets?"
]

for query in conversation_queries:
    context_query = get_context_enhanced_query(multi_turn_history, query)
    top_responses = chatbot.retrieve_top_n(context_query, response_candidates, top_n=3)
    best_response = top_responses[0][0]
    print(f"\nUser: {query}\nAssistant: {best_response}")
    update_context(multi_turn_history, query, best_response)

#queries, responses, labels = chatbot.prepare_dataset(dialogues, neg_samples_per_pos=3)

#train_dialogues, val_dialogues = train_test_split(dialogues, test_size=0.2, random_state=20)
#query_train, query_val, response_train, response_val, labels_train, labels_val = train_test_split(queries, responses, labels, test_size=0.2, random_state=20)

# chatbot.model.compile(
#     optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
#     loss='binary_crossentropy',
#     metrics=['accuracy']
# )

# # Train the model with early stopping to prevent overfitting
# callbacks = [
#     tf.keras.callbacks.EarlyStopping(
#         monitor='val_loss',
#         patience=3,
#         restore_best_weights=True
#     ),
#     tf.keras.callbacks.ReduceLROnPlateau(
#         monitor='val_loss',
#         factor=0.5,
#         patience=2,
#         min_lr=1e-6,
#         verbose=1
#     ),
#     tf.keras.callbacks.ModelCheckpoint(
#         'chatbot_model.keras',
#         monitor='val_loss',
#         save_best_only=True
#     )
# ]

# history = chatbot.model.fit(
#     {'query_input': query_train, 'response_input': response_train},
#     labels_train,
#     validation_data=({'query_input': query_val, 'response_input': response_val}, labels_val),
#     epochs=5,
#     batch_size=32,
#     callbacks=callbacks
# )
run_model2.py
DELETED
@@ -1,340 +0,0 @@
from chatbot2 import RetrievalChatbot, ChatbotConfig
import os
import json
import glob
import matplotlib.pyplot as plt
import logging
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple
import numpy as np
from datetime import datetime
from response_quality_checker import ResponseQualityChecker

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def load_training_data(data_directory: str, debug_samples: Optional[int] = None) -> list:
    """
    Load and combine dialogue data from multiple JSON files.

    Args:
        data_directory: Directory containing the dialogue files
        debug_samples: If set, only load this many dialogues for debugging
    """
    all_dialogues = []
    data_directory = Path(data_directory)

    # Get all json files matching the pattern
    pattern = "batch_*.json"
    json_files = sorted(data_directory.glob(pattern))

    logger.info(f"Found {len(json_files)} batch files")

    if debug_samples:
        logger.info(f"Debug mode: Will load up to {debug_samples} dialogues")

    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                batch_dialogues = json.load(f)

            # If in debug mode, only take what we need from this batch
            if debug_samples is not None:
                remaining_samples = debug_samples - len(all_dialogues)
                if remaining_samples <= 0:
                    break
                batch_dialogues = batch_dialogues[:remaining_samples]

            all_dialogues.extend(batch_dialogues)
            logger.info(f"Loaded {len(batch_dialogues)} dialogues from {file_path.name}")

            # If we've reached our debug sample limit, stop loading
            if debug_samples is not None and len(all_dialogues) >= debug_samples:
                logger.info(f"Debug mode: Reached {debug_samples} samples, stopping load")
                break

        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")

    total_loaded = len(all_dialogues)
    if debug_samples:
        logger.info(f"Debug mode: Loaded {total_loaded}/{debug_samples} requested dialogues")
    else:
        logger.info(f"Total dialogues loaded: {total_loaded}")

    return all_dialogues

def plot_training_history(history: Dict[str, List[float]], save_dir: Path = None):
    """Plot and optionally save training history."""
    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))

    # Plot losses
    ax1.plot(history['train_loss'], label='Train Loss')
    ax1.plot(history['val_loss'], label='Validation Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Triplet Loss')
    ax1.set_title('Training and Validation Loss')
    ax1.legend()
    ax1.grid(True)

    # Plot learning rate if available
    if 'learning_rate' in history:
        ax2.plot(history['learning_rate'], label='Learning Rate')
        ax2.set_xlabel('Step')
        ax2.set_ylabel('Learning Rate')
        ax2.set_title('Learning Rate Schedule')
        ax2.legend()
        ax2.grid(True)

    plt.tight_layout()

    # Save if directory provided
    if save_dir:
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        plt.savefig(save_dir / f'training_history_{timestamp}.png')

    plt.show()

def setup_training_directories(base_dir: str = "chatbot_training") -> Dict[str, Path]:
    """Setup directory structure for training artifacts."""
    base_dir = Path(base_dir)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    train_dir = base_dir / f"training_run_{timestamp}"

    directories = {
        'base': train_dir,
        'checkpoints': train_dir / 'checkpoints',
        'plots': train_dir / 'plots',
        'logs': train_dir / 'logs'
    }

    # Create directories
    for dir_path in directories.values():
        dir_path.mkdir(parents=True, exist_ok=True)

    return directories

def run_automatic_validation(
    chatbot,
    response_pool: List[str],
    quality_checker: ResponseQualityChecker,
    num_examples: int = 5
) -> Dict[str, Any]:
    """
    Run automatic validation with quality metrics.
    """
    logger.info("\n=== Running Automatic Validation ===")

    test_queries = [
        "Hello, how are you today?",
        "What's the weather like?",
        "Can you help me with a problem?",
        "Tell me a joke",
        "What time is it?",
        "I need help with my homework",
        "Where's a good place to eat?",
        "What movies are playing?",
        "How do I reset my password?",
        "Can you recommend a book?"
    ]

    test_queries = test_queries[:num_examples]
    metrics_history = []

    for i, query in enumerate(test_queries, 1):
        logger.info(f"\nTest Case {i}:")
        logger.info(f"Query: {query}")

        # Get responses and scores
        responses = chatbot.retrieve_responses(
            query,
            response_pool,
            context=None,
            top_k=5
        )

        # Check quality
        quality_metrics = quality_checker.check_response_quality(
            query, responses, response_pool
        )
        metrics_history.append(quality_metrics)

        # Log results
        logger.info(f"Quality Metrics: {quality_metrics}")
        logger.info("Top responses:")
        for j, (response, score) in enumerate(responses[:3], 1):
            logger.info(f"{j}. Score: {score:.4f}")
            logger.info(f"   Response: {response}")
            if j == 1 and not quality_metrics['is_confident']:
                logger.info("   [Low Confidence - Would abstain from answering]")

    # Calculate aggregate metrics
    aggregate_metrics = {
        'num_queries_tested': len(test_queries),
        'avg_top_response_score': np.mean([m['top_score'] for m in metrics_history]),
        'avg_diversity': np.mean([m['response_diversity'] for m in metrics_history]),
        'avg_relevance': np.mean([m['query_response_relevance'] for m in metrics_history]),
        'confidence_rate': np.mean([m['is_confident'] for m in metrics_history]),
    }

    logger.info("\n=== Validation Summary ===")
    for metric, value in aggregate_metrics.items():
        logger.info(f"{metric}: {value:.4f}")

    return aggregate_metrics

def chat_with_quality_check(
    chatbot,
    query: str,
    response_pool: List[str],
    conversation_history: List[Tuple[str, str]],
    quality_checker: ResponseQualityChecker
) -> Tuple[Optional[str], List[Tuple[str, float]], Dict[str, Any]]:
    """
    Enhanced chat function with quality checking.
    """
    # Get responses and scores
    responses = chatbot.retrieve_responses(
        query,
        response_pool,
        conversation_history
    )

    # Check quality
    quality_metrics = quality_checker.check_response_quality(
        query, responses, response_pool
    )

    if quality_metrics['is_confident']:
        return responses[0][0], responses, quality_metrics
    else:
        uncertainty_response = (
            "I apologize, but I don't feel confident providing an answer to that "
            "question at the moment. Could you please rephrase or ask something else?"
        )
        return uncertainty_response, responses, quality_metrics

def get_total_steps(dialogues: List[Dict[str, Any]], batch_size: int, epochs: int) -> int:
    """
    Calculate total training steps based on dialogues and batch size.
    Assume 80% of data for training due to validation split
    """
    estimated_train_samples = int(len(dialogues) * 0.8)
    steps_per_epoch = estimated_train_samples // batch_size
    return steps_per_epoch * epochs

def main():
    DEBUG_SAMPLES = 350
    BATCH_SIZE = 32
    EPOCHS = 5 if DEBUG_SAMPLES else 10

    # Setup training directories
    dirs = setup_training_directories()

    # Load training data
    dialogues = load_training_data('processed_outputs', debug_samples=DEBUG_SAMPLES)
    total_steps = get_total_steps(dialogues, BATCH_SIZE, EPOCHS)

    # Initialize configuration
    config = ChatbotConfig(
        embedding_dim=32,   # TODO: 256
        encoder_units=32,   # TODO: 256
        num_attention_heads=2,  # TODO: 8
        warmup_steps=int(total_steps * 0.1),  # 10% of total steps for warmup
    )

    # Save configuration
    with open(dirs['base'] / 'config.json', 'w') as f:
        json.dump(config.to_dict(), f, indent=2)

    # Initialize chatbot
    chatbot = RetrievalChatbot(config)

    # Prepare dataset
    logger.info("Preparing dataset...")

    # Prepare and train with debug samples
    q_pad, p_pad, n_pad = chatbot.prepare_dataset(
        dialogues,
        neg_samples_per_pos=3,
        debug_samples=DEBUG_SAMPLES
    )

    # Train model
    logger.info("Starting training...")
    chatbot.train(
        q_pad, p_pad, n_pad,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        checkpoint_dir=dirs['checkpoints']
    )

    # Plot and save training history
    plot_training_history(chatbot.history, save_dir=dirs['plots'])

    # Save final model
    chatbot.save_models(dirs['base'] / 'final_model')

    # Prepare response pool for chat
    response_pool = [
        turn['text'] for d in dialogues
        for turn in d['turns'] if turn['speaker'] == 'assistant'
    ]

    # Initialize quality checker with appropriate thresholds
    quality_checker = ResponseQualityChecker(
        confidence_threshold=0.6 if not DEBUG_SAMPLES else 0.4,  # Lower threshold for debug
        diversity_threshold=0.2,
        min_response_length=10,
        max_similarity_ratio=0.9
    )

    # Run automatic validation
    validation_metrics = run_automatic_validation(
        chatbot,
        response_pool,
        quality_checker,
        num_examples=5 if DEBUG_SAMPLES else 10
    )

    # Log validation metrics
    logger.info(f"Validation Metrics: {validation_metrics}")

    # Now continue with interactive chat
    logger.info("\nStarting interactive chat session...")
    conversation_history = []

    while True:
        query = input("\nYou: ")
        if query.lower() in ['quit', 'exit', 'bye']:
            break

        try:
            response, candidates = chatbot.chat(
                query,
                response_pool,
                conversation_history
            )
            print(f"\nAssistant: {response}")

            # Print top alternative responses
            print("\nAlternative responses:")
            for resp, score in candidates[1:4]:
                print(f"Score: {score:.4f} - {resp}")

            # Update history
            conversation_history.append((query, response))

        except Exception as e:
            logger.error(f"Error during chat: {str(e)}")
            print("Sorry, I encountered an error. Please try again.")

if __name__ == "__main__":
    main()
run_model3.py
DELETED
@@ -1,434 +0,0 @@
|
|
1 |
-
from chatbot3 import RetrievalChatbot, ChatbotConfig
|
2 |
-
import os
|
3 |
-
import json
|
4 |
-
import glob
|
5 |
-
import tensorflow as tf
|
6 |
-
import matplotlib.pyplot as plt
|
7 |
-
import logging
|
8 |
-
from pathlib import Path
|
9 |
-
from typing import List, Dict, Optional, Any, Tuple
|
10 |
-
import numpy as np
|
11 |
-
from datetime import datetime
|
12 |
-
from response_quality_checker import ResponseQualityChecker
|
13 |
-
import torch
|
14 |
-
from transformers import TFAutoModel, AutoTokenizer
|
15 |
-
|
16 |
-
|
17 |
-
policy = tf.keras.mixed_precision.Policy('mixed_float16')
|
18 |
-
tf.keras.mixed_precision.set_global_policy(policy)
|
19 |
-
|
20 |
-
# Configure logging
|
21 |
-
logging.basicConfig(
|
22 |
-
level=logging.INFO,
|
23 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
24 |
-
)
|
25 |
-
logger = logging.getLogger(__name__)
|
26 |
-
|
27 |
-
def setup_model_cache(cache_dir: Optional[Path] = None) -> Path:
|
28 |
-
"""Setup and manage model cache directory."""
|
29 |
-
if cache_dir is None:
|
30 |
-
cache_dir = Path.home() / '.chatbot_cache'
|
31 |
-
|
32 |
-
cache_dir.mkdir(parents=True, exist_ok=True)
|
33 |
-
|
34 |
-
# Set environment variables for various libraries
|
35 |
-
os.environ['TRANSFORMERS_CACHE'] = str(cache_dir / 'transformers')
|
36 |
-
os.environ['TORCH_HOME'] = str(cache_dir / 'torch')
|
37 |
-
os.environ['HF_HOME'] = str(cache_dir / 'huggingface')
|
38 |
-
|
39 |
-
logger.info(f"Using cache directory: {cache_dir}")
|
40 |
-
return cache_dir
|
41 |
-
|
42 |
-
def setup_gpu():
|
43 |
-
"""Configure GPU settings for optimal performance."""
|
44 |
-
logger.info("Checking GPU availability...")
|
45 |
-
|
46 |
-
gpus = tf.config.list_physical_devices('GPU')
|
47 |
-
if gpus:
|
48 |
-
try:
|
49 |
-
# Allow memory growth to prevent taking all GPU memory at once
|
50 |
-
for gpu in gpus:
|
51 |
-
tf.config.experimental.set_memory_growth(gpu, True)
|
52 |
-
logger.info(f"Found {len(gpus)} GPU(s). Memory growth enabled.")
|
53 |
-
|
54 |
-
# Log GPU info
|
55 |
-
for gpu in gpus:
|
56 |
-
logger.info(f"GPU Device: {gpu}")
|
57 |
-
return True
|
58 |
-
except Exception as e:
|
59 |
-
logger.error(f"Error configuring GPU: {str(e)}")
|
60 |
-
return False
|
61 |
-
|
62 |
-
else:
|
63 |
-
logger.info("No GPU found. Using CPU.")
|
64 |
-
return False
|
65 |
-
|
66 |
-
def preload_models(config: ChatbotConfig, cache_dir: Path):
|
67 |
-
"""Preload and cache models."""
|
68 |
-
logger.info("Preloading models...")
|
69 |
-
|
70 |
-
# Cache DistilBERT
|
71 |
-
model_name = config.pretrained_model
|
72 |
-
cache_path = cache_dir / 'transformers' / model_name
|
73 |
-
|
74 |
-
if not cache_path.exists():
|
75 |
-
logger.info(f"Downloading and caching {model_name}...")
|
76 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
77 |
-
model = TFAutoModel.from_pretrained(model_name)
|
78 |
-
|
79 |
-
# Save to cache
|
80 |
-
tokenizer.save_pretrained(cache_path)
|
81 |
-
model.save_pretrained(cache_path)
|
82 |
-
else:
|
83 |
-
logger.info(f"Using cached model from {cache_path}")
|
84 |
-
|
85 |
-
return cache_path
|
86 |
-
|
87 |
-
def load_training_data(data_directory: str, debug_samples: Optional[int] = None) -> list:
|
88 |
-
"""
|
89 |
-
Load and combine dialogue data from multiple JSON files.
|
90 |
-
|
91 |
-
Args:
|
92 |
-
data_directory: Directory containing the dialogue files
|
93 |
-
debug_samples: If set, only load this many dialogues for debugging
|
94 |
-
"""
|
95 |
-
all_dialogues = []
|
96 |
-
data_directory = Path(data_directory)
|
97 |
-
|
98 |
-
# Get all json files matching the pattern
|
99 |
-
pattern = "batch_*.json"
|
100 |
-
json_files = sorted(data_directory.glob(pattern))
|
101 |
-
|
102 |
-
logger.info(f"Found {len(json_files)} batch files")
|
103 |
-
|
104 |
-
if debug_samples:
|
105 |
-
logger.info(f"Debug mode: Will load up to {debug_samples} dialogues")
|
106 |
-
|
107 |
-
for file_path in json_files:
|
108 |
-
try:
|
109 |
-
with open(file_path, 'r', encoding='utf-8') as f:
|
110 |
-
batch_dialogues = json.load(f)
|
111 |
-
|
112 |
-
# If in debug mode, only take what we need from this batch
|
113 |
-
if debug_samples is not None:
|
114 |
-
remaining_samples = debug_samples - len(all_dialogues)
|
115 |
-
if remaining_samples <= 0:
|
116 |
-
break
|
117 |
-
batch_dialogues = batch_dialogues[:remaining_samples]
|
118 |
-
|
119 |
-
all_dialogues.extend(batch_dialogues)
|
120 |
-
logger.info(f"Loaded {len(batch_dialogues)} dialogues from {file_path.name}")
|
121 |
-
|
122 |
-
# If we've reached our debug sample limit, stop loading
|
123 |
-
if debug_samples is not None and len(all_dialogues) >= debug_samples:
|
124 |
-
logger.info(f"Debug mode: Reached {debug_samples} samples, stopping load")
|
125 |
-
break
|
126 |
-
|
127 |
-
except Exception as e:
|
128 |
-
logger.error(f"Error loading {file_path}: {str(e)}")
|
129 |
-
|
130 |
-
total_loaded = len(all_dialogues)
|
131 |
-
if debug_samples:
|
132 |
-
logger.info(f"Debug mode: Loaded {total_loaded}/{debug_samples} requested dialogues")
|
133 |
-
else:
|
134 |
-
logger.info(f"Total dialogues loaded: {total_loaded}")
|
135 |
-
|
136 |
-
return all_dialogues
|
137 |
-
|
138 |
-
def plot_training_history(history: Dict[str, List[float]], save_dir: Path = None):
|
139 |
-
"""Plot and optionally save training history."""
|
140 |
-
# Create figure with two subplots
|
141 |
-
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
|
142 |
-
|
143 |
-
# Plot losses
|
144 |
-
ax1.plot(history['train_loss'], label='Train Loss')
|
145 |
-
ax1.plot(history['val_loss'], label='Validation Loss')
|
146 |
-
ax1.set_xlabel('Epoch')
|
147 |
-
ax1.set_ylabel('Triplet Loss')
|
148 |
-
ax1.set_title('Training and Validation Loss')
|
149 |
-
ax1.legend()
|
150 |
-
ax1.grid(True)
|
151 |
-
|
152 |
-
# Plot learning rate if available
|
153 |
-
if 'learning_rate' in history:
|
154 |
-
ax2.plot(history['learning_rate'], label='Learning Rate')
|
155 |
-
ax2.set_xlabel('Step')
|
156 |
-
ax2.set_ylabel('Learning Rate')
|
157 |
-
ax2.set_title('Learning Rate Schedule')
|
158 |
-
ax2.legend()
|
159 |
-
ax2.grid(True)
|
160 |
-
|
161 |
-
plt.tight_layout()
|
162 |
-
|
163 |
-
# Save if directory provided
|
164 |
-
if save_dir:
|
165 |
-
save_dir = Path(save_dir)
|
166 |
-
save_dir.mkdir(parents=True, exist_ok=True)
|
167 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
168 |
-
plt.savefig(save_dir / f'training_history_{timestamp}.png')
|
169 |
-
|
170 |
-
plt.show()
|
171 |
-
|
172 |
-
def setup_training_directories(base_dir: str = "chatbot_training") -> Dict[str, Path]:
|
173 |
-
"""Setup directory structure for training artifacts."""
|
174 |
-
base_dir = Path(base_dir)
|
175 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
176 |
-
train_dir = base_dir / f"training_run_{timestamp}"
|
177 |
-
|
178 |
-
directories = {
|
179 |
-
'base': train_dir,
|
180 |
-
'checkpoints': train_dir / 'checkpoints',
|
181 |
-
'plots': train_dir / 'plots',
|
182 |
-
'logs': train_dir / 'logs'
|
183 |
-
}
|
184 |
-
|
185 |
-
# Create directories
|
186 |
-
for dir_path in directories.values():
|
187 |
-
dir_path.mkdir(parents=True, exist_ok=True)
|
188 |
-
|
189 |
-
return directories
|
190 |
-
|
191 |
-
def run_automatic_validation(
|
192 |
-
chatbot,
|
193 |
-
response_pool: List[str],
|
194 |
-
quality_checker: ResponseQualityChecker,
|
195 |
-
num_examples: int = 5
|
196 |
-
) -> Dict[str, Any]:
|
197 |
-
"""
|
198 |
-
Run automatic validation with quality metrics.
|
199 |
-
"""
|
200 |
-
logger.info("\n=== Running Automatic Validation ===")
|
201 |
-
|
202 |
-
test_queries = [
|
203 |
-
"Hello, how are you today?",
|
204 |
-
"What's the weather like?",
|
205 |
-
"Can you help me with a problem?",
|
206 |
-
"Tell me a joke",
|
207 |
-
"What time is it?",
|
208 |
-
"I need help with my homework",
|
209 |
-
"Where's a good place to eat?",
|
210 |
-
"What movies are playing?",
|
211 |
-
"How do I reset my password?",
|
212 |
-
"Can you recommend a book?"
|
213 |
-
]
|
214 |
-
|
215 |
-
test_queries = test_queries[:num_examples]
|
216 |
-
metrics_history = []
|
217 |
-
|
218 |
-
for i, query in enumerate(test_queries, 1):
|
219 |
-
logger.info(f"\nTest Case {i}:")
|
220 |
-
logger.info(f"Query: {query}")
|
221 |
-
|
222 |
-
# Get responses and scores
|
223 |
-
responses = chatbot.retrieve_responses(
|
224 |
-
query,
|
225 |
-
response_pool,
|
226 |
-
context=None,
|
227 |
-
top_k=5
|
228 |
-
)
|
229 |
-
|
230 |
-
# Check quality
|
231 |
-
quality_metrics = quality_checker.check_response_quality(
|
232 |
-
query, responses, response_pool
|
233 |
-
)
|
234 |
-
metrics_history.append(quality_metrics)
|
235 |
-
|
236 |
-
# Log results
|
237 |
-
logger.info(f"Quality Metrics: {quality_metrics}")
|
238 |
-
logger.info("Top responses:")
|
239 |
-
for j, (response, score) in enumerate(responses[:3], 1):
|
240 |
-
logger.info(f"{j}. Score: {score:.4f}")
|
241 |
-
logger.info(f" Response: {response}")
|
242 |
-
if j == 1 and not quality_metrics['is_confident']:
|
243 |
-
logger.info(" [Low Confidence - Would abstain from answering]")
|
244 |
-
|
245 |
-
# Calculate aggregate metrics
|
246 |
-
aggregate_metrics = {
|
247 |
-
'num_queries_tested': len(test_queries),
|
248 |
-
'avg_top_response_score': np.mean([m['top_score'] for m in metrics_history]),
|
249 |
-
'avg_diversity': np.mean([m['response_diversity'] for m in metrics_history]),
|
250 |
-
'avg_relevance': np.mean([m['query_response_relevance'] for m in metrics_history]),
|
251 |
-
'confidence_rate': np.mean([m['is_confident'] for m in metrics_history]),
|
252 |
-
}
|
253 |
-
|
254 |
-
logger.info("\n=== Validation Summary ===")
|
255 |
-
for metric, value in aggregate_metrics.items():
|
256 |
-
logger.info(f"{metric}: {value:.4f}")
|
257 |
-
|
258 |
-
return aggregate_metrics
|
259 |
-
|
260 |
-
def chat_with_quality_check(
|
261 |
-
chatbot,
|
262 |
-
query: str,
|
263 |
-
response_pool: List[str],
|
264 |
-
conversation_history: List[Tuple[str, str]],
|
265 |
-
quality_checker: ResponseQualityChecker
|
266 |
-
) -> Tuple[Optional[str], List[Tuple[str, float]], Dict[str, Any]]:
|
267 |
-
"""
|
268 |
-
Enhanced chat function with quality checking.
|
269 |
-
"""
|
270 |
-
# Get responses and scores
|
271 |
-
responses = chatbot.retrieve_responses(
|
272 |
-
query,
|
273 |
-
response_pool,
|
274 |
-
conversation_history
|
275 |
-
)
|
276 |
-
|
277 |
-
# Check quality
|
278 |
-
quality_metrics = quality_checker.check_response_quality(
|
279 |
-
query, responses, response_pool
|
280 |
-
)
|
281 |
-
|
282 |
-
if quality_metrics['is_confident']:
|
283 |
-
return responses[0][0], responses, quality_metrics
|
284 |
-
else:
|
285 |
-
uncertainty_response = (
|
286 |
-
"I apologize, but I don't feel confident providing an answer to that "
|
287 |
-
"question at the moment. Could you please rephrase or ask something else?"
|
288 |
-
)
|
289 |
-
return uncertainty_response, responses, quality_metrics
|
290 |
-
|
291 |
-
def get_total_steps(dialogues: List[Dict[str, Any]], batch_size: int, epochs: int) -> int:
|
292 |
-
"""
|
293 |
-
Calculate total training steps based on dialogues and batch size.
|
294 |
-
Assume 80% of data for training due to validation split
|
295 |
-
"""
|
296 |
-
estimated_train_samples = int(len(dialogues) * 0.8)
|
297 |
-
steps_per_epoch = estimated_train_samples // batch_size
|
298 |
-
return steps_per_epoch * epochs
|
299 |
-
|
300 |
-
def main():
|
301 |
-
# Set up GPU
|
302 |
-
is_gpu = setup_gpu()
|
303 |
-
|
304 |
-
DEBUG_SAMPLES = 350
|
305 |
-
BATCH_SIZE = 64 if is_gpu else 32
|
306 |
-
EPOCHS = 5 if DEBUG_SAMPLES else 10
|
307 |
-
|
308 |
-
# Set device
|
309 |
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
310 |
-
logger.info(f"Using device: {device}")
|
311 |
-
|
312 |
-
# Set up caching
|
313 |
-
cache_dir = setup_model_cache()
|
314 |
-
|
315 |
-
# Set up training directories
|
316 |
-
dirs = setup_training_directories()
|
317 |
-
|
318 |
-
# Load training data
|
319 |
-
-    dialogues = load_training_data('processed_outputs', debug_samples=DEBUG_SAMPLES)
-    total_steps = get_total_steps(dialogues, BATCH_SIZE, EPOCHS)
-
-    # Initialize configuration
-    config = ChatbotConfig(
-        embedding_dim=768,  # Match DistilBERT's dimension
-        encoder_units=256,
-        num_attention_heads=8,
-        warmup_steps=int(total_steps * 0.1),
-        learning_rate=0.0003,
-        margin=0.5,
-        pretrained_model='distilbert-base-uncased'
-    )
-
-    # Preload models
-    preload_models(config, cache_dir)
-
-    # Save configuration
-    with open(dirs['base'] / 'config.json', 'w') as f:
-        json.dump(config.to_dict(), f, indent=2)
-
-    # Initialize chatbot
-    chatbot = RetrievalChatbot(config)
-
-    # Prepare dataset
-    logger.info("Preparing dataset...")
-
-    # Prepare and train with debug samples
-    q_pad, p_pad, n_pad = chatbot.prepare_dataset(
-        dialogues,
-        neg_samples_per_pos=3,
-        debug_samples=DEBUG_SAMPLES
-    )
-
-    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')
-
-    # Train model
-    logger.info("Starting training...")
-    chatbot.train(
-        q_pad, p_pad, n_pad,
-        epochs=EPOCHS,
-        batch_size=BATCH_SIZE,
-        validation_split=0.2,
-        checkpoint_dir=dirs['checkpoints'],
-        callbacks=[tensorboard_callback]
-    )
-
-    # Plot and save training history
-    plot_training_history(chatbot.history, save_dir=dirs['plots'])
-
-    # Save final model
-    chatbot.save_models(dirs['base'] / 'final_model')
-
-    # Prepare response pool for chat
-    response_pool = [
-        turn['text'] for d in dialogues
-        for turn in d['turns'] if turn['speaker'] == 'assistant'
-    ]
-
-    # Initialize quality checker with appropriate thresholds
-    quality_checker = ResponseQualityChecker(
-        confidence_threshold=0.6 if not DEBUG_SAMPLES else 0.4,  # Lower threshold for debug
-        diversity_threshold=0.2,
-        min_response_length=10,
-        max_similarity_ratio=0.9
-    )
-
-    # Run automatic validation
-    validation_metrics = run_automatic_validation(
-        chatbot,
-        response_pool,
-        quality_checker,
-        num_examples=5 if DEBUG_SAMPLES else 10
-    )
-
-    # Log validation metrics
-    logger.info(f"Validation Metrics: {validation_metrics}")
-
-    # Now continue with interactive chat
-    logger.info("\nStarting interactive chat session...")
-    conversation_history = []
-
-    while True:
-        query = input("\nYou: ")
-        if query.lower() in ['quit', 'exit', 'bye']:
-            break
-
-        try:
-            response, candidates, quality_metrics = chat_with_quality_check(
-                chatbot,
-                query,
-                response_pool,
-                conversation_history,
-                quality_checker
-            )
-            print(f"\nAssistant: {response}")
-
-            # Print top alternative responses if confident
-            if quality_metrics['is_confident']:
-                print("\nAlternative responses:")
-                for resp, score in candidates[1:4]:
-                    print(f"Score: {score:.4f} - {resp}")
-
-                # Update history only for confident responses
-                conversation_history.append((query, response))
-            else:
-                print("\nQuality metrics indicated low confidence:")
-                print(f"Confidence score: {quality_metrics['top_score']:.4f}")
-                print(f"Response relevance: {quality_metrics['query_response_relevance']:.4f}")
-
-        except Exception as e:
-            logger.error(f"Error during chat: {str(e)}")
-            print("Sorry, I encountered an error. Please try again.")
-
-if __name__ == "__main__":
-    main()
run_model4.py
DELETED
@@ -1,237 +0,0 @@
-from chatbot4 import RetrievalChatbot, ChatbotConfig
-import os
-import tensorflow as tf
-import matplotlib.pyplot as plt
-import logging
-from pathlib import Path
-from typing import List, Dict, Optional
-from datetime import datetime
-from response_quality_checker import ResponseQualityChecker
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-def setup_model_cache(cache_dir: Optional[Path] = None) -> Path:
-    """Setup and manage model cache directory."""
-    if cache_dir is None:
-        cache_dir = Path.home() / '.chatbot_cache'
-
-    cache_dir.mkdir(parents=True, exist_ok=True)
-
-    # Set environment variables for various libraries
-    os.environ['TRANSFORMERS_CACHE'] = str(cache_dir / 'transformers')
-    os.environ['TORCH_HOME'] = str(cache_dir / 'torch')
-    os.environ['HF_HOME'] = str(cache_dir / 'huggingface')
-
-    logger.info(f"Using cache directory: {cache_dir}")
-    return cache_dir
-
-def setup_gpu():
-    """Configure GPU settings for optimal performance."""
-    logger.info("Checking GPU availability...")
-
-    gpus = tf.config.list_physical_devices('GPU')
-    if gpus:
-        try:
-            # Allow memory growth to prevent taking all GPU memory at once
-            for gpu in gpus:
-                tf.config.experimental.set_memory_growth(gpu, True)
-            logger.info(f"Found {len(gpus)} GPU(s). Memory growth enabled.")
-
-            # Log GPU info
-            for gpu in gpus:
-                logger.info(f"GPU Device: {gpu}")
-            return True
-        except Exception as e:
-            logger.error(f"Error configuring GPU: {str(e)}")
-            return False
-
-    else:
-        logger.info("No GPU found. Using CPU.")
-        return False
-
-# def preload_models(config: ChatbotConfig, cache_dir: Path):
-#     """Preload and cache models."""
-#     logger.info("Preloading models...")
-
-#     # Cache DistilBERT
-#     model_name = config.pretrained_model
-#     cache_path = cache_dir / 'transformers' / model_name
-
-#     if not cache_path.exists():
-#         logger.info(f"Downloading and caching {model_name}...")
-#         tokenizer = AutoTokenizer.from_pretrained(model_name)
-#         model = TFAutoModel.from_pretrained(model_name)
-
-#         # Save to cache
-#         tokenizer.save_pretrained(cache_path)
-#         model.save_pretrained(cache_path)
-#     else:
-#         logger.info(f"Using cached model from {cache_path}")
-
-#     return cache_path
-
-def plot_training_history(history: Dict[str, List[float]], save_dir: Path = None):
-    """Plot and optionally save training history."""
-    # Create figure with two subplots
-    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
-
-    # Plot losses
-    ax1.plot(history['train_loss'], label='Train Loss')
-    ax1.plot(history['val_loss'], label='Validation Loss')
-    ax1.set_xlabel('Epoch')
-    ax1.set_ylabel('Triplet Loss')
-    ax1.set_title('Training and Validation Loss')
-    ax1.legend()
-    ax1.grid(True)
-
-    # Plot learning rate if available
-    if 'learning_rate' in history:
-        ax2.plot(history['learning_rate'], label='Learning Rate')
-        ax2.set_xlabel('Step')
-        ax2.set_ylabel('Learning Rate')
-        ax2.set_title('Learning Rate Schedule')
-        ax2.legend()
-        ax2.grid(True)
-
-    plt.tight_layout()
-
-    # Save if directory provided
-    if save_dir:
-        save_dir = Path(save_dir)
-        save_dir.mkdir(parents=True, exist_ok=True)
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        plt.savefig(save_dir / f'training_history_{timestamp}.png')
-
-    plt.show()
-
-def setup_training_directories(base_dir: str = "chatbot_training") -> Dict[str, Path]:
-    """Setup directory structure for training artifacts."""
-    base_dir = Path(base_dir)
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    train_dir = base_dir / f"training_run_{timestamp}"
-
-    directories = {
-        'base': train_dir,
-        'checkpoints': train_dir / 'checkpoints',
-        'plots': train_dir / 'plots',
-        'logs': train_dir / 'logs'
-    }
-
-    # Create directories
-    for dir_path in directories.values():
-        dir_path.mkdir(parents=True, exist_ok=True)
-
-    return directories
-
-def main():
-    # Set up GPU
-    is_gpu = setup_gpu()
-
-    DEBUG_SAMPLES = 2000
-    BATCH_SIZE = 128 if is_gpu else 64
-    EPOCHS = 5 if DEBUG_SAMPLES else 10
-
-    # Set up caching
-    cache_dir = setup_model_cache()
-
-    # Set up training directories
-    dirs = setup_training_directories()
-
-    # Initialize configuration
-    config = ChatbotConfig(
-        embedding_dim=768,  # Match DistilBERT's dimension
-        max_sequence_length=512,
-        freeze_embeddings=False
-    )
-
-    # Preload models
-    #preload_models(config, cache_dir)
-
-    # Save configuration
-    # with open(dirs['base'] / 'config.json', 'w') as f:
-    #     json.dump(config.to_dict(), f, indent=4)
-
-    # Load training data
-    dialogues = RetrievalChatbot.load_training_data(data_path='processed_outputs/batch_group_0010.json', debug_samples=DEBUG_SAMPLES)
-
-    # Initialize chatbot
-    chatbot = RetrievalChatbot(config, dialogues)
-
-    # Check trainable variables
-    chatbot.check_trainable_variables()
-
-    # Verify FAISS
-    chatbot.verify_faiss_index()
-
-    # Prepare dataset
-    logger.info("Preparing dataset...")
-
-    # Prepare and train with debug samples
-    q_tensor, p_tensor = chatbot.prepare_dataset(dialogues)
-
-    quality_checker = ResponseQualityChecker(chatbot=chatbot)
-
-    # Train model
-    logger.info("Starting training...")
-
-    tf.config.optimizer.set_jit(True)  # XLA
-    policy = tf.keras.mixed_precision.Policy('mixed_float16')
-    tf.keras.mixed_precision.set_global_policy(policy)
-
-    chatbot.train(
-        q_pad=q_tensor,
-        p_pad=p_tensor,
-        epochs=EPOCHS,
-        batch_size=BATCH_SIZE,
-        validation_split=0.2,
-        checkpoint_dir="checkpoints/",
-        use_lr_schedule=True,      # Enable custom schedule
-        peak_lr=2e-5,              # Peak learning rate
-        warmup_steps_ratio=0.1,    # 10% warmup
-        early_stopping_patience=3  # Stop if no improvement for 3 epochs
-    )
-
-    # Plot and save training history
-    #plot_training_history(chatbot.history, save_dir=dirs['plots'])
-
-    # Save final model
-    chatbot.save_models(dirs['base'] / 'final_model')
-
-    # Run automatic validation
-    validation_metrics = chatbot.run_automatic_validation(quality_checker, num_examples=5)
-
-    # Log validation metrics
-    logger.info(f"Validation Metrics: {validation_metrics}")
-
-    # Now continue with interactive chat
-    logger.info("\nStarting interactive chat session...")
-    conversation_history = []
-
-    while True:
-        user_input = input("You: ")
-        if user_input.lower() in ['quit', 'exit', 'bye']:
-            print("Assistant: Goodbye!")
-            break
-
-        response, candidates, metrics = chatbot.chat(
-            query=user_input,
-            conversation_history=None,  # Pass conversation history if available
-            quality_checker=quality_checker,
-            top_k=5
-        )
-
-        print(f"Assistant: {response}")
-
-        # Optionally, display alternative responses
-        if metrics.get('is_confident', False):
-            print("\nAlternative responses:")
-            for resp, score in candidates[1:4]:
-                print(f"Score: {score:.4f} - {resp}")
-
-if __name__ == "__main__":
-    main()
run_model_train.py
ADDED
@@ -0,0 +1,96 @@
+from chatbot_model import RetrievalChatbot, ChatbotConfig
+from environment_setup import EnvironmentSetup
+from response_quality_checker import ResponseQualityChecker
+from chatbot_validator import ChatbotValidator
+from training_plotter import TrainingPlotter
+
+
+# Configure logging
+from logger_config import config_logger
+logger = config_logger(__name__)
+
+def run_interactive_chat(chatbot, quality_checker):
+    """Separate function for interactive chat loop"""
+    while True:
+        user_input = input("You: ")
+        if user_input.lower() in ['quit', 'exit', 'bye']:
+            print("Assistant: Goodbye!")
+            break
+
+        response, candidates, metrics = chatbot.chat(
+            query=user_input,
+            conversation_history=None,
+            quality_checker=quality_checker,
+            top_k=5
+        )
+
+        print(f"Assistant: {response}")
+
+        if metrics.get('is_confident', False):
+            print("\nAlternative responses:")
+            for resp, score in candidates[1:4]:
+                print(f"Score: {score:.4f} - {resp}")
+
+def main():
+    # Initialize environment
+    env = EnvironmentSetup()
+    env.initialize()
+
+    DEBUG_SAMPLES = 5
+    EPOCHS = 1 if DEBUG_SAMPLES else 20
+    TRAINING_DATA_PATH = 'processed_outputs/batch_group_0010.json'
+
+    # Optimize batch size for Colab
+    batch_size = env.optimize_batch_size(base_batch_size=16)
+
+    # Initialize configuration
+    config = ChatbotConfig(
+        embedding_dim=512,  # 768, # Match DistilBERT's dimension
+        max_context_token_limit=512,
+        freeze_embeddings=False,
+    )
+
+    # Load training data
+    dialogues = RetrievalChatbot.load_training_data(data_path=TRAINING_DATA_PATH, debug_samples=DEBUG_SAMPLES)
+
+    # Initialize chatbot and verify FAISS index
+    with env.strategy.scope():
+        chatbot = RetrievalChatbot(config, dialogues)
+        chatbot.verify_faiss_index()
+
+    # Prepare dataset
+    logger.info("Preparing dataset...")
+    q_tensor, p_tensor = chatbot.prepare_dataset(dialogues)
+    quality_checker = ResponseQualityChecker(chatbot=chatbot)
+
+    # Train model
+    logger.info("Starting training...")
+    chatbot.train(
+        q_pad=q_tensor,
+        p_pad=p_tensor,
+        epochs=EPOCHS,
+        batch_size=batch_size,
+        validation_split=0.2,
+    )
+
+    # Save final model
+    model_save_path = env.training_dirs['base'] / 'final_model'
+    chatbot.save_models(model_save_path)
+
+    # Run automatic validation
+    quality_checker = ResponseQualityChecker(chatbot=chatbot)
+    validator = ChatbotValidator(chatbot, quality_checker)
+    validation_metrics = validator.run_validation(num_examples=5)
+    logger.info(f"Validation Metrics: {validation_metrics}")
+
+    # Plot and save training history
+    plotter = TrainingPlotter(save_dir=env.training_dirs['plots'])
+    plotter.plot_training_history(chatbot.history)
+    plotter.plot_validation_metrics(validation_metrics)
+
+    # Run interactive chat
+    logger.info("\nStarting interactive chat session...")
+    run_interactive_chat(chatbot, quality_checker)
+
+if __name__ == "__main__":
+    main()
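
Note: the same chat API used in run_interactive_chat can also be called once, non-interactively. A minimal single-turn sketch (assuming the trained chatbot and quality_checker from main() above; the query string is illustrative only):

# Sketch only: `chatbot` and `quality_checker` come from main() above.
response, candidates, metrics = chatbot.chat(
    query="How do I reset my password?",   # illustrative query
    conversation_history=None,
    quality_checker=quality_checker,
    top_k=5
)
if metrics.get('is_confident', False):
    print(response)                        # top-ranked retrieval
else:
    print("Low-confidence response:", response)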
setup.py
CHANGED
@@ -25,6 +25,27 @@ def setup_spacy_models(models=['en_core_web_sm', 'en_core_web_md']):
         print(f"Error downloading spaCy model: {model}")
         print(e)
 
+def setup_gpu_dependencies():
+    """Setup GPU-specific dependencies."""
+    cuda_available = False
+
+    # Check CUDA availability
+    try:
+        import torch
+        cuda_available = torch.cuda.is_available()
+    except ImportError:
+        pass
+
+    if cuda_available:
+        try:
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-gpu>=1.7.0"])
+            print("Successfully installed faiss-gpu")
+        except subprocess.CalledProcessError:
+            print("Failed to install faiss-gpu. Falling back to faiss-cpu")
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu>=1.7.0"])
+    else:
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu>=1.7.0"])
+
 def setup_models():
     """
     Download other required models.
@@ -40,7 +61,7 @@ def setup_models():
         DistilBertModel
     )
 
-    #
+    # Cache the models
     tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
     model = DistilBertModel.from_pretrained('distilbert-base-uncased')
 
@@ -116,11 +137,11 @@ def setup_faiss():
         print(e)
 
 setup(
-    name="
-    version="0.
+    name="retrieval-chatbot",
+    version="0.2.0",
     author="Joe Armani",
     author_email="[email protected]",
-    description="A
+    description="A retrieval-based chatbot with enhanced validation",
     long_description=long_description,
     long_description_content_type="text/markdown",
     packages=find_packages(),
@@ -132,24 +153,39 @@ setup(
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
         "Topic :: Text Processing :: Linguistic",
     ],
     python_requires=">=3.8",
     install_requires=requirements,
+    extras_require={
+        'dev': [
+            'pytest>=7.0.0',
+            'black>=22.0.0',
+            'isort>=5.10.0',
+            'mypy>=1.0.0',
+        ],
+        'gpu': [
+            'faiss-gpu>=1.7.0',
+        ],
+    },
     entry_points={
         "console_scripts": [
             "dialogue-augment=dialogue_augmenter.main:main",
+            "run-chatbot=chatbot.main:main",
         ],
     },
     include_package_data=True,
     package_data={
+        "chatbot": ["config/*.yaml"],
        "dialogue_augmenter": ["data/*.json", "config/*.yaml"],
    },
 )
 
 if __name__ == '__main__':
     setup_spacy_models()
+    setup_gpu_dependencies()
     setup_models()
     setup_nltk()
     setup_faiss()
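
Note: with the new extras_require section, the optional dependencies can be pulled in at install time using standard pip extras syntax, e.g. pip install -e ".[gpu]" for the FAISS GPU build or pip install -e ".[dev]" for the test and linting tools; installing without extras leaves the base requirements unchanged.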
training_plotter.py
ADDED
@@ -0,0 +1,127 @@
+from pathlib import Path
+from typing import Dict, List, Optional
+import matplotlib.pyplot as plt
+from datetime import datetime
+import logging
+
+logger = logging.getLogger(__name__)
+
+class TrainingPlotter:
+    def __init__(self, save_dir: Optional[Path] = None):
+        self.save_dir = save_dir
+        if save_dir:
+            self.save_dir.mkdir(parents=True, exist_ok=True)
+
+    def plot_training_history(self, history: Dict[str, List[float]], title: str = "Training History"):
+        """Plot and optionally save training metrics history.
+
+        Args:
+            history: Dictionary containing training metrics
+            title: Title for the plot
+        """
+        # Silence matplotlib debug messages
+        logger.setLevel(logging.WARNING)
+
+        # Create figure with subplots
+        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
+
+        # Plot losses
+        ax1.plot(history['train_loss'], label='Train Loss')
+        ax1.plot(history['val_loss'], label='Validation Loss')
+        ax1.set_xlabel('Epoch')
+        ax1.set_ylabel('Loss')
+        ax1.set_title('Training and Validation Loss')
+        ax1.legend()
+        ax1.grid(True)
+
+        # Plot learning rate if available
+        if 'learning_rate' in history:
+            ax2.plot(history['learning_rate'], label='Learning Rate')
+            ax2.set_xlabel('Step')
+            ax2.set_ylabel('Learning Rate')
+            ax2.set_title('Learning Rate Schedule')
+            ax2.legend()
+            ax2.grid(True)
+
+        plt.suptitle(title)
+        plt.tight_layout()
+
+        # Reset the logger level
+        logger.setLevel(logging.INFO)
+
+        # Save if directory provided
+        if self.save_dir:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            save_path = self.save_dir / f'training_history_{timestamp}.png'
+            plt.savefig(save_path)
+            logger.info(f"Saved training history plot to {save_path}")
+
+        plt.show()
+
+    def plot_validation_metrics(self, metrics: Dict[str, float]):
+        """Plot validation metrics as a bar chart.
+
+        Args:
+            metrics: Dictionary of validation metrics. Can handle nested dictionaries.
+        """
+        # Silence matplotlib debug messages
+        logger.setLevel(logging.WARNING)
+
+        # Flatten nested metrics dictionary
+        flat_metrics = {}
+        for key, value in metrics.items():
+            # Skip num_queries_tested
+            if key == 'num_queries_tested':
+                continue
+
+            if isinstance(value, dict):
+                # If value is a dictionary, flatten it with key prefix
+                for subkey, subvalue in value.items():
+                    if isinstance(subvalue, (int, float)):  # Only include numeric values
+                        flat_metrics[f"{key}_{subkey}"] = subvalue
+            elif isinstance(value, (int, float)):  # Only include numeric values
+                flat_metrics[key] = value
+
+        if not flat_metrics:
+            logger.warning("No numeric metrics to plot")
+            return
+
+        plt.figure(figsize=(12, 6))
+
+        # Extract metrics and values
+        metric_names = list(flat_metrics.keys())
+        values = list(flat_metrics.values())
+
+        # Create bar chart
+        bars = plt.bar(range(len(metric_names)), values)
+
+        # Customize the plot
+        plt.title('Validation Metrics')
+        plt.xticks(range(len(metric_names)), metric_names, rotation=45, ha='right')
+        plt.ylabel('Value')
+
+        # Add value labels on top of bars
+        for bar in bars:
+            height = bar.get_height()
+            plt.text(bar.get_x() + bar.get_width()/2., height,
+                     f'{height:.3f}',
+                     ha='center', va='bottom')
+
+        # Set y-axis limits to focus on metrics between 0 and 1
+        plt.ylim(0, 1.1)  # Slight padding above 1 for label visibility
+
+        # Adjust layout to prevent label cutoff
+        plt.tight_layout()
+
+        # Reset the logger level
+        logger.setLevel(logging.INFO)
+
+        # Save if directory provided
+        if self.save_dir:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            save_path = self.save_dir / f'validation_metrics_{timestamp}.png'
+            plt.savefig(save_path)
+            logger.info(f"Saved validation metrics plot to {save_path}")
+
+        plt.show()
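
Note: a minimal standalone usage sketch of the new TrainingPlotter. The history keys ('train_loss', 'val_loss', optional 'learning_rate') match what run_model_train.py passes; the literal values and metric names below are illustrative only, not real training output.

from pathlib import Path
from training_plotter import TrainingPlotter

# Sketch only: illustrative numbers, not real results.
history = {
    'train_loss': [0.9, 0.6, 0.4],
    'val_loss': [1.0, 0.7, 0.5],
    'learning_rate': [1e-5, 2e-5, 1e-5],
}
metrics = {'top_score': 0.72, 'response_diversity': 0.41, 'num_queries_tested': 5}

plotter = TrainingPlotter(save_dir=Path('plots'))
plotter.plot_training_history(history, title="Debug run")
plotter.plot_validation_metrics(metrics)  # num_queries_tested is skipped by design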